Patrick Kenneally

Reputation: 53

Nested for loop keeps repeating

I have a Python scraper whose main purpose is to:

Read a list of postcodes from a text file into an array

For each postcode in the array, search 10 pages and pull out certain content.

I seem to be getting results like: page 1, page 2, page 2, page 3, page 3, page 3, page 4, page 4, page 4, page 4

etc.

I have tried rearranging the code several times without much luck; everything works fine except this step.


from bs4 import BeautifulSoup
import time
from time import sleep
from datetime import datetime
import requests
import csv

print(" Initializing ...")
print(" Loading Keywords")
with open("pcodes.txt") as pcodes:
    postkeys = []
    for line in pcodes:
        postkeys.append(line.strip())

with open("pcodnum.txt") as pcodnum:
    postkeynum = []
    for line in pcodnum:
        postkeynum.append(line.strip())

print(" Welcome to YellScrape v1.0")
print(" You ar searching yell.com ")

comtype = input(" Please enter a Company Type (e.g Newsagent, Barber): ")
pagesnum = 0
listinnum = 0
comloc = " "
f = csv.writer(open(datetime.today().strftime('%Y-%m-%d') + '-' + comtype + '-' + 'yelldata.csv', 'w'))
f.writerow(['Business Name', 'Business Type', 'Phone Number', 'Street Address', 'Locality', 'Region', 'Website'])

headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    }

data_list = []
for x in postkeys:
    print(" Searching " + x + " for " + comtype + " companies")
    for y in postkeynum:
        url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=' + comtype + '&pageNum=' + str(y) + '&location=' + x
        data_list.append(url)
        for item in data_list:  # note: this walks every URL collected so far
            site = requests.get(item, headers=headers)
            soup = BeautifulSoup(site.content, 'html.parser')
            questions = soup.select('.businessCapsule--mainContent')
            for question in questions:
                listinnum += 1
                busname = question.find(class_='businessCapsule--name').get_text()
                bustype = question.find(class_='businessCapsule--classification').get_text()
                busnumber = question.select_one('span.business--telephoneNumber')
                if busnumber is None:
                    busnumber = 'None'
                else:
                    busnumber = busnumber.text
                busadd = question.find('span', attrs={"itemprop": "streetAddress"})
                if busadd is None:
                    busadd = 'None'
                else:
                    busadd = busadd.text.replace(',',' ')
                buslocal = question.find('span', attrs={"itemprop": "addressLocality"})
                if buslocal is None:
                    buslocal = 'None'
                else:
                    buslocal = buslocal.text
                buspost = question.find('span', attrs={"itemprop": "postalCode"})
                if buspost is None:
                    buspost = 'None'
                else:
                    buspost = buspost.text
                busweb = question.find('a', attrs={"rel": "nofollow noopener"})
                if busweb is None:
                    busweb = 'None'
                else:
                    busweb = busweb.attrs['href']
                print(busweb)
                f.writerow([busname, bustype, busnumber, busadd, buslocal, buspost, busweb])


        pagesnum += 1
        print(" Finsihed Page " + str(y) + ". For " + x + " . " + str(listinnum) + " listings so far. Moving To Next Page")
    print(" Waiting 30 seconds for security reasons.")
    sleep(30)
print(" Finished. \n Total: " + str(pagesnum) + " pages with " + str(listinnum) + " listings. \n Please look for file: " + datetime.today().strftime('%Y-%m-%d') + '-' + comtype + '-' + 'yelldata.csv')

Expected result:

finished page 1, finished page 2, finished page 3

etc

Upvotes: 0

Views: 1234

Answers (2)

bharatk

Reputation: 4315

Initialize pageNum inside the outer for loop, so it resets for each postcode:

for x in postkeys:
    pageNum = 1

Increment pageNum inside the for loop, and format the URL there:

for item in data_list:
    # format the website URL
    url = "https://www.yell.com/ucs/UcsSearchAction.do?keywords={}&pageNum={}&location={}".format(comtype, pageNum, x)
    site = requests.get(url, headers=headers)

    # check response status code:
    if site.status_code != 200:
        break

    pageNum += 1

You should remove this for loop:

for y in postkeynum:
    url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=' + comtype + '&pageNum=' + str(y) + '&location=' + x
    data_list.append(url)
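
Put together, the search loop could look something like this (a sketch only; it assumes comtype, postkeys and headers are defined as in the question, and that the parsing code stays the same):

for x in postkeys:
    pageNum = 1  # reset the page counter for every postcode
    while True:
        # format the website URL for the current postcode and page
        url = "https://www.yell.com/ucs/UcsSearchAction.do?keywords={}&pageNum={}&location={}".format(comtype, pageNum, x)
        site = requests.get(url, headers=headers)
        # stop paging once the site stops returning a 200 response
        if site.status_code != 200:
            break
        # ... parse site.content with BeautifulSoup as in the question ...
        pageNum += 1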

Upvotes: 0

chitown88

Reputation: 28565

It's because you are appending to your data list and then using a for loop to iterate through it after each time a new link is appended.

So it's going to do requests for page 1; then requests for pages 1 and 2; then pages 1, 2 and 3; then pages 1, 2, 3 and 4... etc.
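
Here's a stripped-down sketch of that pattern (the names here are illustrative, not taken from the scraper):

data_list = []
for y in [1, 2, 3]:
    data_list.append('page ' + str(y))
    # this inner loop re-walks the WHOLE list every time a link is added
    for item in data_list:
        print(item)
# output: page 1 | page 1, page 2 | page 1, page 2, page 3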

So there are two ways to fix that: 1) don't append to data_list and eliminate it altogether, or 2) append to data_list FIRST, and then loop through it (i.e. separate the loop that appends to data_list from the loop that iterates through it).

I chose option 2) here; a sketch of option 1) appears at the end of this answer.

from bs4 import BeautifulSoup
import time
from time import sleep
from datetime import datetime
import requests
import csv

print(" Initializing ...")
print(" Loading Keywords")
with open("C:/pcodes.txt") as pcodes:
    postkeys = []
    for line in pcodes:
        postkeys.append(line.strip())

with open("C:/pcodnum.txt") as pcodnum:
    postkeynum = []
    for line in pcodnum:
        postkeynum.append(line.strip())

print(" Welcome to YellScrape v1.0")
print(" You are searching yell.com ")

comtype = input(" Please enter a Company Type (e.g Newsagent, Barber): ")
pagesnum = 0
listinnum = 0
comloc = " "
f = csv.writer(open('C:/'+datetime.today().strftime('%Y-%m-%d') + '-' + comtype + '-' + 'yelldata.csv', 'w'))
f.writerow(['Business Name', 'Business Type', 'Phone Number', 'Street Address', 'Locality', 'Region', 'Website'])

headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    }

for x in postkeys:
    data_list = []  # reset for each postcode, so earlier pages aren't requested again
    print(" Searching " + x + " for " + comtype + " companies")
    for y in postkeynum:
        url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=' + comtype + '&pageNum=' + str(y) + '&location=' + x
        data_list.append(url)

    # Now that you created a list of the urls, now you can loop through them


    for item in data_list:

        page = item.split('pageNum=')[-1].split('&')[0]  # recover the page number from the URL
        location = item[-5:]  # note: assumes the postcode is the last 5 characters of the URL

        site = requests.get(item, headers=headers)
        soup = BeautifulSoup(site.content, 'html.parser')
        questions = soup.select('.businessCapsule--mainContent')
        for question in questions:
            listinnum += 1
            busname = question.find(class_='businessCapsule--name').get_text()
            bustype = question.find(class_='businessCapsule--classification').get_text()
            busnumber = question.select_one('span.business--telephoneNumber')
            if busnumber is None:
                busnumber = 'None'
            else:
                busnumber = busnumber.text
            busadd = question.find('span', attrs={"itemprop": "streetAddress"})
            if busadd is None:
                busadd = 'None'
            else:
                busadd = busadd.text.replace(',',' ')
            buslocal = question.find('span', attrs={"itemprop": "addressLocality"})
            if buslocal is None:
                buslocal = 'None'
            else:
                buslocal = buslocal.text
            buspost = question.find('span', attrs={"itemprop": "postalCode"})
            if buspost is None:
                buspost = 'None'
            else:
                buspost = buspost.text
            busweb = question.find('a', attrs={"rel": "nofollow noopener"})
            if busweb is None:
                busweb = 'None'
            else:
                busweb = busweb.attrs['href']
            print(busweb)
            f.writerow([busname, bustype, busnumber, busadd, buslocal, buspost, busweb])


        pagesnum += 1
        print(" Finished Page " + page + ". For " + location + " . " + str(listinnum) + " listings so far. Moving To Next Page")


    if x != postkeys[-1]:  # skip the wait after the last postcode
        print(" Waiting 30 seconds for security reasons.")
        sleep(30)
print(" Finished. \n Total: " + str(pagesnum) + " pages with " + str(listinnum) + " listings. \n Please look for file: " + datetime.today().strftime('%Y-%m-%d') + '-' + comtype + '-' + 'yelldata.csv')

Upvotes: 1
