Reputation: 63
I appreciate all the questions and answers out there regarding Python/BeautifulSoup/scraping, but I haven't seen much about this scenario and I'm stuck. Currently, my code can successfully loop through pages of a search result and create a CSV file, but when it comes to each individual table, it only copies the first row before moving on to the next result page.
For example, this page. Currently, my output looks like this:
Brian Benoit,25-Jun-16,Conservative,12-May-16,25-Jun-16,Medicine Hat--Cardston--Warner,b'Medicine Hat--Cardston--Warner',Nikolai Punko
It should look like this instead:
Brian Benoit,25-Jun-16,Conservative,12-May-16,25-Jun-16,Medicine Hat--Cardston--Warner,b'Medicine Hat--Cardston--Warner',Nikolai Punko
Paul Hinman,25-Jun-16,Conservative,12-May-16,25-Jun-16,Medicine Hat--Cardston--Warner,b'Welling, Alberta',Robert B. Barfuss
Michael Jones,25-Jun-16,Conservative,12-May-16,25-Jun-16,Medicine Hat--Cardston--Warner,b'Raymond, Alberta',Dawn M. Hamon
(And so on for all of the rows in the table.)
My question is: how do I get it to loop through and scrape each row before continuing to the next results page? Thanks.
Here is my code:
from bs4 import BeautifulSoup
import requests
import re
import csv
url = "http://www.elections.ca/WPAPPS/WPR/EN/NC?province=-1&distyear=2013&district=-1&party=-1&pageno={}&totalpages=55&totalcount=1368&secondaryaction=prev25"
with open('scrapeAllRows.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    for i in range(1, 56):
        print(i)
        r = requests.get(url.format(i))
        data = r.text
        soup = BeautifulSoup(data, "html.parser")
        links = []
        for link in soup.find_all('a', href=re.compile('selectedid=')):
            links.append("http://www.elections.ca" + link.get('href'))
        for link in links:
            r = requests.get(link)
            data = r.text
            cat = BeautifulSoup(data, "html.parser")
            header = cat.find_all('span')
            tables = cat.find_all("table")[0].find_all("td")
            row = [
                #"name":
                re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="name/1")[0].contents[0]).strip(),
                #"date":
                header[2].contents[0],
                #"party":
                re.sub("[\n\r/]", "", cat.find("legend").contents[2]).strip(),
                #"start_date":
                header[3].contents[0],
                #"end_date":
                header[5].contents[0],
                #"electoral district":
                re.sub("[\n\r/]", "", cat.find_all('div', class_="group")[2].contents[2]).strip(),
                #"registered association":
                re.sub("[\n\r/]", "", cat.find_all('div', class_="group")[2].contents[2]).strip().encode('latin-1'),
                #"elected":
                re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="elected/1")[0].contents[0]).strip(),
                #"address":
                re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="address/1")[0].contents[0]).strip(),
                #"financial_agent":
                re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="fa/1")[0].contents[0]).strip()]
            csv_output.writerow(row)
Upvotes: 0
Views: 1855
Reputation: 2789
I think you almost got it; you just have to find all the tr elements in the table and loop over them:
from bs4 import BeautifulSoup
import requests
import re
import csv
url = "http://www.elections.ca/WPAPPS/WPR/EN/NC?province=-1&distyear=2013&district=-1&party=-1&pageno={}&totalpages=55&totalcount=1368&secondaryaction=prev25"
with open('scrapeAllRows.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    for i in range(1, 56):
        print(i)
        r = requests.get(url.format(i))
        data = r.text
        soup = BeautifulSoup(data, "html.parser")
        links = []
        for link in soup.find_all('a', href=re.compile('selectedid=')):
            links.append("http://www.elections.ca" + link.get('href'))
        for link in links:
            r = requests.get(link)
            data = r.text
            cat = BeautifulSoup(data, "html.parser")
            header = cat.find_all('span')
            table = cat.find("table")
            trs = table.find_all('tr')
            for tr in trs[1:]:  # skip first row (table header)
                row = [
                    #"name":
                    re.sub("[\n\r/]", "", tr.find("td", headers="name/1").contents[0]).strip(),
                    #"date":
                    header[2].contents[0],
                    #"party":
                    re.sub("[\n\r/]", "", cat.find("legend").contents[2]).strip(),
                    #"start_date":
                    header[3].contents[0],
                    #"end_date":
                    header[5].contents[0],
                    #"electoral district":
                    re.sub("[\n\r/]", "", cat.find_all('div', class_="group")[2].contents[2]).strip(),
                    #"registered association":
                    re.sub("[\n\r/]", "", cat.find_all('div', class_="group")[2].contents[2]).strip().encode('latin-1'),
                    #"elected":
                    re.sub("[\n\r/]", "", tr.find("td", headers="elected/1").contents[0]).strip(),
                    #"address":
                    re.sub("[\n\r/]", "", tr.find("td", headers="address/1").contents[0]).strip(),
                    #"financial_agent":
                    re.sub("[\n\r/]", "", tr.find("td", headers="fa/1").contents[0]).strip()
                ]
                csv_output.writerow(row)
Note the

trs = table.find_all('tr')
for tr in trs[1:]:  # skip first row (table header)

I also used find instead of find_all("...")[0] because it is more readable, IMO.
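The difference also matters when an element is missing: find returns None, while find_all returns an empty list, so indexing it with [0] raises an IndexError. For instance:

cat.find("table")          # first <table> element, or None if the page has none
cat.find_all("table")[0]   # same element, but raises IndexError if the page has none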
You probably need a few try/except blocks to make sure some elements exist, and maybe a new function to deal with the parsing part, but other than that it should work OK.
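A minimal sketch of that idea (safe_cell is a made-up helper name here; it assumes the same tr and headers structure as the code above):

import re

def safe_cell(tr, header_id):
    # Return the cleaned text of the <td> whose headers attribute matches,
    # or '' when the cell is missing or empty, instead of raising.
    try:
        cell = tr.find("td", headers=header_id)
        return re.sub("[\n\r/]", "", cell.contents[0]).strip()
    except (AttributeError, IndexError):
        # AttributeError: no matching <td>; IndexError: the <td> has no contents
        return ""

Each table entry in row then becomes e.g. safe_cell(tr, "name/1"), and one malformed detail page no longer aborts the whole scrape.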
Upvotes: 1
Reputation: 17064
Here is a partial solution to your problem: everything you need is in the list.
from bs4 import BeautifulSoup
import requests

a = requests.get("http://www.elections.ca/WPAPPS/WPR/EN/NC/Details?province=-1&distyear=2013&district=-1&party=-1&selectedid=8561").content
soup = BeautifulSoup(a, "html.parser")

c = []
for b in [line.strip() for line in soup.find("fieldset").text.split('\n') if line]:
    if b:
        c.append(b)
print(c)
Output:
['June 25, 2016', '/', 'Conservative', 'Nomination contest report submitted by the registered party', 'Nomination contest dates (start - end):', 'May 12, 2016', 'to', 'June 25, 2016', 'Electoral district:', 'Medicine Hat--Cardston--Warner', 'Registered association:', 'Contestants:', 'Name', 'Address', 'Financial Agent', 'Brian Benoit', 'Medicine Hat, Alberta', 'T1B 3C6', 'Nikolai Punko', 'Medicine Hat, Alberta', 'T1A 2V4', 'Paul Hinman', 'Welling, Alberta', 'T0K 2N0', 'Robert B. Barfuss', 'Cardston, Alberta', 'T0K 0K0', 'Michael Jones', 'Raymond, Alberta', 'T0K 2S0', 'Dawn M. Hamon', 'Raymond, Alberta', 'T0K 2S0', 'Glen Motz', 'Medicine Hat, Alberta', 'T1B 0A7', 'Milvia Bauman', 'Medicine Hat, Alberta', 'T1C 1S4', 'Gregory Ranger', 'Raymond, Alberta', 'T0K 2S0', 'Stephen G. Archibald', 'Raymond, Alberta', 'T0K 2S0', 'Joseph Schow', 'Redcliff, Alberta', 'T0J 2P2', 'Daniel Schow', 'Sherwood Park, Alberta', 'T8A 1C6', 'Indicates the contestant who won this nomination contest.', 'Top of page']
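If you want one row per contestant rather than a flat list, one possible sketch (it assumes, based on the output above, that contestant data always comes in groups of six between the 'Financial Agent' header and the trailing footnote; that may not hold on every page):

start = c.index('Financial Agent') + 1   # contestant data begins after the header cells
end = c.index('Indicates the contestant who won this nomination contest.')
for i in range(start, end, 6):
    name, city, postal, agent, agent_city, agent_postal = c[i:i+6]
    print(name, city, agent)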
Upvotes: 0