Reputation: 63
This code is meant to loop through all of the results pages, and then loop through the results table on each of the pages and scrape all the data from the table as well as some information stored outside of the table.
However, the resulting CSV file does not appear to have any rational organization: each row has different categories of information in different columns. What I am after is for each row to contain every category of information defined (date, party, start date, end date, electoral district, registered association, whether or not the candidate was elected, name of candidate, address, and financial agent). Some of this data is stored in the table on each page, while the rest (the dates, party, district, and registered association) is stored outside of the table and needs to be associated with each candidate in each table row on each page. Additionally, there does not seem to be any output at all for 'elected', 'address', or 'financial agent', and I am not sure where I am going wrong.
I would really appreciate it if you could help me figure out how to fix my code in order to achieve this output. It is as follows:
from bs4 import BeautifulSoup
import requests
import re
import csv

url = "http://www.elections.ca/WPAPPS/WPR/EN/NC?province=-1&distyear=2013&district=-1&party=-1&pageno={}&totalpages=55&totalcount=1368&secondaryaction=prev25"

rows = []

for i in range(1, 56):
    print(i)
    r = requests.get(url.format(i))
    data = r.text
    soup = BeautifulSoup(data, "html.parser")
    links = []

    for link in soup.find_all('a', href=re.compile('selectedid=')):
        links.append("http://www.elections.ca" + link.get('href'))

    for link in links:
        r = requests.get(link)
        data = r.text
        cat = BeautifulSoup(data, "html.parser")
        header = cat.find_all('span')
        tables = cat.find_all("table")[0].find_all("td")

        rows.append({
            #"date":
            header[2].contents[0],
            #"party":
            re.sub("[\n\r/]", "", cat.find("legend").contents[2]).strip(),
            #"start_date":
            header[3].contents[0],
            #"end_date":
            header[5].contents[0],
            #"electoral district":
            re.sub("[\n\r/]", "", cat.find_all('div', class_="group")[2].contents[2]).strip(),
            #"registered association":
            re.sub("[\n\r/]", "", cat.find_all('div', class_="group")[2].contents[2]).strip().encode('latin-1'),
            #"elected":
            re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="elected/1")[0].contents[0]).strip(),
            #"name":
            re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="name/1")[0].contents[0]).strip(),
            #"address":
            re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="address/1")[0].contents[0]).strip(),
            #"financial_agent":
            re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="fa/1")[0].contents[0]).strip()
        })

with open('scrapeOutput.csv', 'w') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerows(rows)
Upvotes: 1
Views: 784
Reputation: 7100
If you want to crawl, you may want to have a look at CrawlSpider, from scrapy. I'm also using lxml.html just because it offers more flexibility.
To install those libraries, you could use:
pip install scrapy
pip install lxml
To scaffold a basic scrapy project you can use the command:
scrapy startproject elections
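For reference, that command should give you a project skeleton roughly like this (the exact files can vary a bit between scrapy versions):

elections/
    scrapy.cfg
    elections/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py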
Then add the spider and items:
elections/spiders/spider.py
from scrapy.spiders import CrawlSpider, Rule
from elections.items import ElectionsItem
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
from lxml import html

class ElectionsSpider(CrawlSpider):
    name = "elections"
    allowed_domains = ["elections.ca"]
    start_urls = ["http://www.elections.ca/WPAPPS/WPR/EN/NC/Details?province=-1&distyear=2013&district=-1&party=-1&pageno=1&totalpages=55&totalcount=1372&viewall=1"]

    rules = (
        Rule(LxmlLinkExtractor(
                allow = ('http://www.elections.ca/WPAPPS/WPR/EN/NC/Details.*'),
            ),
            callback='parse_item',
            follow=True
        ),
    )

    def unindent(self, string):
        return ''.join(map(str.strip, string.encode('utf8').splitlines(1)))

    def parse_item(self, response):
        item = ElectionsItem()

        original_html = Selector(response).extract()
        lxml_obj = html.fromstring(original_html)

        for entry in lxml_obj.xpath('.//fieldset[contains(@class,"wpr-detailgroup")]'):

            date = entry.xpath('.//legend[contains(@class,"wpr-ltitle")]/span[contains(@class,"date")]')
            if date:
                item['date'] = self.unindent(date[0].text.strip())

            party = entry.xpath('.//legend[contains(@class,"wpr-ltitle")]')
            if party:
                item['party'] = self.unindent(party[0].text.strip())

            start_date = entry.xpath('.//div[contains(@class,"group")]/span[contains(@class,"date")][1]')
            if start_date:
                item['start_date'] = self.unindent(start_date[0].text.strip())

            end_date = entry.xpath('.//div[contains(@class,"group")]/span[contains(@class,"date")][2]')
            if end_date:
                item['end_date'] = self.unindent(end_date[0].text.strip())

            electoral_district = entry.xpath('.//div[contains(@class,"wpr-title")][contains(text(),"Electoral district:")]')
            if electoral_district:
                item['electoral_district'] = self.unindent(electoral_district[0].tail.strip())

            registered_association = entry.xpath('.//div[contains(@class,"wpr-title")][contains(text(),"Registered association:")]')
            if registered_association:
                item['registered_association'] = self.unindent(registered_association[0].tail.strip())

            for candidate in entry.xpath('.//table[contains(@class, "wpr-datatable")]//tr[not(@class)]'):

                item['elected'] = len(candidate.xpath('.//img[contains(@alt, "contestant won this nomination contest")]'))

                candidate_name = candidate.xpath('.//td[contains(@headers,"name")]')
                if candidate_name:
                    item['candidate_name'] = self.unindent(candidate_name[0].text.strip())

                item['address'] = self.unindent(candidate.xpath('.//td[contains(@headers,"address")]')[0].text_content().strip())

                item['financial_agent'] = self.unindent(candidate.xpath('.//td[contains(@headers,"fa")]')[0].text_content().strip())

                yield item
elections/items.py
from scrapy.item import Item, Field

class ElectionsItem(Item):

    date = Field()
    party = Field()
    start_date = Field()
    end_date = Field()
    electoral_district = Field()
    registered_association = Field()
    elected = Field()
    candidate_name = Field()
    address = Field()
    financial_agent = Field()
elections/settings.py
BOT_NAME = 'elections'
SPIDER_MODULES = ['elections.spiders']
NEWSPIDER_MODULE = 'elections.spiders'
ITEM_PIPELINES = {
    'elections.pipelines.ElectionsPipeline': 300,
}
elections/pipelines.py
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.exporters import CsvItemExporter

class ElectionsPipeline(object):  # the class name must match the entry in ITEM_PIPELINES

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('%s.csv' % spider.name, 'w+b')  # elections.csv for this spider
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
You can run the spider, from the root of your project, with the command:
scrapy runspider elections/spiders/spider.py
It should create an elections.csv in the root of your project, like this:
financial_agent,end_date,candidate_name,registered_association,electoral_district,elected,address,date,party,start_date
"Jan BalcaThornhill, OntarioL4J 1V9","September 09, 2015",Leslyn Lewis,,Scarborough--Rouge Park,1,"Markham, OntarioL6B 0K9","September 09, 2015",,"September 07, 2015"
"Mark HicksPasadena, Newfoundland and LabradorA0L 1K0","September 08, 2015",Roy Whalen,,Long Range Mountains,1,"Deer Lake, Newfoundland and LabradorA8A 3H6","September 08, 2015",,"August 21, 2015"
,"September 08, 2015",Wayne Ruth,,Long Range Mountains,0,"Kippens, Newfoundland and LabradorA2N 3B8","September 08, 2015",,"August 21, 2015"
,"September 08, 2015",Mark Krol,,St. John's South--Mount Pearl,1,"Woodbridge, OntarioL4L 1Y5","September 08, 2015",,"August 24, 2015"
,"September 08, 2015",William MacDonald Alexander,,Bow River,1,"Calgary, AlbertaT2V 0M1","September 08, 2015",,"September 04, 2015"
(...)
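As a side note (my own suggestion, not part of the setup above): scrapy's built-in feed exports can also write the items to CSV without a custom pipeline, for example by running scrapy crawl elections -o elections.csv from the project root.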
Upvotes: 1
Reputation: 46779
I would suggest you write to your output CSV file a row at a time as you go along, rather than waiting until the very end. Also, it is better to use lists rather than dictionaries to hold your data; that way the ordering of the columns is maintained.
from bs4 import BeautifulSoup
import requests
import re
import csv

url = "http://www.elections.ca/WPAPPS/WPR/EN/NC?province=-1&distyear=2013&district=-1&party=-1&pageno={}&totalpages=55&totalcount=1368&secondaryaction=prev25"

with open('scrapeOutput.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)

    for i in range(1, 56):
        print(i)
        r = requests.get(url.format(i))
        data = r.text
        soup = BeautifulSoup(data, "html.parser")
        links = []

        for link in soup.find_all('a', href=re.compile('selectedid=')):
            links.append("http://www.elections.ca" + link.get('href'))

        for link in links:
            r = requests.get(link)
            data = r.text
            cat = BeautifulSoup(data, "html.parser")
            header = cat.find_all('span')
            tables = cat.find_all("table")[0].find_all("td")

            row = [
                #"date":
                header[2].contents[0],
                #"party":
                re.sub("[\n\r/]", "", cat.find("legend").contents[2]).strip(),
                #"start_date":
                header[3].contents[0],
                #"end_date":
                header[5].contents[0],
                #"electoral district":
                re.sub("[\n\r/]", "", cat.find_all('div', class_="group")[2].contents[2]).strip(),
                #"registered association":
                re.sub("[\n\r/]", "", cat.find_all('div', class_="group")[2].contents[2]).strip().encode('latin-1'),
                #"elected":
                re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="elected/1")[0].contents[0]).strip(),
                #"name":
                re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="name/1")[0].contents[0]).strip(),
                #"address":
                re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="address/1")[0].contents[0]).strip(),
                #"financial_agent":
                re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="fa/1")[0].contents[0]).strip()]

            csv_output.writerow(row)
            print(row)
This would result in a CSV starting as follows:
"December 08, 2016",Green Party,"September 21, 2016","December 08, 2016",Calgary Midnapore,b'Calgary Midnapore',,Ryan Zedic,,
"November 29, 2016",NDP-New Democratic Party,"August 24, 2016","November 29, 2016",Ottawa--Vanier,b'Ottawa--Vanier',,Emilie Taman,,
"September 28, 2016",Green Party,"September 04, 2016","September 28, 2016",Medicine Hat--Cardston--Warner,b'Medicine Hat--Cardston--Warner',,Kelly Dawson,,
Upvotes: 1
Reputation: 535
I think your dictionaries are a bit of a mess: you don't assign keys. Keep in mind that a plain dictionary does not guarantee the column ordering you want on its own, but with the csv library you can write a CSV easily without all of those operations.
So assign keys:
rows.append({
    "date":
        header[2].contents[0],
    "party":
        re.sub("[\n\r/]", "", cat.find("legend").contents[2]).strip(),
    "start_date":
        header[3].contents[0],
    "end_date":
        header[5].contents[0],
    "electoral district":
        re.sub("[\n\r/]", "", cat.find_all('div', class_="group")[2].contents[2]).strip(),
    "registered association":
        re.sub("[\n\r/]", "", cat.find_all('div', class_="group")[2].contents[2]).strip().encode('latin-1'),
    "elected":
        re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="elected/1")[0].contents[0]).strip(),
    "name":
        re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="name/1")[0].contents[0]).strip(),
    "address":
        re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="address/1")[0].contents[0]).strip(),
    "financial_agent":
        re.sub("[\n\r/]", "", cat.find_all("table")[0].find_all("td", headers="fa/1")[0].contents[0]).strip()
})
And then write your csv with DictWriter:
with open('scrapeOutput.csv', 'w') as f_output:
    csv_output = csv.DictWriter(f_output, rows[0].keys())
    csv_output.writeheader()  # Write header to understand the csv
    csv_output.writerows(rows)
I tested this and it's working, but be careful: some of your fields, like address or elected, come out empty :)
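If you want those cells filled in, a small tweak worth trying (my own sketch, not tested against the site) is to read the cell text with get_text() instead of contents[0], since cells that contain nested tags (the elected check-mark image, multi-line addresses) have no usable contents[0]:

# Sketch only: pull the full visible text of a cell so nested tags don't leave the field empty.
# "address/1" is one of the header ids already used above.
cell = cat.find_all("table")[0].find_all("td", headers="address/1")[0]
address = re.sub("[\n\r/]", "", cell.get_text(" ", strip=True)).strip()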
See ya!
Upvotes: 1