Reputation: 31
I made a scraper for Yellow Pages in Python. There is a table with the working hours of the businesses listed. I scrape that into a list and save it in a CSV using Scrapy. By default the list items are separated by a comma, like this:
Mon,Closed,Tue - Fri ,9 00 am - 6:00 pm,Sat ,9 00 am - 1:00 pm,Sun,Closed
I want to use a pipe (|) instead of commas, so the final list would look like this: Mon,Closed| Tue - Fri ,9 00 am - 6:00 pm|Sat ,9 00 am - 1:00 pm|Sun,Closed
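Just to illustrate the format I am after in plain Python (the list contents here are only an example of what gets scraped, not my actual data):

hours = ['Mon,Closed', 'Tue - Fri ,9 00 am - 6:00 pm', 'Sat ,9 00 am - 1:00 pm', 'Sun,Closed']
print('|'.join(hours))
# Mon,Closed|Tue - Fri ,9 00 am - 6:00 pm|Sat ,9 00 am - 1:00 pm|Sun,Closed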
Any help on how I should implement this would be appreciated. The following is my parse method:
def parse_item(self, response):
    # assumes re, pandas (pd), usaddress and category_i are available at module level
    item = YellowItem()
    item['keyword'] = category_i
    item['title'] = response.xpath('//h1/text()').extract_first()
    item['phone'] = response.xpath('//p[@class="phone"]/text()').extract_first()
    addr = response.xpath('//h2[@class="address"]/text()').extract_first()
    item['street_address'] = addr
    email = response.xpath('//a[@class="email-business"]/@href').extract_first()
    try:
        item['email'] = email.replace("mailto:", '')
    except AttributeError:
        pass
    item['website'] = response.xpath('//a[@class="primary-btn website-link"]/@href').extract_first()
    item['Description'] = response.xpath('//dd[@class="general-info"]/text()').extract_first()
    hours = response.xpath(
        '//div[@class="open-details"]/descendant-or-self::*/text()[not(ancestor::*['
        '@class="hour-category"])]').extract()
    # convert any "h:mm am/pm" times to 24-hour format
    t_f_h = []
    for hour in hours:
        data = re.findall(r'(\d{1,2}:\d{2})\s(AM|PM|am|pm)', hour)
        if data:
            time = data[0][0] + " " + data[0][1]    # opening time, e.g. "9:00 am"
            time_t = data[1][0] + " " + data[1][1]  # closing time, e.g. "6:00 pm"
            start = pd.to_datetime(time).strftime('%H:%M')
            end = pd.to_datetime(time_t).strftime('%H:%M')
            t_f_h.append(hour.replace(time, start).replace(time_t, end))
        else:
            t_f_h.append(hour)
    item['t_hour_format'] = t_f_h
    # strip the first ":" from each entry for the plain Hours field
    try:
        clean_l = []
        for hour in hours:
            clean_l.append(hour.replace(":", " ", 1))
        item['Hours'] = clean_l
    except AttributeError:
        pass
    item['Other_info'] = response.xpath(
        '//dd[@class="other-information"]/descendant-or-self::*/text()').extract()
    category_ha = response.xpath('//dd[@class="categories"]/descendant-or-self::*/text()').extract()
    item['Categories'] = " ".join(category_ha)
    item['Years_in_business'] = response.xpath('//div[@class="number"]/text()').extract_first()
    year = item['Years_in_business']
    if year:
        opened = 2020 - int(year)  # change the year here
        item['year_opened'] = 'Year Opened: ' + str(opened)
    neighborhood = response.xpath('//dd[@class="neighborhoods"]/descendant-or-self::*/text()').extract()
    item['neighborhoods'] = ' '.join(neighborhood)
    item['other_links'] = response.xpath('//dd[@class="weblinks"]/descendant-or-self::*/text()').extract()
    item['BBB_Grade'] = response.xpath('//span[@class="bbb-no-link"]/text()').extract_first()
    item['link_to_the_listing'] = response.url
    # split the street address into city/state/zip with usaddress
    data = usaddress.tag(str(addr))
    if "PlaceName" in data[0].keys():
        item["City"] = data[0]["PlaceName"]
    if "StateName" in data[0].keys():
        item["State"] = data[0]["StateName"]
    if "ZipCode" in data[0].keys():
        item["Zip"] = data[0]["ZipCode"]
    return item
Upvotes: 0
Views: 154
Reputation: 1201
You could write your own exporter based on CsvItemExporter.
from scrapy.exporters import CsvItemExporter


class MyExporter(CsvItemExporter):

    def __init__(self, *args, **kwargs):
        kwargs['delimiter'] = '|'
        super(MyExporter, self).__init__(*args, **kwargs)
You can then set your new ItemExporter in your project's settings.py.
FEED_EXPORTERS = {
    'csv': 'my_project.file_containing_exporter.MyExporter'
}
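Extra keyword arguments to CsvItemExporter are forwarded to the underlying csv.writer, so setting delimiter here changes the column separator of the exported file. With that in place you can export as usual, for example (the spider and file names below are just placeholders):

scrapy crawl my_spider -o output.csv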
Upvotes: 2