rex sphinx
rex sphinx

Reputation: 31

Separate column items by pipe instead of comma

I made a scraper for Yellow Pages in Python. Each listing has a table with the working hours of the business. I scrape that table into a list and save it to a CSV file using Scrapy. By default, the list items are separated by commas, like this:

Mon,Closed,Tue - Fri ,9 00 am - 6:00 pm,Sat ,9 00 am - 1:00 pm,Sun,Closed

I want to use a pipe (|) instead of commas, so that the final list looks like this: Mon,Closed| Tue - Fri ,9 00 am - 6:00 pm|Sat ,9 00 am - 1:00 pm|Sun,Closed. Any help on how I should implement this would be appreciated. Following is my parse method:

def parse_item(self, response):
    """Build a ``YellowItem`` from one business listing page.

    Reads contact details, opening hours, categories and miscellaneous
    listing data from ``response`` via XPath, derives a 24-hour version of
    the opening hours, an estimated opening year and the city/state/zip
    parts of the street address, and returns the populated item.

    Args:
        response: The response for the listing page; it must provide
            ``xpath()`` and ``url``.

    Returns:
        The populated ``YellowItem``.
    """
    from datetime import date  # local import: only needed for year_opened

    # One "h:mm am/pm" clock time, e.g. "9:00 am" or "12:30 PM".
    clock_time = re.compile(r'(\d{1,2}:\d{2})\s(AM|PM|am|pm)')

    def to_24_hour(match):
        # Convert a single matched 12-hour time into zero-padded "HH:MM".
        twelve_hour = match.group(1) + " " + match.group(2)
        return pd.to_datetime(twelve_hour).strftime('%H:%M')

    item = YellowItem()
    item['keyword'] = category_i
    item['title'] = response.xpath('//h1/text()').extract_first()
    item['phone'] = response.xpath('//p[@class="phone"]/text()').extract_first()
    addr = response.xpath('//h2[@class="address"]/text()').extract_first()
    item['street_address'] = addr

    email = response.xpath('//a[@class="email-business"]/@href').extract_first()
    if email is not None:
        # The href is a "mailto:" link; keep only the address itself.
        item['email'] = email.replace("mailto:", '')
    item['website'] = response.xpath('//a[@class="primary-btn website-link"]/@href').extract_first()
    item['Description'] = response.xpath('//dd[@class="general-info"]/text()').extract_first()

    # Every text node of the hours table except those under "hour-category".
    hours = response.xpath(
        '//div[@class="open-details"]/descendant-or-self::*/text()[not(ancestor::*['
        '@class="hour-category"])]').extract()

    # Rewrite each matched time in place. Substituting per match (instead of
    # indexing the first two matches and calling str.replace) copes with
    # lines holding one, two or more times and cannot clobber a longer time
    # that merely contains a shorter one (e.g. "1:00 pm" inside "11:00 pm").
    # Lines without any time (day names, "Closed") pass through unchanged.
    item['t_hour_format'] = [clock_time.sub(to_24_hour, hour) for hour in hours]

    # Only the first colon of each entry is replaced by a space.
    item['Hours'] = [hour.replace(":", " ", 1) for hour in hours]

    item['Other_info'] = response.xpath(
        '//dd[@class="other-information"]/descendant-or-self::*/text()').extract()
    category_ha = response.xpath('//dd[@class="categories"]/descendant-or-self::*/text()').extract()
    item['Categories'] = " ".join(category_ha)

    item['Years_in_business'] = response.xpath('//div[@class="number"]/text()').extract_first()
    year = item['Years_in_business']
    if year:
        # Derive the opening year from today's date rather than a
        # hard-coded year that has to be edited by hand.
        opened = date.today().year - int(year)
        item['year_opened'] = 'Year Opened: ' + str(opened)

    neighborhood = response.xpath('//dd[@class="neighborhoods"]/descendant-or-self::*/text()').extract()
    item['neighborhoods'] = ' '.join(neighborhood)
    item['other_links'] = response.xpath('//dd[@class="weblinks"]/descendant-or-self::*/text()').extract()
    item['BBB_Grade'] = response.xpath('//span[@class="bbb-no-link"]/text()').extract_first()
    item['link_to_the_listing'] = response.url

    # Split the address into city/state/zip. Skip a missing address instead
    # of tagging the literal string "None".
    if addr:
        try:
            tagged = usaddress.tag(str(addr))[0]
        except usaddress.RepeatedLabelError:
            # Ambiguous address: keep the rest of the item, omit the parts.
            tagged = {}
        for label, field in (("PlaceName", "City"),
                             ("StateName", "State"),
                             ("ZipCode", "Zip")):
            if label in tagged:
                item[field] = tagged[label]

    return item

Upvotes: 0

Views: 154

Answers (1)

Patrick Klein
Patrick Klein

Reputation: 1201

You could write your own exporter based on CsvItemExporter.

from scrapy.exporters import CsvItemExporter

class MyExporter(CsvItemExporter):
    """CSV item exporter that always passes ``delimiter='|'`` to its base."""

    def __init__(self, *args, **kwargs):
        # Force the delimiter, overriding any value the caller supplied.
        options = dict(kwargs, delimiter='|')
        super().__init__(*args, **options)

You can then set your new ItemExporter in your project's settings.py.

# Use the custom exporter class for the "csv" feed format.
FEED_EXPORTERS = dict(
    csv="my_project.file_containing_exporter.MyExporter",
)

Upvotes: 2

Related Questions