Using XPath with Scrapy

Question

I am new to Scrapy and am trying to get all the URLs of the listings on the page using XPath.

The first xpath works

sel.xpath('//[contains(@class, "attraction_element")]')

but the second XPath gives an error:

get_parsed_string(snode_attraction, '//[@class="property_title"]/a/@href')

What is wrong and how can we fix it?

Scrapy Code

def clean_parsed_string(string):
    """Return ``string`` reduced to ASCII, or ``None`` when there is no text.

    Args:
        string: Text to clean. ``None`` and the empty string both count as
            "no value".

    Returns:
        ``None`` for empty or ``None`` input. Otherwise ``str()`` of the
        text, which is first NFKD-normalized and stripped of non-ASCII
        characters when ``is_ascii`` reports that it is not plain ASCII.
    """
    # Truthiness covers both '' and None; len(None) would raise TypeError.
    if not string:
        return None

    ascii_string = string
    if not is_ascii(ascii_string):
        # NFKD decomposes accented characters into base letter + combining
        # mark, so encoding with 'ignore' drops the mark but keeps the letter.
        ascii_string = unicodedata.normalize('NFKD', ascii_string).encode('ascii', 'ignore')
    return str(ascii_string)


def get_parsed_string(selector, xpath):
    """Return the first match of ``xpath`` under ``selector`` as unescaped text.

    Args:
        selector: Object exposing ``.xpath(query).extract()``
            (presumably a Scrapy selector -- confirm against callers).
        xpath: XPath expression to evaluate against ``selector``.

    Returns:
        The first extracted value, stripped of surrounding whitespace and
        passed through ``htmlparser.unescape``; ``''`` when nothing matches.
    """
    matches = selector.xpath(xpath).extract()
    if not matches:
        return ''
    # Only the first hit is used; any further matches are ignored.
    first_match = matches[0].strip()
    return htmlparser.unescape(first_match)


class TripAdvisorSpider(Spider):
    """Spider that prints the listing link of every attraction on the start page."""

    name = 'tripadvisor'

    allowed_domains = ["tripadvisor.com"]
    base_uri = "http://www.tripadvisor.com"
    start_urls = [
        base_uri + '/Attractions-g155032-Activities-c47-t163-Montreal_Quebec.html'
    ]

    # Entry point for BaseSpider
    def parse(self, response):
        """Print the cleaned ``href`` found under each attraction element.

        Args:
            response: The downloaded page, wrapped in a ``Selector`` for
                XPath queries.
        """
        sel = Selector(response)
        # ``//`` must be followed by a node test; ``*`` matches any element.
        snode_attractions = sel.xpath('//*[contains(@class, "attraction_element")]')

        for snode_attraction in snode_attractions:
            # The leading ``.`` keeps the query relative to this node. A bare
            # ``//`` restarts from the document root, so every iteration would
            # return the first link on the whole page.
            href = get_parsed_string(
                snode_attraction, './/*[@class="property_title"]/a/@href'
            )
            print(clean_parsed_string(href))

Using XPath with Scrapy

Answers (1)

Related Questions