ashwin shanker
ashwin shanker

Reputation: 313

Scrapy not scraping all HTML tags

I'm trying to use Scrapy to scrape information from a website. The general structure is as follows:

<item>
 <title>........</title>
 <link>.........</link>
 <category>......</category>
 <category>.......</category>
 <pubdate>.........</pubdate>
</item>

The website's XML has 26 such items. I want to scrape the link, title, categories and publication date for each item and store them in a CSV file. My spider class is as follows:

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from testscraper.items import testscraperItem

class MySpider(BaseSpider):
    """Spider that builds one testscraperItem per ``<item>`` node of the feed."""

    name = "Test_scraper"
    allowed_domains = ["http://nytimes.com/feed/"]
    start_urls = ["http://nytimes.com/feed/"]

    def parse(self, response):
        """Return a list with one populated testscraperItem per ``//item`` node.

        Each field is stored as the list returned by ``extract()``, so a
        lookup that matches nothing leaves an empty list in that field.
        """
        selector = HtmlXPathSelector(response)
        scraped = []

        for node in selector.select('//item'):
            record = testscraperItem()
            record["title"] = node.select('./title/text()').extract()
            record["link"] = node.select('./link/@href').extract()
            # NOTE(review): this lookup is the one reported as coming back
            # empty; presumably the HTML selector normalises tag names to
            # lower case, so 'pubDate' never matches -- confirm.
            record["pubdate"] = node.select('./pubDate/text()').extract()
            record["topics"] = node.select('./category/text()').extract()
            scraped.append(record)

        return scraped

Everything works fine except for the publication date tag, which I’m not able to scrape out (I get a null value). A sample value of this tag would be:

<pubDate>Thu, 19 Feb 2015 19:29:08 GMT</pubDate>

I tried the following code using response.xpath and I AM able to extract the pubdate tags:

def parse(self, response):
    """Return one item per ``<pubDate>`` text node found under any ``<item>``.

    The XPath is evaluated against the whole response rather than per
    item node, so each returned item carries only its ``pubdate`` field.
    """
    items = []
    for pubdate in response.xpath('//item//pubDate/text()'):
        # Build a fresh item for every date and keep it; without this the
        # method had nothing to assign to and always returned an empty list.
        item = testscraperItem()
        item["pubdate"] = pubdate.extract()
        items.append(item)
    return items

Why am I not able to extract the pubdate tag content when I loop over the items, as opposed to being able to extract it when I take the entire webpage as a whole? I'm really stumped and would love help on this. Thanks!! For other purposes I HAVE TO loop over each and every item, so code snippet 2 is not an option - I have to follow the structure of the first code snippet that I have written.

Upvotes: 0

Views: 1178

Answers (1)

alecxe
alecxe

Reputation: 473753

It looks suspiciously like an XML feed. If this is the case, you need to use XMLFeedSpider:

from scrapy import Item, Field
from scrapy.contrib.spiders import XMLFeedSpider

from testscraper.items import testscraperItem

class MySpider(XMLFeedSpider):
    """Feed spider that turns every ``<item>`` node into a testscraperItem."""

    name = "Test_scraper"
    # Tag name of the feed nodes handed to parse_node (XMLFeedSpider.itertag).
    itertag = 'item'

    allowed_domains = ["dealbook.nytimes.com"]
    start_urls = ["http://dealbook.nytimes.com/feed/"]

    def parse_nodes(self, response, nodes):
        """Yield the results for each node, stamped with its 1-based position.

        Args:
            response: the feed response being processed.
            nodes: iterable of selectors, one per ``itertag`` node.

        Yields:
            Every result produced by ``process_results`` for a node, after
            its ``'index'`` key has been set to that node's position.
        """
        # Imported here because the module-level imports do not bring this
        # helper into scope; calling it without the import raises NameError.
        from scrapy.utils.spider import iterate_spider_output

        for index, selector in enumerate(nodes, start=1):
            ret = iterate_spider_output(self.parse_node(response, selector))
            for result_item in self.process_results(response, ret):
                # NOTE(review): assumes testscraperItem declares an 'index'
                # field -- confirm against testscraper.items.
                result_item['index'] = index
                yield result_item

    def parse_node(self, response, selector):
        """Yield a single testscraperItem populated from one ``<item>`` node.

        Args:
            response: the feed response (unused here).
            selector: selector positioned on the ``<item>`` node.

        Yields:
            The item, with each field holding the list returned by
            ``extract()`` for the corresponding relative XPath.
        """
        struct = testscraperItem()
        struct["title"] = selector.xpath('./title/text()').extract()
        # NOTE(review): the sample output for this spider shows 'link' as an
        # empty list; the feed's <link> presumably carries the URL as text,
        # in which case './link/text()' is what is wanted -- confirm.
        struct["link"] = selector.xpath('./link/@href').extract()
        struct["pubdate"] = selector.xpath('./pubDate/text()').extract()
        struct["topics"] = selector.xpath('./category/text()').extract()
        yield struct

Output:

{'link': [],
 'pubdate': [u'Fri, 20 Feb 2015 18:02:28 GMT'],
 'title': [u'Currency\u2019s Weakness Troubles China\u2019s Policy Makers'],
 'topics': [u'China',
            u'Renminbi (Currency)',
            u'Economic Conditions and Trends',
            u"People's Bank of China",
            u'Xi Jinping']}
{'link': [],
 'pubdate': [u'Thu, 19 Feb 2015 15:58:15 GMT'],
 'title': [u'New Rules Spur a Humbling Overhaul of Wall St. Banks'],
 'topics': [u'Banking and Financial Institutions',
            u'Dodd-Frank Wall Street Reform and Consumer Protection Act (2010)',
            u'Executive Compensation',
            u'Regulation and Deregulation of Industry',
            u'Goldman Sachs Group Inc',
            u'JPMorgan Chase & Company',
            u'Federal Reserve System',
            u'Federal Deposit Insurance Corp']}
...

Upvotes: 1

Related Questions