Arjun Thakur

Reputation: 655

Scrapy check if scraped url returning any downloadable file or not

I am new to Scrapy and haven't found any help so far.

I want to make a small scraper that scrapes all the URLs on a page, hits them one by one, and, if a URL returns a downloadable file of any extension, downloads it and saves it to a specified location. Here's the code that I have written:

items.py

import scrapy

class ZcrawlerItem(scrapy.Item):
    file = scrapy.Field()
    file_url = scrapy.Field()

spider.py

from scrapy import Selector
from scrapy.spiders import CrawlSpider
from scrapy.http import Request

from crawler.items import ZcrawlerItem

DOMAIN = 'example.com'
URL = 'http://%s' % DOMAIN


class MycrawlerSpider(CrawlSpider):
    name = "mycrawler"
    allowed_domains = [DOMAIN]
    start_urls = [
        URL
    ]

    def parse_dir_contents(self, response):
        # Print the response headers to inspect what the server returned
        print(response.headers)
        item = ZcrawlerItem()
        item['file_url'] = response.url
        return item

    def parse(self, response):
        hxs = Selector(response)
        # Follow every absolute link on the page
        for url in hxs.xpath('//a/@href').extract():
            if url.startswith('http://') or url.startswith('https://'):
                yield Request(url, callback=self.parse_dir_contents)
        # Also follow iframe sources
        for url in hxs.xpath('//iframe/@src').extract():
            yield Request(url, callback=self.parse_dir_contents)

The issue I am facing is that parse_dir_contents is not showing the headers, so it's difficult to check whether the response is a downloadable file or just regular page content.
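For example, this is roughly the kind of check I was hoping to do in the callback (just a sketch; Content-Type and Content-Disposition are standard HTTP headers, but not every server sends Content-Disposition for downloads):

    def parse_dir_contents(self, response):
        # Sketch only: decide from the headers whether this response is a file
        content_type = response.headers.get('Content-Type', b'').decode('utf-8', 'ignore')
        disposition = response.headers.get('Content-Disposition', b'').decode('utf-8', 'ignore')
        if 'attachment' in disposition or not content_type.startswith('text/html'):
            item = ZcrawlerItem()
            item['file_url'] = response.url
            return item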

BTW, I am using Scrapy 1.1.0 and Python 3.4.

Any help would be really appreciated!!

Upvotes: 1

Views: 1632

Answers (1)

Arjun Thakur

Reputation: 655

So after some R&D I found a solution, and here's the updated spider.py:

from urllib.parse import urljoin

from scrapy import Selector
from scrapy.spiders import CrawlSpider
from scrapy.http import Request

from crawler.items import ZcrawlerItem

DOMAIN = 'example.com'
URL = 'http://%s' % DOMAIN


class MycrawlerSpider(CrawlSpider):
    name = "mycrawler"
    # No domain restriction, so file links hosted on other domains are followed too
    allowed_domains = []
    allowed_mime_type = [b'application/zip', b'application/x-msdownload', b'application/pdf',
                         b'image/jpeg', b'image/jpg', b'image/png',
                         b'application/octet-stream']
    start_urls = [
        URL
    ]

    def parse(self, response):
        hxs = Selector(response)
        for url in hxs.xpath('//a/@href').extract():
            if url.startswith('http://') or url.startswith('https://'):
                yield Request(url, callback=self.parse_item)
            elif 'javascript' not in url:
                # Relative link: resolve it against the current page URL
                new_url = urljoin(response.url, url.strip())
                print("New url : ", new_url)
                yield Request(new_url, callback=self.parse_item)
        for url in hxs.xpath('//iframe/@src').extract():
            yield Request(url, callback=self.parse_item)

    def parse_item(self, response):
        # If the Content-Type matches one of the allowed MIME types, treat the
        # response as a downloadable file and emit an item for the pipeline
        if response.headers['Content-Type'] in self.allowed_mime_type:
            item = ZcrawlerItem()
            item['file_urls'] = response.url
            item['referer'] = response.request.headers['Referer'].decode("utf-8")
            yield item
        else:
            self.logger.info('No allowed file type found, trying the next page: %s', response.url)
            # Not a file: re-crawl this URL as a normal HTML page to follow its links
            yield Request(response.url, callback=self.parse, dont_filter=True)
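Note that the item now uses file_urls and referer (and the pipeline below adds path), so items.py has to be updated to match. Something like this is what the spider and pipeline expect; the field names are simply the ones used in the code above:

import scrapy

class ZcrawlerItem(scrapy.Item):
    file_urls = scrapy.Field()
    referer = scrapy.Field()
    path = scrapy.Field()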

The output is then passed to pipeline.py, where I save the info in PostgreSQL and download the files:

import datetime
import hashlib
import os

import psycopg2

try:
    import urllib.request as urllib2
except ImportError:
    import urllib2

FILES_STORE = '<location to save files>'


class Pipeline(object):
    def __init__(self):
        self.conn = psycopg2.connect(user="postgres", password="pass",
                                     dbname="db_name",
                                     host='localhost')

    def process_item(self, item, spider):
        # Download the file first, then record its URL, referer and local path
        item['path'] = self.write_to_file(item['file_urls'])
        cur = self.conn.cursor()
        cur.execute('''
                insert into scrape (file_url, referer, path, created_date)
                values (%s, %s, %s, %s);
                ''', [
            item['file_urls'],
            item['referer'],
            item['path'],
            datetime.datetime.now()])
        self.conn.commit()
        return item

    def write_to_file(self, url):
        # Save the file into a directory named after the MD5 hash of its URL
        response = urllib2.urlopen(url)
        directory = FILES_STORE + str(hashlib.md5(url.encode('utf-8')).hexdigest()) + "/"
        if not os.path.exists(directory):
            os.makedirs(directory)
        file_name = url.split('/')[-1]
        with open(directory + str(file_name), "wb") as handle:
            handle.write(response.read())
        return directory + str(file_name)
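For the pipeline to actually run, it also has to be enabled in settings.py via ITEM_PIPELINES; the dotted path below assumes the class above lives in crawler/pipelines.py, so adjust it to wherever you put it:

ITEM_PIPELINES = {
    'crawler.pipelines.Pipeline': 300,
}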

Hope this will help someone, cheers (y)

Upvotes: 1
