Reputation: 87
I am creating a Python program using Scrapy that crawls a given domain and, whenever it finds PDFs, scans them for information (location of the PDF, number of pages, image count, field count, whether it is tagged, etc.) and writes all of this into a CSV file.
It downloads all the PDFs just fine, but when I open the CSV file, only a fraction of the downloaded files appear in it. I'm not sure what I am doing wrong. I thought perhaps I was not properly closing the file after opening it, but I'm not sure that's the problem. Code is below:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request
import urllib.parse as urlparse
import os.path
import validators
import csv
from ..info import isTagged
from ..get_metadata import get_data, count_images
from ..fieldCount import getFieldCount


class PdfspiderSpider(CrawlSpider):
    name = 'pdfspider'
    allowed_domain = input('Enter the domain name of the website to be crawled (domain of https://google.com is "google"): ')
    allowed_domains = [allowed_domain]
    # need domain to name folder pdfs will be put into
    global domain
    domain = allowed_domains[0]
    global start
    start = input('Enter the url of the page you wish to start the crawl on (include http/https): ')
    start_urls = [start]
    global base_path
    base_path = input('Where do you wish to save the folder containing the pdfs?: ')

    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        base_url = start
        for a in response.xpath('//a[@href]/@href'):
            link = a.extract()
            if link.endswith('.pdf'):
                link = urlparse.urljoin(base_url, link)
                yield Request(link, callback=self.save_pdf)

    def create_csv(self):
        header = ['location', 'title', 'author', '# of pages', 'tagged?', 'field count', 'image count']
        filename = base_path + '/' + domain + '/' + domain + '.csv'
        f = open(filename, 'x')
        writer = csv.writer(f)
        writer.writerow(header)
        f.close()

    def save_pdf(self, response):
        url = response.url
        if response.status == 200:
            save_dir = base_path + '/' + domain
            isExist = os.path.exists(save_dir)
            if not isExist:
                # Create a new directory because it does not exist
                os.makedirs(save_dir)
            csvFile = domain + '.csv'
            csvPath = save_dir + '/' + csvFile
            csvPathExist = os.path.exists(csvPath)
            if not csvPathExist:
                self.create_csv()
            file = response.url.split('/')[-1]
            full_path = os.path.join(save_dir, file)
            with open(full_path, 'wb') as f:
                f.write(response.body)
                is_tagged = isTagged(full_path)
                metaData = get_data(full_path)
                fieldCount = getFieldCount(full_path)
                imageCount = count_images(full_path)
                row = [url, metaData[0], metaData[1], metaData[2], is_tagged, fieldCount, imageCount]
                self.add_to_csv(row)
            f.close()
        else:
            print(f"Failed to load pdf: {url}")

    def add_to_csv(self, row):
        filename = base_path + '/' + domain + '/' + domain + '.csv'
        f = open(filename, 'a', newline='')
        writer = csv.writer(f)
        writer.writerow(row)
        f.close()
So I think it's the function "add_to_csv" that's the problem, but I can't figure out why. Any help would be appreciated.
Upvotes: 0
Views: 270
Reputation: 385
The issue is where you call the self.add_to_csv(row) method inside save_pdf(): right after that call you close the file again, which can leave incomplete information written to the CSV. What you can do is put your code in a try/except clause and close all files in a finally block, for example as sketched below.
There is nothing wrong with the logic in the add_to_csv() method itself.
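A minimal sketch of that suggestion applied to add_to_csv(); it assumes the same module-level globals (base_path, domain) and imports from the question, and the try/except/finally layout is just one way to structure it:

    def add_to_csv(self, row):
        filename = base_path + '/' + domain + '/' + domain + '.csv'
        f = None
        try:
            f = open(filename, 'a', newline='')
            writer = csv.writer(f)
            writer.writerow(row)
        except OSError as e:
            # Log the failure instead of silently losing the row
            # (self.logger is the standard Scrapy spider logger)
            self.logger.error(f"Could not write CSV row for {row[0]}: {e}")
        finally:
            # Close the CSV handle even if the write raised
            if f is not None:
                f.close()

Equivalently, keeping a with open(filename, 'a', newline='') as f: block in add_to_csv gives the same guarantee, since the context manager closes the file even if the write fails.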
Upvotes: 1