Mat
Mat

Reputation: 87

CSV file missing some data on write in python

I am creating a python program using scrapy that crawls a given domain and when it finds pdf's, it will scan them for information (location of pdf, num of pages, image count, field count, tagged, etc) and place all of this into a CSV file.

It downloads all the pdf's just fine, but when I open the csv file, only a fraction of the files downloaded are in the file. I'm not sure what I am doing wrong. I thought perhaps I was not properly closing the file once opened but I'm not sure that's the problem. Code is below:

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request
import urllib.parse as urlparse
import os.path
import validators
import csv
from .. info import isTagged
from .. get_metadata import get_data, count_images
from .. fieldCount import getFieldCount


class PdfspiderSpider(CrawlSpider):
    name = 'pdfspider' 
    allowed_domain = input('Enter the domain name of the website to be crawled (domain of https://google.com is "google"): ')
    allowed_domains = [allowed_domain]
    #need domain to name folder pdfs will be put into
    global domain
    domain = allowed_domains[0]
    global start
    start = input('Enter the url of the page you wish to start the crawl on (include http/https): ')
    start_urls = [start]
    global base_path
    base_path = input('Where do you wish to save the folder containing the pdfs?: ')

    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        base_url = start        
        for a in response.xpath('//a[@href]/@href'):
            link = a.extract()
            if link.endswith('.pdf'):
                link = urlparse.urljoin(base_url, link)
                yield Request(link, callback=self.save_pdf)

    def create_csv(self):
        header = ['location', 'title', 'author', '# of pages', 'tagged?', 'field count', 'image count']
        filename = base_path + '/' +domain + '/' + domain + '.csv'
        f = open(filename, 'x')
        writer = csv.writer(f)
        writer.writerow(header)
        f.close()


    def save_pdf(self, response):
        url=response.url
        if response.status == 200:
            save_dir = base_path + '/' + domain
            isExist = os.path.exists(save_dir)
            if not isExist:
                # Create a new directory because it does not exist 
                os.makedirs(save_dir)
            csvFile = domain + '.csv'
            csvPath = save_dir + '/' + csvFile
            csvPathExist = os.path.exists(csvPath)
            if not csvPathExist:
                self.create_csv()
            file = response.url.split('/')[-1]
            full_path = os.path.join(save_dir, file)
            with open(full_path, 'wb') as f:
                f.write(response.body)
                
                is_tagged = isTagged(full_path)
                metaData = get_data(full_path)
                fieldCount = getFieldCount(full_path)
                imageCount = count_images(full_path)
                row = [url, metaData[0], metaData[1], metaData[2], is_tagged, fieldCount, imageCount]
                self.add_to_csv(row)
                f.close()
            
        else:
            print(f"Failed to load pdf: {url}")

    def add_to_csv(self,row):
        filename = base_path + '/' +domain + '/' + domain + '.csv'
        f = open(filename, 'a', newline='')
        writer = csv.writer(f)
        writer.writerow(row)
        f.close()
        

So I think its the function "add_to_csv" thats the problem, but I can't figure out why. Any help woul be appreciated.

Upvotes: 0

Views: 270

Answers (1)

Amandeep Singh
Amandeep Singh

Reputation: 385

The issue is when you are calling

self.add_to_csv(row) method inside save_pdf() method

After calling this you are closing the file, which makes to write incomplete information to csv. What you can do is, put your code in try except clause and close all files in finally block.

Nothing wrong with the logic in add_to_csv() method.

Upvotes: 1

Related Questions