Reputation: 87
I am creating a Python program using Scrapy that crawls a given domain and, whenever it finds PDFs, scans them for information (location of the PDF, number of pages, image count, field count, whether it is tagged, etc.) and writes all of this into a CSV file.
It downloads all the PDFs just fine, but when I open the CSV file, only a fraction of the downloaded files appear in it. I'm not sure what I am doing wrong. I thought perhaps I was not properly closing the file after opening it, but I'm not sure that's the problem. Code is below:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request
import urllib.parse as urlparse
import os.path
import validators
import csv
from ..info import isTagged
from ..get_metadata import get_data, count_images
from ..fieldCount import getFieldCount


class PdfspiderSpider(CrawlSpider):
    name = 'pdfspider'
    allowed_domain = input('Enter the domain name of the website to be crawled (domain of https://google.com is "google"): ')
    allowed_domains = [allowed_domain]
    # need domain to name folder pdfs will be put into
    global domain
    domain = allowed_domains[0]
    global start
    start = input('Enter the url of the page you wish to start the crawl on (include http/https): ')
    start_urls = [start]
    global base_path
    base_path = input('Where do you wish to save the folder containing the pdfs?: ')

    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        base_url = start
        for a in response.xpath('//a[@href]/@href'):
            link = a.extract()
            if link.endswith('.pdf'):
                link = urlparse.urljoin(base_url, link)
                yield Request(link, callback=self.save_pdf)

    def create_csv(self):
        header = ['location', 'title', 'author', '# of pages', 'tagged?', 'field count', 'image count']
        filename = base_path + '/' + domain + '/' + domain + '.csv'
        f = open(filename, 'x')
        writer = csv.writer(f)
        writer.writerow(header)
        f.close()

    def save_pdf(self, response):
        url = response.url
        if response.status == 200:
            save_dir = base_path + '/' + domain
            isExist = os.path.exists(save_dir)
            if not isExist:
                # Create a new directory because it does not exist
                os.makedirs(save_dir)
            csvFile = domain + '.csv'
            csvPath = save_dir + '/' + csvFile
            csvPathExist = os.path.exists(csvPath)
            if not csvPathExist:
                self.create_csv()
            file = response.url.split('/')[-1]
            full_path = os.path.join(save_dir, file)
            with open(full_path, 'wb') as f:
                f.write(response.body)
                is_tagged = isTagged(full_path)
                metaData = get_data(full_path)
                fieldCount = getFieldCount(full_path)
                imageCount = count_images(full_path)
                row = [url, metaData[0], metaData[1], metaData[2], is_tagged, fieldCount, imageCount]
                self.add_to_csv(row)
            f.close()
        else:
            print(f"Failed to load pdf: {url}")

    def add_to_csv(self, row):
        filename = base_path + '/' + domain + '/' + domain + '.csv'
        f = open(filename, 'a', newline='')
        writer = csv.writer(f)
        writer.writerow(row)
        f.close()
So I think it's the function "add_to_csv" that's the problem, but I can't figure out why. Any help would be appreciated.
Upvotes: 0
Views: 270
Reputation: 385
The issue is where you call the self.add_to_csv(row) method inside save_pdf(): right after that call you close the file again, which can leave incomplete information written to the CSV. What you can do is put your code in a try/except clause and close all files in a finally block, for example as sketched below.
There is nothing wrong with the logic in the add_to_csv() method itself.
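A minimal sketch of that suggestion applied to add_to_csv(); it assumes the same module-level globals (base_path, domain) and imports from the question, and the try/except/finally layout is just one way to structure it:

    def add_to_csv(self, row):
        filename = base_path + '/' + domain + '/' + domain + '.csv'
        f = None
        try:
            f = open(filename, 'a', newline='')
            writer = csv.writer(f)
            writer.writerow(row)
        except OSError as e:
            # Log the failure instead of silently losing the row
            # (self.logger is the standard Scrapy spider logger)
            self.logger.error(f"Could not write CSV row for {row[0]}: {e}")
        finally:
            # Close the CSV handle even if the write raised
            if f is not None:
                f.close()

Equivalently, keeping a with open(filename, 'a', newline='') as f: block in add_to_csv gives the same guarantee, since the context manager closes the file even if the write fails.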
Upvotes: 1