I am trying to have this spider go through a list of 1600 URLs contained in a CSV and pull emails and phone numbers from each page. If anyone already has such a program I'd be happy to use it, but I would also love to know where I went wrong. Here is my code; I have passed it through ChatGPT to tighten it up and annotate it.
import scrapy
import pandas as pd
import os
import re
import logging


class Spider(scrapy.Spider):
    name = 'business_scrape'

    def extract_emails(self, text):
        # Extract email addresses using a comprehensive regex pattern
        emails = re.findall(
            r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', text)
        return emails

    def extract_phone(self, text):
        # Extract phone numbers
        phone_numbers = re.findall(
            r'(?:(?:\+\d{1,2}\s?)?\(?\d{3}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{4}', text)
        return phone_numbers

    def start_requests(self):
        # Read the initial CSV file with columns [name, url, category]
        csv = 'bozeman_businesses.csv'  # Specify your CSV file
        init_df = pd.read_csv(csv)
        for _, row in init_df.iterrows():
            name = row['name']
            url = row['url']
            category = row['category']
            yield scrapy.Request(url=url, callback=self.parse_link,
                                 meta={'name': name, 'category': category})

    def parse_link(self, response):
        name = response.meta['name']
        category = response.meta['category']

        # Initialize logging
        logging.basicConfig(
            filename='scrapy.log', format='%(levelname)s: %(message)s',
            level=logging.INFO)
        # Log the start of crawling
        logging.info('Crawling started.')

        for word in self.reject:
            if word in str(response.url):
                return

        html_text = str(response.text)
        try:
            # Extract email addresses using the function
            mail_list = self.extract_emails(html_text)
            # Extract phone numbers using the function
            phone_numbers = self.extract_phone(html_text)

            # Ensure 'email' and 'phone' lists have the same length
            min_length = min(len(mail_list), len(phone_numbers))
            mail_list = mail_list[:min_length]
            phone_numbers = phone_numbers[:min_length]

            dic = {'name': [name], 'category': [category], 'email': mail_list,
                   'phone': phone_numbers, 'url': [str(response.url)]}
        except Exception as e:
            # Handle the failure by setting "NA" values
            self.logger.error(f'Error scraping {response.url}: {e}')
            dic = {'name': [name], 'category': [category], 'email': ['NA'],
                   'phone': ['NA'], 'url': [str(response.url)]}

        # Check if the output file exists and prompt the user if it does
        if os.path.exists(self.path):
            response = self.ask_user('File already exists, replace?')
            if response is False:
                return

        # Create or overwrite the output file
        self.create_or_overwrite_file(self.path)

        # Append the data to the output CSV file
        df = pd.DataFrame(dic)
        df.to_csv(self.path, mode='a', header=False, index=False)

    # Define the reject list and output file path
    reject = ['example.com', 'example2.com']  # Adjust as needed
    path = 'output.csv'  # Adjust the output file path as needed

    def ask_user(self, question):
        response = input(question + ' y/n' + '\n')
        return response.lower() == 'y'

    def create_or_overwrite_file(self, path):
        response = False
        if os.path.exists(path):
            response = self.ask_user('File already exists, replace?')
            if response is False:
                return
        with open(path, 'wb') as file:
            file.close()
My log is pretty long, so here are some excerpts:
2023-09-21 15:51:02 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.hrblock.com/robots.txt> (referer: None)
2023-09-21 15:51:02 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <403 http://www.gallatinvalleytaxservices.com>: HTTP status code is not handled or not allowed
2023-09-21 15:51:03 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (308) to <GET https://www.amaticscpa.com/> from <GET http://www.amaticscpa.com>
2023-09-21 15:51:03 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.hrblock.com/> from <GET http://www.hrblock.com>
2023-09-21 15:51:03 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://amaticscpa.com/> from <GET https://www.amaticscpa.com/>
2023-09-21 15:51:03 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.hrblock.com/> (referer: None)
2023-09-21 15:51:03 [root] INFO: Crawling started.
Seems good so far.
File "/Users/me/opt/anaconda3/lib/python3.9/site-packages/pandas/core/internals/construction.py", line 502, in dict_to_mgr
return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
File "/Users/me/opt/anaconda3/lib/python3.9/site-packages/pandas/core/internals/construction.py", line 120, in arrays_to_mgr
index = _extract_index(arrays)
File "/Users/me/opt/anaconda3/lib/python3.9/site-packages/pandas/core/internals/construction.py", line 674, in _extract_index
raise ValueError("All arrays must be of the same length"
I feel like this error about array lengths is the problem. I tried adding 'NA' values when the process failed, but that didn't seem to work :(
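To illustrate what I think pandas is complaining about, here is a minimal example (separate from my spider, with made-up values): the single-item 'name', 'category', and 'url' lists end up alongside email/phone lists that can have zero or several items, and pd.DataFrame requires every column to have the same length.

import pandas as pd

# 'name', 'category', and 'url' always have exactly one item, but the
# truncated email/phone lists can still have 0 or several items.
data = {
    'name': ['Some Business'],
    'category': ['accounting'],
    'email': ['a@example.com', 'b@example.com'],
    'phone': ['406-555-1234', '406-555-5678'],
    'url': ['https://example.com'],
}

# Raises ValueError: All arrays must be of the same length
df = pd.DataFrame(data)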
This error also popped up.
2023-09-21 15:52:03 [scrapy.downloadermiddlewares.retry] ERROR: Gave up retrying <GET https://www.hpwcpas.com/robots.txt> (failed 3 times): [<twisted.python.failure.Failure twisted.internet.error.ConnectionLost: Connection to the other side was lost in a non-clean fashion: Connection lost.>]
2023-09-21 15:52:03 [scrapy.downloadermiddlewares.robotstxt] ERROR: Error downloading <GET http://www.hpwcpas.com/robots.txt>: [<twisted.python.failure.Failure twisted.internet.error.ConnectionLost: Connection to the other side was lost in a non-clean fashion: Connection lost.>]
The rest of the log is essentially repeated.
Update: I have made a few adjustments with a lot of success.
Solution 1: Move self.create_or_overwrite_file into start_requests rather than parse_link; add the DataFrame header when os.path.getsize(path) == 0; and remove the ask_user call from parse_link.
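Roughly what that looks like for me (a sketch, not verbatim; the header columns match the dict I build in parse_link):

    def start_requests(self):
        # Create/empty the output file once, up front, instead of inside parse_link
        self.create_or_overwrite_file(self.path)

        # Write the header row only when the file is empty
        if os.path.getsize(self.path) == 0:
            pd.DataFrame(columns=['name', 'category', 'email', 'phone', 'url']).to_csv(
                self.path, mode='a', header=True, index=False)

        init_df = pd.read_csv('bozeman_businesses.csv')
        for _, row in init_df.iterrows():
            yield scrapy.Request(url=row['url'], callback=self.parse_link,
                                 meta={'name': row['name'], 'category': row['category']})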
Solution 2: Add this to the start_requests function, since the CSV of URLs contains 'Not found' strings:
            # Check if the URL is "Not found" and skip it
            if url.strip() == 'Not found':
                self.logger.warning(f'Skipping invalid URL for {name}')
                continue
Solution 3: I made sure to convert each list to a comma-delimited string.
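Concretely, in parse_link I now build the dict along these lines (a sketch), so every column passed to pd.DataFrame has length 1:

            mail_list = self.extract_emails(html_text)
            phone_numbers = self.extract_phone(html_text)

            dic = {'name': [name],
                   'category': [category],
                   # join the regex results so each column is a single string
                   'email': [', '.join(mail_list) if mail_list else 'NA'],
                   'phone': [', '.join(phone_numbers) if phone_numbers else 'NA'],
                   'url': [str(response.url)]}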
I am still having problems with the general scraping method. I want the emails from these websites, but a more robust system would be needed to navigate through pages and metadata to grab them.
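One direction I am considering (not implemented yet, just a sketch) is following links that look like contact or about pages and running the same extraction on them:

    def parse_link(self, response):
        # ... existing email/phone extraction ...

        # Also queue up likely contact/about pages on the same site;
        # Scrapy's duplicate filter keeps this from looping forever.
        for href in response.css('a::attr(href)').getall():
            if any(key in href.lower() for key in ('contact', 'about')):
                yield response.follow(href, callback=self.parse_link,
                                      meta=response.meta)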
If you are copying me, make sure to exclude image files, often in the form [email protected]/jpg, from the email search.
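One way to do that filtering, as a sketch on top of my extract_emails (the extension list is just what I have run into so far):

    IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp')

    def extract_emails(self, text):
        emails = re.findall(
            r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', text)
        # Drop matches that are really image filenames (e.g. logo@2x.png),
        # which the pattern picks up because they contain an '@'.
        return [e for e in emails if not e.lower().endswith(self.IMAGE_EXTENSIONS)]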