invisibleufo101

Reputation: 45

How to run Scrapy in a while loop

So I'm doing a project that scrapes different websites using multiple spiders. I want the spiders to run again when the user answers "Yes" when asked to continue.

from scrapy.crawler import CrawlerProcess

keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True

while flag:

   process = CrawlerProcess()
   process.crawl(crawler1, keyword, page_range)
   process.crawl(crawler2, keyword, page_range)
   process.crawl(crawler3, keyword, page_range)
   process.start()

   isContinue = input("Do you want to continue? (y/n): ")

   if isContinue == 'n':
      flag = False

But I get an error saying reactor is not restartable.

Traceback (most recent call last):
  File "/Users/user/Desktop/programs/eshopSpider/eshopSpider.py", line 47, in <module>
    process.start()
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/scrapy/crawler.py", line 327, in start
    reactor.run(installSignalHandlers=False)  # blocking call
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 1317, in run
    self.startRunning(installSignalHandlers=installSignalHandlers)
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 1299, in startRunning
    ReactorBase.startRunning(cast(ReactorBase, self))
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 843, in startRunning
    raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable

So I guess using a while loop is a no-go. I don't know where to even start...

Upvotes: 4

Views: 1900

Answers (4)

JTraa

Reputation: 21

You can run spiders in a loop by installing the reactor at the top level, before any other Scrapy or reactor imports, and then deleting the reactor module after each crawl. This worked for me:

main.py

import time
from spider_utils import run_crawler

while True:
    run_crawler('spider1')
    run_crawler('spider2')
    time.sleep(60)  # wait a minute before the next round of crawls

spider_utils.py

from scrapy.utils.reactor import install_reactor
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def run_crawler(spider_name: str):
    """Run an isolated spider, then reset the reactor so another spider can run afterwards."""
    process = CrawlerProcess(get_project_settings())
    process.crawl(spider_name)
    process.start()  # blocks until the crawl finishes

    # drop the used reactor module so a fresh reactor can be installed on the next call
    import sys
    del sys.modules['twisted.internet.reactor']
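
If you prefer the question's interactive flow over a fixed delay, the same helper could be driven by a confirmation prompt instead (a sketch; 'spider1' and 'spider2' are placeholders for spider names registered in your project):

from spider_utils import run_crawler

while True:
    run_crawler('spider1')
    run_crawler('spider2')
    if input("Do you want to continue? (y/n): ") == 'n':
        break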

Upvotes: 2

Ataime Benson

Reputation: 1

import sys

from twisted.internet import reactor  # import the reactor only here; it is deleted again after each run

from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()
settings = get_project_settings()
runner = CrawlerRunner(settings)
d = runner.crawl('your spider name')   # name of a spider registered in your project
d.addBoth(lambda _: reactor.stop())
reactor.run()  # the script will block here until all crawling jobs are finished

# delete the used reactor, because we want to run this in a loop;
# a fresh reactor is installed so it can be imported again at the top
del sys.modules['twisted.internet.reactor']
from twisted.internet import default
default.install()
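
The comments above describe wrapping this in a loop; a sketch of what that could look like, with the spider name and number of runs as placeholders (how reliably a re-installed reactor behaves can depend on your Scrapy/Twisted versions):

import sys

from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()
settings = get_project_settings()

for run in range(3):                       # placeholder: however many runs you need
    from twisted.internet import reactor   # picks up the freshly installed reactor each pass

    runner = CrawlerRunner(settings)
    d = runner.crawl('your spider name')   # placeholder spider name from your project
    d.addBoth(lambda _: reactor.stop())
    reactor.run()                          # blocks until this crawl finishes

    # remove the used reactor and install a fresh default one for the next pass
    del sys.modules['twisted.internet.reactor']
    from twisted.internet import default
    default.install()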

Upvotes: 0

furas

Reputation: 142651

Method 1:

Scrapy creates a Reactor which can't be reused after it has been stopped, but if you run the Crawler in a separate process then the new process has to create its own new Reactor.

import multiprocessing
from scrapy.crawler import CrawlerProcess

def run_crawler(keyword, page_range):
   process = CrawlerProcess()
   process.crawl(crawler1, keyword, page_range)
   process.crawl(crawler2, keyword, page_range)
   process.crawl(crawler3, keyword, page_range)
   process.start()

# --- main ---

keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True

while flag:

   p = multiprocessing.Process(target=run_crawler, args=(keyword, page_range))
   p.start()
   p.join()

   isContinue = input("Do you want to continue? (y/n): ")

   if isContinue == 'n':
      flag = False

It will not work if you use threading instead of multiprocessing, because threads share the same process (and thus the same variables), so a new thread would use the same Reactor as the previous thread.


Minimal working code (tested on Linux).

import scrapy

class MySpider(scrapy.Spider):

    name = 'myspider'

    #start_urls = ['https://books.toscrape.com/']

    def __init__(self, keyword, page, *args, **kwargs):
        '''generate start_urls list'''
        super().__init__(*args, **kwargs)
        
        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)

        for book in response.css('article.product_pod'):
            title = book.css('h3 a::text').get()
            url = book.css('img::attr(src)').get()
            url = response.urljoin(url)
            yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}

# --- run without project and save in `output.csv` ---

import multiprocessing
from scrapy.crawler import CrawlerProcess

def run_crawler(keyword, page):
    #from scrapy.crawler import CrawlerProcess

    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        # save in file CSV, JSON or XML
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })
    c.crawl(MySpider, keyword, page)
    c.crawl(MySpider, keyword, int(page)+1)
    c.crawl(MySpider, keyword, int(page)+2)
    c.start()
    
# --- main ---

if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page    = input("enter page: ")
        
    running = True
    while running:

        p = multiprocessing.Process(target=run_crawler, args=(keyword, page))
        p.start()
        p.join()
        
        answer = input('Repeat [Y/n]? ').strip().lower()
        
        if answer == 'n':
            running = False

Method 2:

Found via Google: Restarting a Twisted Reactor.

It is an old post which uses del to remove the twisted.internet.reactor module from memory and then imports and installs it again.

keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True

while flag:

   process = CrawlerProcess()
   process.crawl(crawler1, keyword, page_range)
   process.crawl(crawler2, keyword, page_range)
   process.crawl(crawler3, keyword, page_range)
   process.start()

   isContinue = input("Do you want to continue? (y/n): ")

   if isContinue == 'n':
      flag = False
           
   import sys
   del sys.modules['twisted.internet.reactor']
   from twisted.internet import reactor
   from twisted.internet import default
   default.install()                  

Minimal working code (tested on Linux)

import scrapy

class MySpider(scrapy.Spider):

    name = 'myspider'

    #start_urls = ['https://books.toscrape.com/']

    def __init__(self, keyword, page, *args, **kwargs):
        '''generate start_urls list'''
        super().__init__(*args, **kwargs)
        
        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)

        for book in response.css('article.product_pod'):
            title = book.css('h3 a::text').get()
            url = book.css('img::attr(src)').get()
            url = response.urljoin(url)
            yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}

# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

def run_crawler(keyword, page):

    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        # save in file CSV, JSON or XML
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })
    c.crawl(MySpider, keyword, page)
    c.crawl(MySpider, keyword, int(page)+1)
    c.crawl(MySpider, keyword, int(page)+2)
    c.start()
    
# --- main ---

if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page    = input("enter page: ")
        
    running = True
    while running:
    
        run_crawler(keyword, page)
        
        answer = input('Repeat [Y/n]? ').strip().lower()
        
        if answer == 'n':
            running = False
            
        import sys
        del sys.modules['twisted.internet.reactor']
        from twisted.internet import reactor
        from twisted.internet import default
        default.install()            

Method 3:

It seems you could use CrawlerRunner instead of CrawlerProcess - but I didn't test it yet.

Based on the last example in the docs for Running multiple spiders in the same process, I created code which runs the while-loop inside the reactor (so it doesn't have to stop it). It first starts one spider, then runs the second spider, then asks for continuation and runs the first spider again, followed by the second one. It doesn't run both spiders at the same time, but maybe that could be changed somehow.

import scrapy

class MySpider(scrapy.Spider):

    name = 'myspider'

    #start_urls = ['https://books.toscrape.com/']

    def __init__(self, keyword, page, *args, **kwargs):
        '''generate start_urls list'''
        super().__init__(*args, **kwargs)
        
        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)

        for book in response.css('article.product_pod'):
            title = book.css('h3 a::text').get()
            url = book.css('img::attr(src)').get()
            url = response.urljoin(url)
            yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}

# --- run without project and save in `output.csv` ---

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

@defer.inlineCallbacks
def run_crawler():

    running = True
    while running:

        yield runner.crawl(MySpider, keyword, page)
        yield runner.crawl(MySpider, keyword, int(page)+1)
        yield runner.crawl(MySpider, keyword, int(page)+2)

        answer = input('Repeat [Y/n]? ').strip().lower()
    
        if answer == 'n':
            running = False
            reactor.stop()
            #return

# --- main ---

if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page    = input("enter page: ")

    configure_logging()        
    
    runner = CrawlerRunner({
        'USER_AGENT': 'Mozilla/5.0',
        # save in file CSV, JSON or XML
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })

    run_crawler()

    reactor.run()     

EDIT:

The same as above, but now all crawlers run at the same time: the crawls are scheduled together and runner.join() returns a deferred that fires once all of them have finished.

@defer.inlineCallbacks
def run_crawler():

    running = True
    while running:
    
        runner.crawl(MySpider, keyword, page)
        runner.crawl(MySpider, keyword, int(page)+1)
        runner.crawl(MySpider, keyword, int(page)+2)
        
        d = runner.join()
        yield d

        answer = input('Repeat [Y/n]? ').strip().lower()
    
        if answer == 'n':
            running = False
            reactor.stop()
            #return

Upvotes: 5

puh_dormouse

Reputation: 56

You can remove the while loop and use callbacks instead.

Edit: Example added:

def callback_f():
    # do the scraping work here
    calling_f()

def calling_f():
    answer = input("Continue? (y/n) ")
    if answer != 'n':
        callback_f()

callback_f()
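
To tie this back to Scrapy, here is a sketch of how such a callback chain could drive CrawlerRunner instead of a plain while loop (MySpider is a placeholder for your spider class; the pattern reuses CrawlerRunner and the reactor from the answer above):

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()

def crawl_once():
    # schedule one crawl; ask_to_continue runs when it finishes
    d = runner.crawl(MySpider)   # MySpider: placeholder for your spider class
    d.addBoth(ask_to_continue)

def ask_to_continue(_):
    answer = input("Continue? (y/n) ")
    if answer == 'n':
        reactor.stop()
    else:
        crawl_once()             # chain the next crawl instead of looping

crawl_once()
reactor.run()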

Upvotes: 0
