Reputation: 45
So I'm doing a project that scrapes different websites using multiple spiders. I want the spiders to run again when the user answers "Yes" to a prompt asking whether to continue.
keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True
while flag:
    process = CrawlerProcess()
    process.crawl(crawler1, keyword, page_range)
    process.crawl(crawler2, keyword, page_range)
    process.crawl(crawler3, keyword, page_range)
    process.start()

    isContinue = input("Do you want to continue? (y/n): ")
    if isContinue == 'n':
        flag = False
But I get an error saying the reactor is not restartable:
Traceback (most recent call last):
  File "/Users/user/Desktop/programs/eshopSpider/eshopSpider.py", line 47, in <module>
    process.start()
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/scrapy/crawler.py", line 327, in start
    reactor.run(installSignalHandlers=False)  # blocking call
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 1317, in run
    self.startRunning(installSignalHandlers=installSignalHandlers)
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 1299, in startRunning
    ReactorBase.startRunning(cast(ReactorBase, self))
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 843, in startRunning
    raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable
So I guess using a while loop is a no-go. I don't know where to even start...
Upvotes: 4
Views: 1900
Reputation: 21
You can run spiders in a loop by installing the reactor at the top level, before any other Scrapy or Twisted imports, and then deleting the reactor module after each crawl. This worked for me:
main.py
import time

from spider_utils import run_crawler

while 1:
    run_crawler('spider1')
    run_crawler('spider2')
    time.sleep(60)
spider_utils.py
from scrapy.utils.reactor import install_reactor

install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def run_crawler(spider_name: str):
    """Run an isolated spider and restart the reactor to run another spider afterwards."""
    process = CrawlerProcess(get_project_settings())
    process.crawl(spider_name)
    process.start()

    import sys
    del sys.modules['twisted.internet.reactor']
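To connect this to the question's keyword and page_range inputs: CrawlerProcess.crawl() forwards extra positional and keyword arguments to the spider's constructor, so run_crawler could be extended like this (a sketch, not part of the original answer, reusing the imports from spider_utils.py above):

def run_crawler(spider_name: str, *args, **kwargs):
    """Run one isolated crawl, then drop the used reactor so the next call gets a fresh one."""
    process = CrawlerProcess(get_project_settings())
    process.crawl(spider_name, *args, **kwargs)  # extra arguments reach the spider's __init__
    process.start()

    import sys
    del sys.modules['twisted.internet.reactor']

# in main.py:
# run_crawler('spider1', keyword=keyword, page_range=page_range)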
Upvotes: 2
Reputation: 1
import sys

from twisted.internet import reactor  # only this reactor import belongs at the top; we delete the reactor after each run
from twisted.internet import default
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()
settings = get_project_settings()
runner = CrawlerRunner(settings)
d = runner.crawl('your_spider_name')  # your spider class or spider name goes here
d.addBoth(lambda _: reactor.stop())
reactor.run()  # the script will block here until all crawling jobs are finished
del sys.modules['twisted.internet.reactor']  # delete the reactor, because we want to run this in a loop; it gets imported again at the top of the next iteration
default.install()
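The comments above refer to running this in a loop; a minimal sketch of that loop (not part of the original answer, with 'your_spider_name' as a placeholder and assuming the default Twisted reactor) could look like:

import sys
from twisted.internet import default
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()

for _ in range(3):  # or loop until the user answers 'n'
    from twisted.internet import reactor  # re-imported on every iteration
    runner = CrawlerRunner(get_project_settings())
    d = runner.crawl('your_spider_name')
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # blocks until this crawl finishes
    del sys.modules['twisted.internet.reactor']  # throw away the used reactor ...
    default.install()  # ... and install a fresh one for the next iteration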
Upvotes: 0
Reputation: 142651
Method 1:

Scrapy creates a Reactor which can't be reused after it is stopped, but if you run the Crawler in a separate process, then the new process has to create its own new Reactor.
import multiprocessing

def run_crawler(keyword, page_range):
    process = CrawlerProcess()
    process.crawl(crawler1, keyword, page_range)
    process.crawl(crawler2, keyword, page_range)
    process.crawl(crawler3, keyword, page_range)
    process.start()
# --- main ---

keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True
while flag:
    p = multiprocessing.Process(target=run_crawler, args=(keyword, page_range))
    p.start()
    p.join()

    isContinue = input("Do you want to continue? (y/n): ")
    if isContinue == 'n':
        flag = False
It will not work if you use threading instead of multiprocessing, because threads share memory, so a new thread would use the same Reactor as the previous thread.
Minimal working code (tested on Linux).
import scrapy


class MySpider(scrapy.Spider):

    name = 'myspider'

    #start_urls = ['https://books.toscrape.com/']

    def __init__(self, keyword, page, *args, **kwargs):
        '''generate start_urls list'''
        super().__init__(*args, **kwargs)

        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)

        for book in response.css('article.product_pod'):
            title = book.css('h3 a::text').get()
            url = book.css('img::attr(src)').get()
            url = response.urljoin(url)
            yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}
# --- run without project and save in `output.csv` ---

import multiprocessing
from scrapy.crawler import CrawlerProcess

def run_crawler(keyword, page):
    #from scrapy.crawler import CrawlerProcess
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        # save in a CSV, JSON or XML file
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })
    c.crawl(MySpider, keyword, page)
    c.crawl(MySpider, keyword, int(page)+1)
    c.crawl(MySpider, keyword, int(page)+2)
    c.start()
# --- main ---

if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page = input("enter page: ")

    running = True
    while running:
        p = multiprocessing.Process(target=run_crawler, args=(keyword, page))
        p.start()
        p.join()

        answer = input('Repeat [Y/n]? ').strip().lower()
        if answer == 'n':
            running = False
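If the parent process also needs the scraped data, one simple option (not part of the original answer, reusing run_crawler and MySpider from above) is to read back the output.csv file that the child process writes via the FEEDS setting:

import csv
import multiprocessing

if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page = input("enter page: ")

    p = multiprocessing.Process(target=run_crawler, args=(keyword, page))
    p.start()
    p.join()  # wait until the child process (and its Reactor) has finished

    # the child process wrote the items to `output.csv` via the FEEDS setting
    with open('output.csv', newline='') as f:
        items = list(csv.DictReader(f))
    print('scraped', len(items), 'items')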
Method 2:

Found in Google: Restarting a Twisted Reactor. It is an old post which uses del to remove the reactor module from memory and later imports it again.
keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True
while flag:
    process = CrawlerProcess()
    process.crawl(crawler1, keyword, page_range)
    process.crawl(crawler2, keyword, page_range)
    process.crawl(crawler3, keyword, page_range)
    process.start()

    isContinue = input("Do you want to continue? (y/n): ")
    if isContinue == 'n':
        flag = False

    import sys
    del sys.modules['twisted.internet.reactor']
    from twisted.internet import reactor
    from twisted.internet import default
    default.install()
Minimal working code (tested on Linux)
import scrapy


class MySpider(scrapy.Spider):

    name = 'myspider'

    #start_urls = ['https://books.toscrape.com/']

    def __init__(self, keyword, page, *args, **kwargs):
        '''generate start_urls list'''
        super().__init__(*args, **kwargs)

        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)

        for book in response.css('article.product_pod'):
            title = book.css('h3 a::text').get()
            url = book.css('img::attr(src)').get()
            url = response.urljoin(url)
            yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}
# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

def run_crawler(keyword, page):
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        # save in a CSV, JSON or XML file
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })
    c.crawl(MySpider, keyword, page)
    c.crawl(MySpider, keyword, int(page)+1)
    c.crawl(MySpider, keyword, int(page)+2)
    c.start()
# --- main ---

if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page = input("enter page: ")

    running = True
    while running:
        run_crawler(keyword, page)

        answer = input('Repeat [Y/n]? ').strip().lower()
        if answer == 'n':
            running = False

        import sys
        del sys.modules['twisted.internet.reactor']
        from twisted.internet import reactor
        from twisted.internet import default
        default.install()
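The reactor cleanup at the end of the loop can also be pulled into a small helper so the loop body stays readable - a sketch of the same trick, just refactored:

import sys

def reset_reactor():
    '''Throw away the used reactor so a fresh one is installed for the next run.'''
    del sys.modules['twisted.internet.reactor']
    from twisted.internet import reactor
    from twisted.internet import default
    default.install()

The last four lines of the while loop above then become a single reset_reactor() call.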
Method 3:

It seems you could use CrawlerRunner instead of CrawlerProcess - but I didn't test it yet.

Based on the last example in the docs for Running multiple spiders in the same process, I created code which runs the while loop inside the reactor (so it doesn't have to stop it). It first starts one Spider, then runs the second Spider, then asks for continuation, and then runs the first Spider again, followed by the second Spider. It doesn't run both Spiders at the same time, but maybe that could somehow be changed (see the EDIT below).
import scrapy


class MySpider(scrapy.Spider):

    name = 'myspider'

    #start_urls = ['https://books.toscrape.com/']

    def __init__(self, keyword, page, *args, **kwargs):
        '''generate start_urls list'''
        super().__init__(*args, **kwargs)

        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)

        for book in response.css('article.product_pod'):
            title = book.css('h3 a::text').get()
            url = book.css('img::attr(src)').get()
            url = response.urljoin(url)
            yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}
# --- run without project and save in `output.csv` ---

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

@defer.inlineCallbacks
def run_crawler():
    running = True
    while running:
        yield runner.crawl(MySpider, keyword, page)
        yield runner.crawl(MySpider, keyword, int(page)+1)
        yield runner.crawl(MySpider, keyword, int(page)+2)

        answer = input('Repeat [Y/n]? ').strip().lower()
        if answer == 'n':
            running = False
            reactor.stop()
            #return

# --- main ---

if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page = input("enter page: ")

    configure_logging()

    runner = CrawlerRunner({
        'USER_AGENT': 'Mozilla/5.0',
        # save in a CSV, JSON or XML file
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })

    run_crawler()

    reactor.run()
EDIT:

The same as above, but now all crawlers run at the same time:
@defer.inlineCallbacks
def run_crawler():
    running = True
    while running:
        runner.crawl(MySpider, keyword, page)
        runner.crawl(MySpider, keyword, int(page)+1)
        runner.crawl(MySpider, keyword, int(page)+2)
        d = runner.join()
        yield d

        answer = input('Repeat [Y/n]? ').strip().lower()
        if answer == 'n':
            running = False
            reactor.stop()
            #return
Upvotes: 5
Reputation: 56
You can remove the while loop and use callbacks instead.
Edit: Example added:
def callback_f():
    # stuff #
    calling_f()

def calling_f():
    answer = input("Continue? (y/n)")
    if not answer == 'n':
        callback_f()

callback_f()
Upvotes: 0