Reputation: 2039
I'm using Scrapy as a crawler in Python. My problem is that I can't start multiple crawl jobs in parallel.
getJobs
def getJobs(self):
    mysql = MysqlConnector.Mysql()
    db = mysql.getConnection()
    cur = db.cursor()
    cur.execute("SELECT * FROM job WHERE status=0 OR days>0")
    print "Get new jobs"
    # Build a JobModel for every row returned by the query
    joblist = []
    for row in cur.fetchall():
        job = JobModel.JobModel()
        job.id = row[0]
        job.user_id = row[1]
        job.name = row[2]
        job.url = row[3]
        job.api = row[4]
        job.max_pages = row[5]
        job.crawl_depth = row[6]
        job.processing_patterns = row[7]
        job.status = row[8]
        job.days = row[9]
        job.ajax = row[11]
        joblist.append(job)
    # Process each job now, in its own daemon thread
    for job in joblist:
        processJob = ProcessJob.ProcessJob()
        th = Thread(target=processJob.processJob, args=(job,))
        th.daemon = True
        th.start()
    db.close()
ProcessJob
def processJob(self, job):
    # Mark the job as running in the database
    mysql = MysqlConnector.Mysql()
    db = mysql.getConnection()
    cur = db.cursor()
    job.status = 1
    update = "UPDATE job SET status=1 WHERE id=" + str(job.id)
    cur.execute(update)
    db.commit()
    db.close()
    # Start a new crawler for this job
    crawler = spider.MySpider
    print job.ajax
    if job.ajax == 1:
        crawler.custom_settings = CrawlerSettings.ajax_settings
    else:
        crawler.custom_settings = CrawlerSettings.normal_settings
    configure_logging()
    runner = CrawlerRunner()
    runner.crawl(crawler, job=job)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run(0)
getJobs retrieves new jobs from the database every 5 seconds and hands each one to processJob (the polling loop itself is sketched at the end of this question). The problem is that when I start multiple crawl jobs, I get the following exception:
Traceback (most recent call last):
  File "/usr/local/Cellar/python/2.7.9/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/usr/local/Cellar/python/2.7.9/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 763, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/Users/fabianlurz/c_crawler/c_crawler/jobs/ProcessJob.py", line 31, in processJob
    reactor.run(0)
  File "/usr/local/lib/python2.7/site-packages/twisted/internet/base.py", line 1193, in run
    self.startRunning(installSignalHandlers=installSignalHandlers)
  File "/usr/local/lib/python2.7/site-packages/twisted/internet/base.py", line 1173, in startRunning
    ReactorBase.startRunning(self)
  File "/usr/local/lib/python2.7/site-packages/twisted/internet/base.py", line 682, in startRunning
    raise error.ReactorAlreadyRunning()
I already know that I can't start the reactor twice, but there must be a way to run multiple crawling instances on one "server". How can I accomplish that?
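For reference, getJobs is driven by a simple polling loop, roughly like the sketch below (simplified; jobManager stands in for the object that owns getJobs and is not part of the snippets above):

import time

# Poll the database every 5 seconds and hand any new jobs to processJob
# (via the daemon threads started inside getJobs)
while True:
    jobManager.getJobs()
    time.sleep(5)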
Upvotes: 0
Views: 1811
Reputation: 2039
Got it working
from billiard import Process
from model import CrawlerSettings
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from spiders import spider
from twisted.internet import reactor
from twisted.internet.protocol import Protocol
from utility import MysqlConnector
class ProcessJob():
    def processJob(self, job):
        # Mark the job as running in the database
        mysql = MysqlConnector.Mysql()
        db = mysql.getConnection()
        cur = db.cursor()
        job.status = 1
        update = "UPDATE job SET status=1 WHERE id=" + str(job.id)
        cur.execute(update)
        db.commit()
        db.close()
        # Start a new crawler in its own process
        configure_logging()
        webspider = spider.MySpider
        if job.ajax == 1:
            webspider.custom_settings = CrawlerSettings.ajax_settings
        else:
            webspider.custom_settings = CrawlerSettings.normal_settings
        crawler = UrlCrawlerScript(webspider, job)
        crawler.start()

class UrlCrawlerScript(Process):
    def __init__(self, spider, job):
        Process.__init__(self)
        self.crawler = CrawlerRunner()
        self.crawler.crawl(spider, job=job)

    def run(self):
        # Runs in the child process, so every job gets its own reactor
        # and the parent never hits ReactorAlreadyRunning
        d = self.crawler.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run(0)
Using billiard to spawn multiple processes, so each crawl job runs its own reactor in a separate process.
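For completeness, this is roughly how the jobs end up being handed over (a minimal sketch; the c_crawler.jobs import path is inferred from the traceback above, and joblist is the list built in getJobs):

from c_crawler.jobs import ProcessJob

# Each call forks a billiard Process that runs its own Twisted reactor,
# so the jobs crawl in parallel without ever starting the reactor twice
# in the same process.
for job in joblist:
    ProcessJob.ProcessJob().processJob(job)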
Upvotes: 1