Reputation: 2039
I'm using Scrapy as a crawler in Python. My problem is that I can't start multiple crawl jobs in parallel.
getJobs
def getJobs(self):
    mysql = MysqlConnector.Mysql()
    db = mysql.getConnection()
    cur = db.cursor()
    cur.execute("SELECT * FROM job WHERE status=0 OR days>0")
    print "Get new jobs"
    # Build a JobModel for every row returned by the query
    joblist = []
    for row in cur.fetchall():
        job = JobModel.JobModel()
        job.id = row[0]
        job.user_id = row[1]
        job.name = row[2]
        job.url = row[3]
        job.api = row[4]
        job.max_pages = row[5]
        job.crawl_depth = row[6]
        job.processing_patterns = row[7]
        job.status = row[8]
        job.days = row[9]
        job.ajax = row[11]
        joblist.append(job)
    # Process each job now, in its own daemon thread
    for job in joblist:
        processJob = ProcessJob.ProcessJob()
        th = Thread(target=processJob.processJob, args=(job,))
        th.daemon = True
        th.start()
    db.close()
ProcessJob
def processJob(self, job):
    # Mark the job as running in the database
    mysql = MysqlConnector.Mysql()
    db = mysql.getConnection()
    cur = db.cursor()
    job.status = 1
    update = "UPDATE job SET status=1 WHERE id=" + str(job.id)
    cur.execute(update)
    db.commit()
    db.close()
    # Start a new crawler for this job
    crawler = spider.MySpider
    print job.ajax
    if job.ajax == 1:
        crawler.custom_settings = CrawlerSettings.ajax_settings
    else:
        crawler.custom_settings = CrawlerSettings.normal_settings
    configure_logging()
    runner = CrawlerRunner()
    runner.crawl(crawler, job=job)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run(0)
getJobs retrieves new jobs from the database every 5 seconds and hands each one to processJob (the polling loop itself is sketched at the end of this question). The problem is that when I start multiple crawl jobs, I get the following exception:
Traceback (most recent call last):
  File "/usr/local/Cellar/python/2.7.9/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/usr/local/Cellar/python/2.7.9/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 763, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/Users/fabianlurz/c_crawler/c_crawler/jobs/ProcessJob.py", line 31, in processJob
    reactor.run(0)
  File "/usr/local/lib/python2.7/site-packages/twisted/internet/base.py", line 1193, in run
    self.startRunning(installSignalHandlers=installSignalHandlers)
  File "/usr/local/lib/python2.7/site-packages/twisted/internet/base.py", line 1173, in startRunning
    ReactorBase.startRunning(self)
  File "/usr/local/lib/python2.7/site-packages/twisted/internet/base.py", line 682, in startRunning
    raise error.ReactorAlreadyRunning()
I already know that I can't start the reactor twice, but there must be a way to run multiple crawling instances on one "server". How can I accomplish that?
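For reference, getJobs is driven by a simple polling loop, roughly like the sketch below (simplified; jobManager stands in for the object that owns getJobs and is not part of the snippets above):

import time

# Poll the database every 5 seconds and hand any new jobs to processJob
# (via the daemon threads started inside getJobs)
while True:
    jobManager.getJobs()
    time.sleep(5)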
Upvotes: 0
Views: 1811
Reputation: 2039
Got it working
from billiard import Process
from model import CrawlerSettings
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from spiders import spider
from twisted.internet import reactor
from twisted.internet.protocol import Protocol
from utility import MysqlConnector
class ProcessJob():
    def processJob(self, job):
        # Mark the job as running in the database
        mysql = MysqlConnector.Mysql()
        db = mysql.getConnection()
        cur = db.cursor()
        job.status = 1
        update = "UPDATE job SET status=1 WHERE id=" + str(job.id)
        cur.execute(update)
        db.commit()
        db.close()
        # Start a new crawler in its own process
        configure_logging()
        webspider = spider.MySpider
        if job.ajax == 1:
            webspider.custom_settings = CrawlerSettings.ajax_settings
        else:
            webspider.custom_settings = CrawlerSettings.normal_settings
        crawler = UrlCrawlerScript(webspider, job)
        crawler.start()

class UrlCrawlerScript(Process):
    def __init__(self, spider, job):
        Process.__init__(self)
        self.crawler = CrawlerRunner()
        self.crawler.crawl(spider, job=job)

    def run(self):
        # Runs in the child process, so every job gets its own reactor
        # and the parent never hits ReactorAlreadyRunning
        d = self.crawler.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run(0)
Using billiard to spawn multiple processes, so each crawl job runs its own reactor in a separate process.
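For completeness, this is roughly how the jobs end up being handed over (a minimal sketch; the c_crawler.jobs import path is inferred from the traceback above, and joblist is the list built in getJobs):

from c_crawler.jobs import ProcessJob

# Each call forks a billiard Process that runs its own Twisted reactor,
# so the jobs crawl in parallel without ever starting the reactor twice
# in the same process.
for job in joblist:
    ProcessJob.ProcessJob().processJob(job)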
Upvotes: 1