Reputation: 41
I have deployed Scrapyd as a Docker container on Google Cloud Run. When I run the container locally, everything works fine. But when I deploy the same container to Google Cloud Run, spider jobs are not removed from the running queue. The jobs finish, but they stay in the queue. Any thoughts?
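The symptom is visible through Scrapyd's listjobs.json API: finished jobs never move from the running list to the finished list. A minimal check (assuming Scrapyd's default port 6800; "myproject" is a placeholder project name):

import json
from urllib.request import urlopen

# Poll Scrapyd's listjobs.json endpoint and print the queue sizes.
resp = urlopen('http://localhost:6800/listjobs.json?project=myproject')
data = json.loads(resp.read())
for state in ('pending', 'running', 'finished'):
    print(state, len(data.get(state, [])))
# On Cloud Run, jobs accumulate under "running" even after the spiders finish.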
Upvotes: 0
Views: 276
Reputation: 1
I met the same issue when deploying Scrapyd on Cloud Run. The likely cause is that the subprocess exits but Twisted's processEnded method is never invoked: processExited fires as soon as the child exits, while processEnded fires only after the process has been reaped and all of its pipes are closed, and that second condition apparently never holds in Cloud Run's sandbox. Adding a processExited method that frees the slot solved the issue.
I worked around it temporarily with my own launcher:
import sys
from datetime import datetime
from multiprocessing import cpu_count

from twisted.application.service import Service
from twisted.internet import defer, error, protocol, reactor
from twisted.python import log

from scrapyd import __version__
from scrapyd.interfaces import IEnvironment, IJobStorage, IPoller
from scrapyd.utils import get_crawl_args, native_stringify_dict


class Launcher(Service):

    name = 'launcher'

    def __init__(self, config, app):
        self.processes = {}
        self.finished = app.getComponent(IJobStorage)
        self.max_proc = self._get_max_proc(config)
        self.runner = config.get('runner', 'scrapyd.runner')
        self.app = app

    def startService(self):
        for slot in range(self.max_proc):
            self._wait_for_project(slot)
        log.msg(format='Scrapyd %(version)s started: max_proc=%(max_proc)r, runner=%(runner)r',
                version=__version__, max_proc=self.max_proc,
                runner=self.runner, system='Launcher')

    def _wait_for_project(self, slot):
        poller = self.app.getComponent(IPoller)
        poller.next().addCallback(self._spawn_process, slot)

    def _spawn_process(self, message, slot):
        msg = native_stringify_dict(message, keys_only=False)
        project = msg['_project']
        args = [sys.executable, '-m', self.runner, 'crawl']
        args += get_crawl_args(msg)
        e = self.app.getComponent(IEnvironment)
        env = e.get_environment(msg, slot)
        env = native_stringify_dict(env, keys_only=False)
        pp = ScrapyProcessProtocol(slot, project, msg['_spider'],
                                   msg['_job'], env)
        pp.deferred.addBoth(self._process_finished, slot)
        reactor.spawnProcess(pp, sys.executable, args=args, env=env)
        self.processes[slot] = pp

    def _process_finished(self, _, slot):
        # Free the slot and record the job so it leaves the running queue.
        process = self.processes.pop(slot)
        process.end_time = datetime.now()
        self.finished.add(process)
        self._wait_for_project(slot)

    def _get_max_proc(self, config):
        max_proc = config.getint('max_proc', 0)
        if not max_proc:
            try:
                cpus = cpu_count()
            except NotImplementedError:
                cpus = 1
            max_proc = cpus * config.getint('max_proc_per_cpu', 4)
        return max_proc
class ScrapyProcessProtocol(protocol.ProcessProtocol):

    def __init__(self, slot, project, spider, job, env):
        self.slot = slot
        self.pid = None
        self.project = project
        self.spider = spider
        self.job = job
        self.start_time = datetime.now()
        self.end_time = None
        self.env = env
        self.logfile = env.get('SCRAPY_LOG_FILE')
        self.itemsfile = env.get('SCRAPY_FEED_URI')
        self.deferred = defer.Deferred()

    def outReceived(self, data):
        log.msg(data.rstrip(), system="Launcher,%d/stdout" % self.pid)

    def errReceived(self, data):
        log.msg(data.rstrip(), system="Launcher,%d/stderr" % self.pid)

    def connectionMade(self):
        self.pid = self.transport.pid
        self.log("Process started: ")

    def processEnded(self, status):
        if self.deferred.called:
            # Already handled via processExited below; don't fire the deferred twice.
            return
        if isinstance(status.value, error.ProcessDone):
            self.log("Process finished: ")
        else:
            self.log("Process died: exitstatus=%r " % status.value.exitCode)
        self.deferred.callback(self)

    # On Cloud Run, processEnded is never invoked, only processExited;
    # forward it so the slot is freed and the job leaves the running queue.
    def processExited(self, status):
        self.processEnded(status)

    def log(self, action):
        fmt = '%(action)s project=%(project)r spider=%(spider)r job=%(job)r pid=%(pid)r log=%(log)r items=%(items)r'
        log.msg(format=fmt, action=action, project=self.project, spider=self.spider,
                job=self.job, pid=self.pid, log=self.logfile, items=self.itemsfile)
Finally, you should change the scrapyd.conf file so Scrapyd uses the custom launcher:

[scrapyd]
launcher = {your launcher module}.Launcher
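To see which lifecycle callback a given platform actually delivers, here is a minimal probe sketch, assuming only Twisted is installed; it spawns /bin/true as a trivial child process and logs what fires:

from twisted.internet import protocol, reactor

class ProbeProtocol(protocol.ProcessProtocol):
    def processExited(self, status):
        # Fires as soon as the child process exits.
        print('processExited:', status.value)

    def processEnded(self, status):
        # Fires only once the process is reaped and all its pipes are closed.
        print('processEnded:', status.value)

reactor.spawnProcess(ProbeProtocol(), '/bin/true', args=['/bin/true'], env={})
reactor.callLater(10, reactor.stop)  # stop after a grace period either way
reactor.run()

On a normal Linux host both lines are printed; in an environment like Cloud Run's sandbox, only processExited may appear, which is exactly why the launcher above forwards it.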
Upvotes: 0
Reputation: 1142
As mentioned on GitHub:
Close the webdriver when the spider closes, as follows:
import scrapy
from selenium import webdriver

PHANTOMJS = '/path/to/phantomjs'  # placeholder: path to the PhantomJS binary

class NewsDuowanSpider(scrapy.Spider):
    name = 'news_duowan'  # hypothetical; the original snippet omits the name

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # WebKit driver
        self.driver = webdriver.PhantomJS(executable_path=PHANTOMJS,
                                          service_log_path='/tmp/ghostdriver.log')
        self.driver.implicitly_wait(1)
        self.driver.set_page_load_timeout(3)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(NewsDuowanSpider, cls).from_crawler(crawler, *args, **kwargs)
        # Shut the driver down when the spider closes.
        crawler.signals.connect(spider.spider_closed, signal=scrapy.signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        spider.logger.info('Spider closed: %s', spider.name)
        spider.driver.quit()
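As a side note: if a spider defines a closed() method, Scrapy connects it to the spider_closed signal automatically, so the from_crawler wiring above can be replaced by this shortcut:

    # Equivalent shortcut: Scrapy calls closed() automatically on spider close.
    def closed(self, reason):
        self.driver.quit()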
For more information, you can refer to the link and documentation.
Upvotes: 0