Reputation: 2039
I'm using a selenium middleware in my scrapy crawler:
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
class JSMiddleware(object):
    def __init__(self):
        # Start from the default PhantomJS capabilities and adjust them,
        # instead of overwriting the dict on the next assignment
        dcaps = dict(DesiredCapabilities.PHANTOMJS)
        service = ['--ignore-ssl-errors=true', '--ssl-protocol=any', '--web-security=false']
        dcaps.update({'handlesAlerts': False, 'javascriptEnabled': True, 'takesScreenshot': False})
        dcaps["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36")
        self.driver = webdriver.PhantomJS(desired_capabilities=dcaps, service_args=service)
        self.driver.set_window_size(1120, 550)
        self.driver.set_page_load_timeout(15)
    def ajax_complete(self, driver):
        jquery = False
        jscomplete = False
        try:
            jquery = (0 == driver.execute_script("return jQuery.active"))
        except WebDriverException:
            pass
        try:
            if driver.execute_script("return document.readyState") == "complete":
                jscomplete = True
        except WebDriverException:
            pass
        return jquery and jscomplete

    def process_request(self, request, spider):
        self.driver.get(request.url)
        WebDriverWait(self.driver, 20).until(
            self.ajax_complete, "Wait till loaded")
        body = self.driver.page_source
        response = HtmlResponse(self.driver.current_url, body=body, encoding='utf-8', request=request)
        return response
When the crawler is done, it waits for new incoming jobs (it basically retrieves job information via MySQL).
The problem is that PhantomJS stays open, which creates a memory leak. How and where should I close it?
Here is my spider:
from bs4 import BeautifulSoup
from items import Item
from jobs import DoneJob
from model import CrawlerSettings
import re
from readability.readability import Document
from scrapy.exceptions import CloseSpider
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider
from scrapy.spiders import Rule
from urlparse import urlparse
from utility import MysqlConnector
class MySpider(CrawlSpider):
    def __init__(self, job):
        self.counter = 0
        self.itemCounter = 0
        # Get the hosts
        self.job = job
        allowedDomainsPre = job.url.split(",")
        allowedDomains = []
        for domains in allowedDomainsPre:
            parsed_uri = urlparse(domains)
            domain = '{uri.netloc}'.format(uri=parsed_uri)
            print "DOMAIN"
            print domain
            allowedDomains.append(domain)
        self.allowed_domains = allowedDomains
        self.start_urls = allowedDomainsPre
        # Get job patterns
        jobPatterns = job.processing_patterns.split(",")
        allowedPatterns = []
        deniedPatterns = []
        for pattern in jobPatterns:
            if '-' in pattern:
                deniedPatterns.append(pattern.replace("-", ""))
            else:
                allowedPatterns.append(pattern)
        self._rules = [
            Rule(LinkExtractor(allow=(allowedPatterns), deny=(deniedPatterns)), callback=self.parse_items, follow=True)
        ]
        self.name = job.id
        self.settings = CrawlerSettings.normal_settings

    def closed(self, spider):
        # stats = spider.crawler.stats.get_stats()
        itemCount = 0
        if self.itemCounter:
            itemCount = self.itemCounter
        DoneJob.DoneJob().jobDone(self.job, itemCount)

    def parse_items(self, response):
        item = Item()
        if self.counter >= 30:
            self.checkActive()
        # if the user wants a minimum description
        if self.job.min_description > 0:
            item['html'] = response.body
            item['url'] = response.url
            # Job
            item['job'] = {}
            item['job']['id'] = self.job.id
            item['job']['user_id'] = self.job.user_id
            item['job']['name'] = self.job.name
            item['job']['url'] = self.job.url
            item['job']['api'] = self.job.api
            item['job']['max_pages'] = self.job.max_pages
            item['job']['crawl_depth'] = self.job.crawl_depth
            item['job']['processing_patterns'] = self.job.processing_patterns
            item['job']['days'] = self.job.days
            item['job']['ajax'] = self.job.ajax
            item['job']['min_description'] = self.job.min_description
            soup = BeautifulSoup(response.body, 'html.parser')
            article = Document(soup.prettify()).summary()
            article_soup = BeautifulSoup(article)
            text = re.sub(' +', ' ', article_soup.get_text().rstrip())
            text_length = len(text.split(' '))
            if text_length > self.job.min_description:
                self.counter = self.counter + 1
                self.itemCounter = self.itemCounter + 1
                return item
        else:
            item['html'] = response.body
            item['url'] = response.url
            item['job'] = {}
            # Job
            item['job']['id'] = self.job.id
            item['job']['user_id'] = self.job.user_id
            item['job']['name'] = self.job.name
            item['job']['url'] = self.job.url
            item['job']['api'] = self.job.api
            item['job']['max_pages'] = self.job.max_pages
            item['job']['crawl_depth'] = self.job.crawl_depth
            item['job']['processing_patterns'] = self.job.processing_patterns
            item['job']['days'] = self.job.days
            item['job']['ajax'] = self.job.ajax
            item['job']['min_description'] = self.job.min_description
            self.counter = self.counter + 1
            self.itemCounter = self.itemCounter + 1
            return item

    def checkActive(self):
        self.counter = 0
        mysql = MysqlConnector.Mysql()
        db = mysql.getConnection()
        cur = db.cursor()
        cur.execute("SELECT status FROM job WHERE id=" + str(self.job.id))
        for row in cur.fetchall():
            status = int(row[0])
            break
        db.close()
        if status == 3:
            raise CloseSpider(reason='Job cancelled')
And here is my init:
from jobs import GetJob
import time
from twisted.internet import reactor
from twisted.internet import task
def schedule():
    jobs = GetJob.Job()
    jobs.getJobs()


if __name__ == "__main__":
    t = task.LoopingCall(schedule)
    t.start(15)
    reactor.run()
Upvotes: 1
Views: 442
Reputation: 18799
I think you can use signals to run some actions when the spider ends:
from scrapy import signals


class JSMiddleware(object):

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def __init__(self, crawler):
        ...
        crawler.signals.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        # actions when spider ends
        ...
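Applied to the PhantomJS middleware from the question, spider_closed is where you would call driver.quit() so the browser process is torn down when the spider finishes instead of lingering while the script waits for the next job. A minimal sketch, assuming the same driver setup as in the question (capabilities and the AJAX wait are omitted for brevity):

from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver


class JSMiddleware(object):

    def __init__(self, crawler):
        # Same PhantomJS setup as in the question (capabilities omitted here)
        self.driver = webdriver.PhantomJS()
        # Run spider_closed when this spider finishes
        crawler.signals.connect(self.spider_closed, signals.spider_closed)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        self.driver.get(request.url)
        return HtmlResponse(self.driver.current_url, body=self.driver.page_source,
                            encoding='utf-8', request=request)

    def spider_closed(self, spider):
        # quit() terminates the PhantomJS process instead of leaving it
        # running between jobs
        self.driver.quit()

Assuming a new crawler (and with it a new middleware instance) is created for each job, every PhantomJS process then lives only as long as its spider, so nothing accumulates between jobs.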
Upvotes: 2