Scrapy. First response requires Selenium

Question

I'm scraping a website that strongly depends on Javascript. The main page from which I need to extract the urls that will be parsed depends on Javascript, so I have to modify start_requests. I'm looking for a way to connect start_requests, with the linkextractor and with process_match

class MatchSpider(CrawlSpider):
    name = "match"
    allowed_domains = ["whoscored"]
    rules = (
    Rule(LinkExtractor(restrict_xpaths='//*[contains(@class, "match-report")]//@href'), callback='parse_item'),
)

    def start_requests(self):
        url = 'https://www.whoscored.com/Regions/252/Tournaments/2/Seasons/6335/Stages/13796/Fixtures/England-Premier-League-2016-2017'
        browser = Browser(browser='Chrome')
        browser.get(url)
        # should return a request with the html body from Selenium driver so that LinkExtractor rule can be applied


    def process_match(self, response):
        match_item = MatchItem()
        regex = re.compile("matchCentreData = \{.*?\};", re.S)
        match = re.search(regex, response.text).group()
        match = match.replace('matchCentreData =', '').replace(';', '')
        match_item['match'] = json.loads(match)
        match_item['url'] = response.url
        match_item['project'] = self.settings.get('BOT_NAME')
        match_item['spider'] = self.name
        match_item['server'] = socket.gethostname()
        match_item['date'] = datetime.datetime.now()
        yield match_item

A wrapper I'm using around Selenium:

class Browser:
    """
    selenium on steroids. allows you to create different types of browsers plus
    adds methods for safer calls
    """
    def __init__(self, browser='Firefox'):
        """
        type: silent or not
        browser: chrome of firefox
        """
        self.browser = browser
        self._start()

    def _start(self):
        '''
        starts browser
        '''
        if self.browser == 'Chrome':
            chrome_options = webdriver.ChromeOptions()
            prefs = {"profile.managed_default_content_settings.images": 2}
            chrome_options.add_extension('./libcommon/adblockpluschrome-1.10.0.1526.crx')
            chrome_options.add_experimental_option("prefs", prefs)
            chrome_options.add_argument("user-agent={0}".format(random.choice(USER_AGENTS)))
            self.driver_ = webdriver.Chrome(executable_path='./libcommon/chromedriver', chrome_options=chrome_options)
        elif self.browser == 'Firefox':
            profile = webdriver.FirefoxProfile()
            profile.set_preference("general.useragent.override", random.choice(USER_AGENTS))
            profile.add_extension('./libcommon/adblock_plus-2.7.1-sm+tb+an+fx.xpi')
            profile.set_preference('permissions.default.image', 2)
            profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
            profile.set_preference("webdriver.load.strategy", "unstable")
            self.driver_ = webdriver.Firefox(profile)
        elif self.browser == 'PhantomJS':
            self.driver_ = webdriver.PhantomJS()
            self.driver_.set_window_size(1120, 550)

    def close(self):
        self.driver_.close()

    def return_when(self, condition, locator):
        """
        returns browser execution when condition is met
        """
        for _ in range(5):
            with suppress(Exception):
                wait = WebDriverWait(self.driver_, timeout=100, poll_frequency=0.1)
                wait.until(condition(locator))
                self.driver_.execute_script("return window.stop")
                return True
        return False

    def __getattr__(self, name):
        """
        ruby-like method missing: derive methods not implemented to attribute that
        holds selenium browser
        """
        def _missing(*args, **kwargs):
            return getattr(self.driver_, name)(*args, **kwargs)
        return _missing

Scrapy. First response requires Selenium

Answers (1)

Related Questions