Reputation: 63
I have already read several articles about my problem, but it is still not working. Basically, I am using Scrapy and Selenium to crawl websites. The URLs to crawl are currently saved in a text file. This text file consists of a single column, with one URL per row.
I am still getting an error message: selenium.common.exceptions.InvalidArgumentException: Message: invalid argument: 'url' must be a string
This is my current code:
class AlltipsSpider(Spider):
    """Spider for blogabet.com that drives a Selenium Chrome browser.

    NOTE: this is the failing version from the question; it stops with
    ``InvalidArgumentException: 'url' must be a string``.
    """

    name = 'alltips'
    allowed_domains = ['blogabet.com']

    def start_requests(self):
        """Read the URLs from ``urls.txt`` and open them in Chrome."""
        # ``f`` is never read: the comprehension below opens urls.txt a
        # second time on its own.
        with open ("urls.txt", "rt") as f:
            start_urls = [l.strip() for l in open('urls.txt').readlines()]

        self.driver = webdriver.Chrome('C:\webdrivers\chromedriver.exe')

        # start_urls is a list of strings, but the reported error says the
        # 'url' argument must be a single string -- this call is the one
        # that raises the exception.
        self.driver.get(start_urls)
        self.driver.find_element_by_id('currentTab').click()
[UPDATED]
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Spider
from selenium import webdriver
from scrapy.selector import Selector
from scrapy.http import Request
from time import sleep
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import re
import csv
class AlltipsSpider(Spider):
    """Crawl blogabet.com tip feeds with a Selenium-driven Chrome browser.

    The pages to visit are read from ``urls.txt`` (one URL per line). Each
    page is opened in Chrome, two page elements are clicked, and the
    rendered HTML is handed to :meth:`crawltips` for extraction.
    """

    name = 'alltips'
    allowed_domains = ['blogabet.com']

    def start_requests(self):
        """Open every URL from ``urls.txt`` in Chrome and yield a request.

        Yields:
            Request: one request per URL, addressed to the URL the browser
            ended up on and carrying the rendered page source in
            ``meta['page_source']``.
        """
        with open("urls.txt", "rt") as f:
            # Drop blank lines so an empty string is never handed to
            # driver.get(), which only accepts a real URL string.
            start_urls = [line.strip() for line in f if line.strip()]

        # Start exactly one browser for the whole crawl. The raw string
        # keeps the backslashes of the Windows path literal.
        self.driver = webdriver.Chrome(r'C:\webdrivers\chromedriver.exe')

        for url in start_urls:
            self.driver.get(url)
            self.driver.find_element_by_id('currentTab').click()
            # Log before waiting, and report the delay actually used.
            self.logger.info('Sleeping for 3 sec.')
            sleep(3)
            self.driver.find_element_by_xpath(
                '//*[@id="_blog-menu"]/div[2]/div/div[2]/a[3]').click()
            self.logger.info('Sleeping for 7 sec.')
            sleep(7)
            # Snapshot the rendered HTML now: the callback runs later, when
            # the shared browser may already be showing another URL.
            yield Request(
                self.driver.current_url,
                callback=self.crawltips,
                meta={'page_source': self.driver.page_source},
            )

    def crawltips(self, response):
        """Extract the username and publish date of every post on the page.

        Args:
            response: the Scrapy response for the request yielded by
                :meth:`start_requests`.

        Yields:
            dict: ``'Username'`` and ``'Publish date'``, each a list of the
            strings matched for one post.
        """
        # Prefer the HTML captured when the request was created; fall back
        # to the live browser for requests that carry no snapshot.
        page_source = response.meta.get('page_source')
        if page_source is None:
            page_source = self.driver.page_source

        sel = Selector(text=page_source)
        allposts = sel.xpath('//*[@class="block media _feedPick feed-pick"]')
        for post in allposts:
            username = post.xpath(
                './/div[@class="col-sm-7 col-lg-6 no-padding"]/a/@title'
            ).extract()
            publish_date = post.xpath(
                './/*[@class="bet-age text-muted"]/text()'
            ).extract()
            yield {
                'Username': username,
                'Publish date': publish_date,
            }

    def closed(self, reason):
        """Quit the browser when Scrapy closes the spider.

        Args:
            reason: the close reason passed in by Scrapy (unused).
        """
        # The driver only exists once start_requests() has run.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()
Upvotes: 0
Views: 181
Reputation: 50819
`start_urls` is a list, not a `str`, so it cannot be passed to `driver.get()` directly. You need to iterate over it. You also don't need to open the file twice:
def start_requests(self):
    """Open each URL listed in ``urls.txt`` in a Chrome browser.

    ``driver.get()`` takes a single URL string, so the list read from the
    file is visited one entry at a time.
    """
    with open("urls.txt", "rt") as f:
        # Iterating the file yields its lines directly. Blank lines are
        # dropped so an empty string is never handed to driver.get().
        start_urls = [line.strip() for line in f if line.strip()]

    # Raw string so the backslashes in the Windows path stay literal.
    self.driver = webdriver.Chrome(r'C:\webdrivers\chromedriver.exe')

    for url in start_urls:
        self.driver.get(url)
        self.driver.find_element_by_id('currentTab').click()
Upvotes: 2