Reputation: 3
from scrapy import Spider
from scrapy.http import Request
class CourseSpider(Spider):
name = 'course'
allowed_domains = ['coursera.org']
start_urls = ['https://coursera.org/about/partners']
def parse(self, response):
listings = response.xpath('//div[@class="rc-PartnerBox vertical-box"]')
for listing in listings:
title = listing.xpath('.//div[@class="partner-box-wrapper card-one-clicker flex-1"]/p').extract_first()
relative_url = listing.xpath('.//a/@href').extract_first()
absolute_url = response.urljoin(relative_url)
yield Request(response.urljoin(relative_url), callback = self.parse_listing,meta={'title':title,'absolute_url':absolute_url})
def parse_listing(self,response):
titles = response.meta.get('title')
absolute_url = response.meta.get('absolute_url')
titles_course = response.xpath('//div[@class="name headline-1-text"]/text()').extract()
url_link = response.xpath('//div[@class="rc-Course"]/a/@href').extract()
abs_url = response.urljoin(url_link)
yield {'title':title,
'titles':title,
'absolute_url':absolute_url,
'titles_course':titles_course,
'abs_url':abs_url}
However, upon running the script through the cmd. I am getting errors. These errors suggest that I cannot mix str and non-str arguments and I am confused over how to deal with this problem. Any help would be appreciated.
Traceback (most recent call last):
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\defer.py", line 117, in iter_errback
yield next(it)
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
return next(self.data)
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
return next(self.data)
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 338, in <genexpr>
return (_set_referer(r) for r in result or ())
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "C:\Users\Naman Jogani\Desktop\Udemy\udemy\spiders\course.py", line 28, in parse_listing
yield {'title':title,
NameError: name 'title' is not defined
2020-08-05 00:08:48 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.coursera.org/checkpoint> (referer: https://www.coursera.org/about/partners)
Traceback (most recent call last):
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\defer.py", line 117, in iter_errback
yield next(it)
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
return next(self.data)
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
return next(self.data)
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 338, in <genexpr>
return (_set_referer(r) for r in result or ())
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "C:\Users\Naman Jogani\Desktop\Udemy\udemy\spiders\course.py", line 26, in parse_listing
abs_url = response.urljoin(url_link)
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\http\response\text.py", line 80, in urljoin
return urljoin(get_base_url(self), url)
File "c:\users\naman jogani\anaconda3\lib\urllib\parse.py", line 504, in urljoin
base, url, _coerce_result = _coerce_args(base, url)
File "c:\users\naman jogani\anaconda3\lib\urllib\parse.py", line 120, in _coerce_args
raise TypeError("Cannot mix str and non-str arguments")
TypeError: Cannot mix str and non-str arguments
2020-08-05 00:08:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.coursera.org/casewesternreserve> (referer: https://www.coursera.org/about/partners)
2020-08-05 00:08:48 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.coursera.org/casewesternreserve> (referer: https://www.coursera.org/about/partners)
Traceback (most recent call last):
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\defer.py", line 117, in iter_errback
yield next(it)
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
return next(self.data)
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
return next(self.data)
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 338, in <genexpr>
return (_set_referer(r) for r in result or ())
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "C:\Users\Naman Jogani\Desktop\Udemy\udemy\spiders\course.py", line 26, in parse_listing
abs_url = response.urljoin(url_link)
File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\http\response\text.py", line 80, in urljoin
return urljoin(get_base_url(self), url)
File "c:\users\naman jogani\anaconda3\lib\urllib\parse.py", line 504, in urljoin
base, url, _coerce_result = _coerce_args(base, url)
File "c:\users\naman jogani\anaconda3\lib\urllib\parse.py", line 120, in _coerce_args
raise TypeError("Cannot mix str and non-str arguments")
TypeError: Cannot mix str and non-str arguments
2020-08-05 00:08:48 [scrapy.core.engine] INFO: Closing spider (finished)
I tried adding the extract() function since it was mentioned on some previous stackoverflow question on the listings container to get rid of that error but then my xpath is not getting the desired output.
Upvotes: 0
Views: 1693
Reputation: 33223
You are looking for .extract_first()
or its new name .get()
, because .extract()
produces a list, which one cannot use in .urljoin
Upvotes: 1