Reputation: 601
I am new with crawl and scrapy, I am trying to extract some news from https://www.lacuarta.com/, also just news that match the tag san-valentin.
The webpage shows just the titles with an image for each news story; if you want to read one you have to click on it, and it will take you to the page of the story (https://www.lacuarta.com/etiqueta/san-valentin/)
So, I am thinking my steps are:
I already have the points 1 and 2:
import scrapy
class SpiderTags(scrapy.Spider):
    """Collect article links for a given tag on lacuarta.com.

    Run with:  scrapy crawl SpiderTags -a tag=san-valentin
    """
    name = "SpiderTags"

    def start_requests(self):
        # Base URL already contains the 'etiqueta/' path segment.
        url = 'https://www.lacuarta.com/etiqueta/'
        tag = getattr(self, 'tag', None)
        if tag is not None:
            # BUG FIX: the original appended 'etiqueta/' a second time,
            # producing .../etiqueta/etiqueta/<tag>. Append only the tag.
            url = url + tag
        yield scrapy.Request(url, self.parse)

    def parse(self, response):
        # Yield one item per article link on the tag index page.
        # BUG FIX: key was the typo "link:" (trailing colon inside the key).
        for url in response.css("h4.normal a::attr(href)"):
            yield {"link": url.get()}
Up to here I have the links to the news, now I can't figure out how to enter in that news for extracting the data I want and then returning to my original webpage to go page number 2 and repeat everything
PD: the info I want already know how to get it
response.css("title::text").get()
response.css("div.col-md-11 p::text").getall()
response.css("div.col-sm-6 h4 a::text").getall()
response.css("div.col-sm-6 h4 small span::text").getall()
Upvotes: 1
Views: 1358
Reputation: 57
import scrapy
from scrapy.spiders import CrawlSpider
class SpiderName(scrapy.Spider):
    """Crawl the san-valentin tag pages and scrape each linked article.

    NOTE: the original subclassed CrawlSpider while overriding parse();
    Scrapy's docs warn that CrawlSpider uses parse() internally for its
    rule machinery, so overriding it breaks rule processing. No rules are
    defined here, so plain scrapy.Spider is the correct base class.
    """
    name = 'spidername'
    allowed_domains = ['lacuarta.com']
    start_urls = ['https://www.lacuarta.com/etiqueta/san-valentin/']

    def parse(self, response):
        """Follow every article on the listing page, then the next page."""
        for item in response.xpath('//article[@class="archive-article modulo-fila"]'):
            # maybe you need more data within `item`
            post_url = item.xpath('.//h4/a/@href').get()
            # response.follow resolves relative URLs against the response.
            yield response.follow(post_url, callback=self.post_parse)
        # Pagination: the <li> after the active one holds the next-page link.
        next_page = response.xpath('//li[@class="active"]/following-sibling::li/a/@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def post_parse(self, response):
        """Extract title, story body, author and date from one article page."""
        title = response.xpath('//h1/text()').get()
        story = response.xpath('//div[@id="ambideXtro"]/child::*').getall()
        author = response.xpath('//div[@class="col-sm-6 m-top-10"]/h4/a/text()').get()
        date = response.xpath('//span[@class="ltpicto-calendar"]').get()
        yield {'title': title, 'story': story, 'author': author, 'date': date}
Upvotes: 1
Reputation: 2536
You need to yield
a new Request
in order to follow the link. For example:
def parse(self, response):
    """Follow each news link so the item page can be scraped.

    Yielding a plain dict with the href only records the URL; yielding a
    request is what makes Scrapy visit the page. response.follow is used
    instead of scrapy.Request because it also resolves relative hrefs.
    """
    for url in response.css("h4.normal a::attr(href)"):
        yield response.follow(url.get(), callback=self.parse_news_item)
def parse_news_item(self, response):
    """Scrape the fields of a single news-item page and yield them."""
    title = response.css("title::text").get()
    story = response.css("div.col-md-11 p::text").getall()
    author = response.css("div.col-sm-6 h4 a::text").getall()
    date = response.css("div.col-sm-6 h4 small span::text").getall()
    yield {'Title': title, 'Story': story, 'Author': author, 'Date': date}
Upvotes: 3