Reputation: 45
['devicecount']
and ['released']
using the DeepL API? Here is my code:
import scrapy
from gsm.items import GsmItem
class GsmSpider(scrapy.Spider):
    """Three-level crawl of gsmarena.com: makers page -> brand listing -> phone page.

    Each level forwards the partially filled item to the next callback through
    ``Request.meta['item']``; the last level completes the item and yields it.
    """

    name = 'gsm'
    allowed_domains = ['gsmarena.com']
    start_urls = ['https://gsmarena.com/makers.php3']

    # LEVEL 1
    def parse(self, response):
        """Yield one request per brand cell, carrying the brand's name and device count."""
        # tr[1]//td[1] restricts the selection to the first brand cell only.
        gsms = response.xpath('//div[@class="st-text"]/table//tr[1]//td[1]')
        for gsm in gsms:
            allbranddevicesurl = gsm.xpath('.//a/@href').get()
            if allbranddevicesurl is None:
                # A cell without a link cannot be followed.
                continue
            # Build a fresh item per brand: a single item shared by all brands
            # would be overwritten before the later callbacks run.
            item = GsmItem()
            item['brandname'] = gsm.xpath('.//a/text()').get()
            item['devicecount'] = gsm.xpath('.//span/text()').get()
            yield response.follow(
                allbranddevicesurl,
                callback=self.parse_allbranddevicesurl,
                meta={'item': item},
            )

    # LEVEL 2
    def parse_allbranddevicesurl(self, response):
        """Yield a request for every phone on the listing page, then for the next page."""
        item = response.meta['item']
        for phone in response.xpath('//*[@id="review-body"]//li'):
            detailpageurl = phone.xpath('.//a/@href').get()
            if detailpageurl is None:
                continue
            yield response.follow(
                detailpageurl,
                callback=self.parse_detailpage,
                meta={'item': item},
            )
        # Pagination is requested once per listing page, after all phones.
        next_page = response.xpath('//a[@class="pages-next"]/@href').get()
        if next_page is not None:
            yield response.follow(
                next_page,
                callback=self.parse_allbranddevicesurl,
                meta={'item': item},
            )

    # LEVEL 3
    def parse_detailpage(self, response):
        """Yield one completed item per ``article-info`` block on the phone page."""
        brand_item = response.meta['item']
        for detail in response.xpath('//div[@class="article-info"]'):
            # Copy before writing: every phone of a brand receives the same
            # brand-level item, so mutating it in place would mix up phones.
            # NOTE(review): assumes GsmItem subclasses scrapy.Item (provides copy()).
            item = brand_item.copy()
            item['phonename'] = detail.xpath('.//h1/text()').get()
            item['released'] = detail.xpath('.//ul/li[1]/span[1]/span/text()').get()
            yield item
I would appreciate an example of how to access the values stored in the items and pass them to a translation function.
Upvotes: 0
Views: 495
Reputation: 1121
Based on our exchange in the comments, it seems like the following function would satisfy your needs:
import deepl, scrapy
from typing import *
from gsm.items import GsmItem
# Placeholder: replace with your own DeepL API authentication key.
AUTH_KEY = "<YOUR_AUTH_KEY>"
class GsmSpider(scrapy.Spider):
    """Three-level crawl of gsmarena.com that translates selected item fields with DeepL.

    Each level forwards the partially filled item to the next callback through
    ``Request.meta['item']``; the last level completes the item and yields it.
    """

    name = 'gsm'
    allowed_domains = ['gsmarena.com']
    start_urls = ['https://gsmarena.com/makers.php3']

    @staticmethod
    def translate_vals(data: Dict[str, str], keys: List[str], lang: str) -> Dict[str, str]:
        """Return translations of the string values of ``data`` whose key is in ``keys``.

        Args:
            data: Mapping of field name to value (a plain dict or a Scrapy item).
            keys: Names of the fields to translate.
            lang: DeepL target language code, e.g. ``'fr'``.

        Returns:
            A new dict holding only the translated fields; ``data`` is not modified.
            Fields that are missing, not strings, or empty are left out.
        """
        translator = deepl.Translator(AUTH_KEY)
        return {
            # translate_text returns a result object; .text is the translated string.
            k: translator.translate_text(v, target_lang=lang).text
            for k, v in data.items()
            # Empty strings are skipped: there is nothing to translate.
            if k in keys and isinstance(v, str) and v
        }

    # LEVEL 1
    def parse(self, response):
        """Yield one request per brand cell, carrying the translated brand fields."""
        gsms = response.xpath('//div[@class="st-text"]/table//tr[1]//td[1]')  # one brand --> adjust tr[1] & td[1]
        # gsms = response.xpath('//div[@class="st-text"]/table//td')  # all brands
        for gsm in gsms:
            allbranddevicesurl = gsm.xpath('.//a/@href').get()
            if allbranddevicesurl is None:
                # A cell without a link cannot be followed.
                continue
            # Build a fresh item per brand: a single item shared by all brands
            # would be overwritten before the later callbacks run.
            item = GsmItem()
            item['brandname'] = gsm.xpath('.//a/text()').get()
            item['devicecount'] = gsm.xpath('.//span/text()').get()
            # Write the translations back; the helper returns a new dict.
            item.update(self.translate_vals(item, ['brandname', 'devicecount'], 'fr'))
            yield response.follow(
                allbranddevicesurl,
                callback=self.parse_allbranddevicesurl,
                meta={'item': item},
            )

    # LEVEL 2
    def parse_allbranddevicesurl(self, response):
        """Yield a request for every phone on the listing page, then for the next page."""
        item = response.meta['item']
        for phone in response.xpath('//*[@id="review-body"]//li'):
            detailpageurl = phone.xpath('.//a/@href').get()
            if detailpageurl is None:
                continue
            yield response.follow(
                detailpageurl,
                callback=self.parse_detailpage,
                meta={'item': item},
            )
        # Pagination is requested once per listing page, after all phones.
        next_page = response.xpath('//a[@class="pages-next"]/@href').get()
        if next_page is not None:
            yield response.follow(
                next_page,
                callback=self.parse_allbranddevicesurl,
                meta={'item': item},
            )

    # LEVEL 3
    def parse_detailpage(self, response):
        """Yield one completed item per ``article-info`` block on the phone page."""
        brand_item = response.meta['item']
        for detail in response.xpath('//div[@class="article-info"]'):
            # Copy before writing: every phone of a brand receives the same
            # brand-level item, so mutating it in place would mix up phones.
            # NOTE(review): assumes GsmItem subclasses scrapy.Item (provides copy()).
            item = brand_item.copy()
            item['phonename'] = detail.xpath('.//h1/text()').get()
            item['released'] = detail.xpath('.//ul/li[1]/span[1]/span/text()').get()
            yield item
Then you'd just call it like translate_vals(Request.meta, ['devicecount', 'released'], 'fr')
(for example). You can sign up for a free DeepL AUTH_KEY
here.
Upvotes: 1