Reputation: 11
I need your help, folks, to scrape a text element which is encrypted. Here is my spider:
import json
import scrapy
class YPSpider(scrapy.Spider):
    """Scrape title, address, village and phone from infobel.com listings."""

    name = 'yp'
    start_urls = ['https://www.infobel.com/fr/france/business/50000/informatique_internet/']

    def parse(self, response):
        """Handle both listing pages and phone-decrypt responses.

        A response whose ``meta`` carries ``has_phone`` is treated as the JSON
        answer of the decrypt endpoint: its ``result`` field is stored as the
        phone of the item carried in ``meta`` and the item is yielded.

        Any other response is treated as a listing page: one item is built
        per "customer-box" element. A numeric phone is yielded directly;
        otherwise, when a second phone text node exists, a follow-up request
        to the decrypt endpoint is yielded with the item attached.

        Args:
            response: The Scrapy response being parsed.

        Yields:
            Item dicts and follow-up ``scrapy.Request`` objects.
        """
        # Select the href attribute itself; following a serialized element
        # string would not produce a usable URL.
        next_page_url = response.xpath('//*[@rel="next"]/@href').extract_first()
        if next_page_url:
            yield response.follow(next_page_url, callback=self.parse)

        if response.meta.get('has_phone'):
            item = response.meta['item']
            payload = json.loads(response.body)
            item['phone'] = payload['result']
            yield item
            return

        for box in response.xpath('//*[contains(@class, "customer-box")]'):
            address_lines = box.xpath('.//span[contains(@class, "fa-map-marker")]/../span[@class="detail-text"]//text()').extract()
            # default='' keeps a listing without a title from raising on .strip()
            title = box.xpath('.//h2[@class="customer-item-name"]/a/text()').extract_first(default='').strip()
            address = address_lines[0].replace('\r', '').replace('\t', '').strip() if address_lines else ''
            # The second line exists only when there are at least two entries.
            village = address_lines[1].replace('\r', '').replace('\t', '').strip() if len(address_lines) > 1 else ''
            phone = box.xpath('.//span[contains(@class, "icon-phone")]/../span[@class="detail-text"]/text()').extract()
            if not phone:
                # Listings with no phone text produce no item.
                continue

            item = {
                'title': title,
                'address': address,
                'village': village,
                'phone': phone,
            }
            if phone[0].isnumeric():
                item['phone'] = phone[0]
                yield item
            elif len(phone) >= 2:
                # NOTE(review): the encrypted token is inserted unescaped; it
                # presumably contains '+' and '/' and may need URL-quoting --
                # confirm against the endpoint.
                yield scrapy.Request(
                    'https://www.infobel.com/fr/france/Search/Decrypt?encryptedString={}'.format(phone[1]),
                    meta={'item': item, 'has_phone': True},
                )
My problem is that the returned phone string is encoded, and I need your help to get the text. Thank you in advance!
Upvotes: 0
Views: 213
Reputation: 5461
import json
import scrapy
class YPSpider(scrapy.Spider):
    """Scrape title, address, village and phone from infobel.com listings.

    Phones that are not plain digits on the listing page are resolved through
    the site's ``Search/Decrypt`` endpoint with a follow-up request.
    """

    name = 'yp'
    start_urls = ['http://www.infobel.com/fr/france/business/50000/informatique_internet/']
    # Endpoint that turns an encrypted phone token into readable text.
    decrypt_url = 'https://www.infobel.com/fr/france/Search/Decrypt?encryptedString={}'

    @staticmethod
    def _clean(text):
        """Return *text* without carriage returns, tabs or surrounding whitespace."""
        return text.replace('\r', '').replace('\t', '').strip()

    def parse(self, response):
        """Handle both listing pages and phone-decrypt responses.

        A response whose ``meta`` carries ``has_phone`` is treated as the JSON
        answer of the decrypt endpoint: its ``result`` field is stored as the
        phone of the item carried in ``meta`` and the item is yielded.

        Any other response is treated as a listing page: the last "next"
        pagination link is followed and one item is built per "customer-box"
        element. A numeric phone is yielded directly; otherwise, when a
        second phone text node exists, a request to the decrypt endpoint is
        yielded with the item attached.

        Args:
            response: The Scrapy response being parsed.

        Yields:
            Item dicts and follow-up ``scrapy.Request`` objects.
        """
        from urllib.parse import quote

        # Decrypt answers are JSON, so handle them first and skip the
        # HTML-only pagination and listing logic entirely.
        if response.meta.get('has_phone'):
            item = response.meta['item']
            payload = json.loads(response.body)
            item['phone'] = payload['result']
            yield item
            return

        pages = response.xpath('//ul[@class="pagination"]//*[@rel="next"]/@href').extract()
        if pages:
            yield response.follow(pages[-1])

        for box in response.xpath('//*[contains(@class, "customer-box")]'):
            address_lines = box.xpath('.//span[contains(@class, "fa-map-marker")]/../span[@class="detail-text"]//text()').extract()
            # default='' keeps a listing without a title from raising on .strip()
            title = box.xpath('.//h2[@class="customer-item-name"]/a/text()').extract_first(default='').strip()
            address = self._clean(address_lines[0]) if address_lines else ''
            # The second line exists only when there are at least two entries.
            village = self._clean(address_lines[1]) if len(address_lines) > 1 else ''
            phone = box.xpath('.//span[contains(@class, "icon-phone")]/../span[@class="detail-text"]/text()').extract()
            if not phone:
                # Listings with no phone text produce no item.
                continue

            item = {
                'title': title,
                'address': address,
                'village': village,
                'phone': phone,
            }
            if phone[0].isnumeric():
                item['phone'] = phone[0]
                yield item
            elif len(phone) >= 2:
                # safe='' percent-encodes '/' and '+' as well, so the token
                # reaches the server unchanged inside the query string.
                token = quote(phone[1], safe='')
                yield scrapy.Request(
                    self.decrypt_url.format(token),
                    meta={'item': item, 'has_phone': True},
                )
Upvotes: 1
Reputation: 21436
It seems the website uses its own internal AJAX calls to decrypt phone number strings; you can see this if you look at your web browser's inspector:
You can replicate this request in scrapy:
from urllib.parse import quote
from scrapy import Request
def parse(self, response):
    """Request the site's decrypt endpoint for one encrypted phone token.

    The sample token is percent-encoded and placed in the
    ``encryptedString`` query parameter of the ``Search/Decrypt`` URL.

    Args:
        response: The Scrapy response that triggered this callback (unused).

    Yields:
        A single ``Request`` for the decrypt URL.
    """
    # safe='' makes quote() escape '/' as well; '+' is always escaped.
    code = quote('iHB/1oF0m7ELfO6Mfsl+mvm+o8SZZ37q', safe='')
    url = f"https://www.infobel.com/fr/france/Search/Decrypt?encryptedString={code}"
    # The token travels in the query string, so no request body is sent.
    yield Request(url)
Upvotes: 0