Reputation: 124
I created Json file in python to store scraped data using scrapy , but json file is empty although python scrapy spider scraped all data.i'm trying to store all scraped data into json file.in terminal on crawl command spider display all data but it did not import into json file. i can't find any solution i'm sharing both files spider and also items.py
i use this command scrapy crawl scraper -o products.json
Spider.py
import scrapy
from bs4 import BeautifulSoup as Soup
from ..items import ScrapyArbiItem
import requests
from idna import unicode
class Scraper(scrapy.Spider):
name = "scraper"
start_urls = [
'https://www.fenom.com/en/263-men',
# 'https://www.fenom.com/en/263-men#/page-2',
# 'https://www.fenom.com/en/263-men#/page-3',
# 'https://www.fenom.com/en/263-men#/page-4',
# 'https://www.fenom.com/en/263-men#/page-5',
# 'https://www.fenom.com/en/263-men#/page-6',
# 'https://www.fenom.com/en/263-men#/page-7',
]
def parse(self, response):
items = ScrapyArbiItem()
page_soup = Soup(response.text, 'html.parser')
uls = page_soup.find_all("ul", class_="product_list grid row")[0]
# import pdb;
# pdb.set_trace()
for li in uls.find_all("li", class_="ajax_block_product block_home col-xs-6 col-sm-4
col-md-3"):
data_to_write = []
try:
# print("gnbfrgjrnbgfjnbruigbnruig")
div = li.find('div', class_='product-container')
left_block = div.find('div', class_="left-block")
image_container = left_block.find('div', class_="product-image-container")
image = image_container.find('a')
image_url_a = image_container.find('a', class_="product_img_link")
image_url = image_url_a.find('img', class_='replace-2x img-responsive')
image_url = image_url.get('src') # image_url
url = image.get('href') # url of product
right_block = div.find('div', class_="right-block")
right_a = right_block.find('a')
product = right_a.find('span', class_="product-name")
product_name = product.text # product_name
pp = right_a.find('span', class_="content_price")
product_p = pp.find('span', class_="product-price")
product_price = product_p.text # product_price
items ['product_name'] = product_name
items['product_price'] = product_price
items['url'] = url
print(items)
#print(product_name)
#print(product_price)
#print(url)
#print(image_url)
next_page = url
# import pdb;pdb.set_trace()
# print(url)
# if url:
# yield scrapy.Request(url, callback=self.parsetwo, dont_filter=True)
except:
pass
items.py
in this file is to arrange all extracted data into temporary containers
import scrapy
class ScrapyArbiItem(scrapy.Item):
# define the fields for your item here like:
product_name = scrapy.Field()
product_price = scrapy.Field()
url = scrapy.Field()
Upvotes: 1
Views: 452
Reputation: 124
i use yield(items) instead of print(items) and it resolve issue.
`import scrapy
from bs4 import BeautifulSoup as Soup
from ..items import ScrapyArbiItem
import requests
from idna import unicode
class Scraper(scrapy.Spider):
name = "scraper"
page_number = 2 #for paginatiom
start_urls = [
'https://www.fenom.com/en/263-men#/page-1', #firstpage
]
def parse(self, response):
items = ScrapyArbiItem() #for items container-storing extracted data
page_soup = Soup(response.text, 'html.parser')
uls = page_soup.find_all("ul", class_="product_list grid row")[0]
for li in uls.find_all("li", class_="ajax_block_product block_home col-xs-6 col-sm-4 col-md-3"):
try:
# print("gnbfrgjrnbgfjnbruigbnruig")
div = li.find('div', class_='product-container')
left_block = div.find('div', class_="left-block")
image_container = left_block.find('div', class_="product-image-container")
image = image_container.find('a')
image_url_a = image_container.find('a', class_="product_img_link")
image_url = image_url_a.find('img', class_='replace-2x img-responsive')
image_url = image_url.get('src') # image_url
url = image.get('href') # url of product
right_block = div.find('div', class_="right-block")
right_a = right_block.find('a')
product = right_a.find('span', class_="product-name")
product_name = product.text # product_name
pp = right_a.find('span', class_="content_price")
product_p = pp.find('span', class_="product-price")
product_price = product_p.text # product_price
items ['product_name'] = product_name
items['product_price'] = product_price
items['url'] = url
yield (items)
#print(product_name)
#print(product_price)
#print(url)
#print(image_url)
except:
pass`
Upvotes: 2
Reputation: 91
Looks like all you need to do now is return the items
object and you're good to go.
Upvotes: 1