Reputation: 57
I'm trying to scrape some ASINs (let's say 600 ASINs) from the Amazon website (just the ASINs) with Selenium and BeautifulSoup. My main issue is how to save all the scraped data into a CSV file. I've tried something, but it only saves the last scraped page.
Here is the code:
from time import sleep
import requests
import time
import json
import re
import sys
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
import urllib.request
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
import pandas as pd
from urllib.request import urlopen

# collected ASIN lists, one entry per page
data_record = []
i = 1
while True:
    try:
        if i == 1:
            url = "https://www.amazon.es/s?k=doll&i=toys&rh=n%3A599385031&dc&page=1"
        else:
            url = "https://www.amazon.es/s?k=doll&i=toys&rh=n%3A599385031&dc&page={}".format(i)
        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        # print page url
        print(url)
        # rest of the scraping code
        driver = webdriver.Chrome()
        driver.get(url)
        HTML = driver.page_source
        HTML1 = driver.page_source
        soup = BeautifulSoup(HTML1, "html.parser")
        styles = soup.find_all(name="div", attrs={"data-asin": True})
        res1 = [i.attrs["data-asin"] for i in soup.find_all("div") if i.has_attr("data-asin")]
        print(res1)
        data_record.append(res1)
        # driver.close()
        # don't overflow website
        sleep(1)
        # increase page number
        i += 1
        if i == 3:
            print("STOP!!!")
            break
    except:
        break
Upvotes: 1
Views: 2308
Reputation: 1253
Removing the items that do not seem to be used at the moment, a possible solution could be:
import csv
import bs4
import requests
from selenium import webdriver
from time import sleep


def retrieve_asin_from(base_url, idx):
    url = base_url.format(idx)
    r = requests.get(url)
    soup = bs4.BeautifulSoup(r.content, 'html.parser')

    with webdriver.Chrome() as driver:
        driver.get(url)
        HTML1 = driver.page_source
        soup = bs4.BeautifulSoup(HTML1, "html.parser")
        res1 = [i.attrs["data-asin"]
                for i in soup.find_all("div") if i.has_attr("data-asin")]
    sleep(1)
    return res1


url = "https://www.amazon.es/s?k=doll&i=toys&rh=n%3A599385031&dc&page={}"

data_record = [retrieve_asin_from(url, i) for i in range(1, 4)]
combined_data_record = combine_records(data_record)  # fcn to write

with open('asin_data.csv', 'w', newline='') as fd:
    csvfile = csv.writer(fd)
    csvfile.writerows(combined_data_record)
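The combine_records call above is deliberately left as a function to write. A minimal sketch, assuming the desired output is a single-column CSV with one ASIN per row (the empty-string filter is an extra assumption, since some result divs carry an empty data-asin attribute):

def combine_records(data_record):
    # data_record is a list of per-page ASIN lists, e.g.
    # [["B01AAA", "B02BBB"], ["B03CCC"]] (example values only).
    # csv.writerows expects an iterable of rows, so wrap each ASIN
    # in a one-element list to produce a single-column CSV.
    return [[asin] for page in data_record for asin in page if asin]

With the example input above this returns [["B01AAA"], ["B02BBB"], ["B03CCC"]], which writerows then emits as three one-column rows.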
Upvotes: 1