Reputation: 35
I need to write data to a CSV file. I am currently scraping an online store, and each product has a different number of characteristics (for example: weight, length, etc.). I am trying to write the data using pandas, but I can't collect all the data correctly into a dictionary. Please tell me how to do this correctly.
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import requests
import time
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd
# Product pages to scrape (two IKEA RU desk listings).
URLS = ['https://www.ikea.com/ru/ru/p/pahl-pol-pismennyy-stol-belyy-s29278422/','https://www.ikea.com/ru/ru/p/micke-mikke-pismennyy-stol-belyy-20373923/']
# Desktop-browser User-Agent so the server returns the regular HTML page.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}
content = []  # accumulated scraped characteristics (appended to by get_specifications_parameter)
colum = []  # union of characteristic names seen so far; used as the CSV columns
driver = webdriver.Chrome(ChromeDriverManager().install())  # shared browser session for all URLs
def get_html(url, params=None):
    """Fetch *url* with the shared scraper headers and return the raw Response."""
    return requests.get(url, headers=HEADERS, params=params)
def get_specifications_parameter():
    """Expand the dimensions modal of the currently loaded product page and
    record ALL of its characteristics as ONE dict appended to ``content``.

    Side effects: extends the global ``colum`` with any characteristic names
    not seen before, and appends one ``{name: value}`` dict (whole product)
    to the global ``content`` list.
    """
    expand_button_xpath = '//*[@id="content"]/div/div[1]/div/div[2]/div[2]/div[2]/div[2]/button'
    WebDriverWait(driver, 5).until(
        expected_conditions.visibility_of_element_located((By.XPATH, expand_button_xpath))
    )
    # find_element(By.XPATH, ...) instead of the deprecated find_element_by_xpath.
    driver.find_element(By.XPATH, expand_button_xpath).click()
    time.sleep(3)  # give the modal time to render its measurement list
    table = driver.find_element(
        By.XPATH, '//*[@id="range-modal-mount-node"]/div/div[4]/div/div[2]/div/div/div'
    ).get_attribute('innerHTML')
    soup = BeautifulSoup(table, 'html.parser')
    values = soup.find_all('dd', class_='range-revamp-product-dimensions__list-item-measure')
    titles = soup.find_all('dt', class_='range-revamp-product-dimensions__list-item-name')
    # Bug fix: build ONE dict for the whole product instead of appending a
    # separate single-key dict per characteristic (which made rows impossible
    # to group back into products).
    product = {}
    for title, value in zip(titles, values):
        name = title.text.strip('\xa0:')  # drop the trailing ":" and non-breaking space
        if name not in colum:
            colum.append(name)
        product[name] = value.text
    content.append(product)
    print(content)
def get_content(url):
    """Load *url* in the shared browser and collect its characteristics."""
    driver.get(url)
    get_specifications_parameter()
    # TODO: also record the basic product fields, e.g.
    # content.append([{'name': name, 'price': price,
    #                  'photo': photo, 'description': description}])
    print(content)
def start():
    """Probe each URL over plain HTTP first, then scrape the reachable ones."""
    for url in URLS:
        response = get_html(url)
        if response.status_code != 200:
            print('Network error')
        else:
            get_content(url)
def write(rows=None, columns=None):
    """Write the scraped characteristics to ``output.csv``, one row per entry.

    Bug fix: the original seeded the frame with ``pd.DataFrame(colum)``, which
    inserted the column *names* as data rows in an unnamed extra column.
    Building the frame directly from the list of dicts aligns every dict key
    with its column and leaves missing characteristics empty.

    Parameters (both optional, for backward compatibility with ``write()``):
        rows: list of dicts to write; defaults to the global ``content``.
        columns: column order for the CSV; defaults to the global ``colum``.
    """
    rows = content if rows is None else rows
    columns = colum if columns is None else columns
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv("output.csv", index=False)
# Entry point: scrape every URL, then dump everything to output.csv.
start()
write()
In `content`, the output I get looks like this:
[
{"Ширина:\xa0": "128 см"},
{"Глубина:\xa0": "58 см"},
{"Мин высота:\xa0": "59 см"},
{"Макс высота:\xa0": "72 см"},
{"Макс нагрузка:\xa0": "50 кг"},
]
[
{"Ширина:\xa0": "128 см"},
{"Глубина:\xa0": "58 см"},
{"Мин высота:\xa0": "59 см"},
{"Макс высота:\xa0": "72 см"},
{"Макс нагрузка:\xa0": "50 кг"},
]
[
{"Ширина:\xa0": "128 см"},
{"Глубина:\xa0": "58 см"},
{"Мин высота:\xa0": "59 см"},
{"Макс высота:\xa0": "72 см"},
{"Макс нагрузка:\xa0": "50 кг"},
{"Ширина:\xa0": "73 см"},
{"Глубина:\xa0": "50 см"},
{"Высота:\xa0": "75 см"},
{"Макс нагрузка:\xa0": "50 кг"},
]
[
{"Ширина:\xa0": "128 см"},
{"Глубина:\xa0": "58 см"},
{"Мин высота:\xa0": "59 см"},
{"Макс высота:\xa0": "72 см"},
{"Макс нагрузка:\xa0": "50 кг"},
{"Ширина:\xa0": "73 см"},
{"Глубина:\xa0": "50 см"},
{"Высота:\xa0": "75 см"},
{"Макс нагрузка:\xa0": "50 кг"},
]
but I want the characteristics separated per product, like this:
products = [
{
"артикул": 12345,
"высота": 50,
"материал": "дерево",
},
{
"артикул": 12346,
"ширина": 30,
"вес": 1.5,
},
{
"артикул": 12347,
"длина": 14,
"высота": 6.2,
"материал": "пластик",
},
]
To get such a file as a link https://drive.google.com/file/d/1uGoW1kpsDGDA-Zh7SiiCDcg9cf2lHQUd/view?usp=sharing
Upvotes: 0
Views: 240
Reputation: 169378
You don't need Pandas to write CSV to a file.
For this case, you don't need Selenium either.
import csv
import sys
from typing import List
import requests
from bs4 import BeautifulSoup
# Desktop-browser User-Agent so the site serves the regular HTML markup.
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3"
}
def get_html(url, params=None) -> str:
    """Download *url* and return the response body, raising on HTTP errors."""
    response = requests.get(url, headers=HEADERS, params=params)
    response.raise_for_status()
    return response.text
def get_specifications_parameter(url: str) -> dict:
    """Scrape one product page and return a {characteristic name: value} dict."""
    soup = BeautifulSoup(get_html(url), "html.parser")
    values = soup.find_all("dd", class_="range-revamp-product-dimensions__list-item-measure")
    names = soup.find_all("dt", class_="range-revamp-product-dimensions__list-item-name")
    # Strip the trailing ":" and non-breaking space from each <dt> label.
    return {
        name.text.strip("\xA0:"): value.text
        for name, value in zip(names, values)
    }
def scrape_urls(urls: List[str]) -> List[dict]:
    """Scrape every URL and return one record per product, tagged with its URL."""
    records = []
    for url in urls:
        record = get_specifications_parameter(url)
        record["url"] = url
        print(record, file=sys.stderr)  # progress printing
        records.append(record)
    return records
def write_output(contents: List[dict]):
    """Emit *contents* as CSV on standard output.

    The header is the union of every record's keys, so products with
    differing characteristics still land in one consistent table.
    """
    fieldnames = set()
    for record in contents:
        fieldnames.update(record)
    # Write to standard output (could be a file too)
    writer = csv.DictWriter(sys.stdout, fieldnames)
    writer.writeheader()
    writer.writerows(contents)
def main():
    """Scrape both IKEA product pages and print the merged CSV to stdout."""
    product_urls = [
        "https://www.ikea.com/ru/ru/p/pahl-pol-pismennyy-stol-belyy-s29278422/",
        "https://www.ikea.com/ru/ru/p/micke-mikke-pismennyy-stol-belyy-20373923/",
    ]
    write_output(scrape_urls(product_urls))


if __name__ == "__main__":
    main()
outputs
{'Ширина': '128 см', 'Глубина': '58 см', 'Мин высота': '59 см', 'Макс высота': '72 см', 'Макс нагрузка': '50 кг', 'url': 'https://www.ikea.com/ru/ru/p/pahl-pol-pismennyy-stol-belyy-s29278422/'}
{'Ширина': '73 см', 'Глубина': '50 см', 'Высота': '75 см', 'Макс нагрузка': '50 кг', 'url': 'https://www.ikea.com/ru/ru/p/micke-mikke-pismennyy-stol-belyy-20373923/'}
as debug information, followed by
url,Макс нагрузка,Ширина,Высота,Мин высота,Макс высота,Глубина
https://www.ikea.com/ru/ru/p/pahl-pol-pismennyy-stol-belyy-s29278422/,50 кг,128 см,,59 см,72 см,58 см
https://www.ikea.com/ru/ru/p/micke-mikke-pismennyy-stol-belyy-20373923/,50 кг,73 см,75 см,,,50 см
Upvotes: 1