Writing data to. csv pandas

I need to write data to a csv file, I am currently engaged in parsing an online store, there is a different number of characteristics on each product, for example: weight, length, etc. I am trying to write data using pandas, but I can't write all the data correctly into the dictionary, tell me how to do it correctly

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import requests
import time
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd

URLS = ['https://www.ikea.com/ru/ru/p/pahl-pol-pismennyy-stol-belyy-s29278422/','https://www.ikea.com/ru/ru/p/micke-mikke-pismennyy-stol-belyy-20373923/']
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}

content = []
colum = []
driver = webdriver.Chrome(ChromeDriverManager().install())

def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r


def get_specifications_parameter():
    num = 0
    WebDriverWait(driver, 5).until(expected_conditions.visibility_of_element_located((By.XPATH, '//*[@id="content"]/div/div[1]/div/div[2]/div[2]/div[2]/div[2]/button')))

    driver.find_element_by_xpath('//*[@id="content"]/div/div[1]/div/div[2]/div[2]/div[2]/div[2]/button').click()
    time.sleep(3)
    table = driver.find_element_by_xpath('//*[@id="range-modal-mount-node"]/div/div[4]/div/div[2]/div/div/div').get_attribute('innerHTML')
    soup = BeautifulSoup(table,'html.parser')
    
    param = soup.find_all('dd',class_='range-revamp-product-dimensions__list-item-measure')
    titles = soup.find_all('dt',class_='range-revamp-product-dimensions__list-item-name')
    for item in titles:
        if item.text in colum:
            pass
        else:
            colum.append(item.text)
    for item in param:
        content.append({titles[num].text:item.text}) #Writing characteristics to content
        num+=1
    print(content)

    
def get_content(url):
    driver.get(url)
    get_specifications_parameter()

    # content.append[{
    #     'name':name,
    #     'price':price,
    #     'photo':photo,
    #     'description':description
    #     }]
    #  Additional data to be recorded 
    print(content)

def start():
    for URL in URLS:
        html = get_html(URL)
        if html.status_code == 200:
            get_content(URL)
        else:
            print('Network error')

def write():
    df = pd.DataFrame(colum)
    for p in content:
        df = pd.concat([df, pd.DataFrame(p,index=[0])],ignore_index=True)
    df.to_csv("output.csv", index=False)


start()
write()

At the output to the content, I get this

[
    {"Ширина:\xa0": "128 см"},
    {"Глубина:\xa0": "58 см"},
    {"Мин высота:\xa0": "59 см"},
    {"Макс высота:\xa0": "72 см"},
    {"Макс нагрузка:\xa0": "50 кг"},
]
[
    {"Ширина:\xa0": "128 см"},
    {"Глубина:\xa0": "58 см"},
    {"Мин высота:\xa0": "59 см"},
    {"Макс высота:\xa0": "72 см"},
    {"Макс нагрузка:\xa0": "50 кг"},
]
[
    {"Ширина:\xa0": "128 см"},
    {"Глубина:\xa0": "58 см"},
    {"Мин высота:\xa0": "59 см"},
    {"Макс высота:\xa0": "72 см"},
    {"Макс нагрузка:\xa0": "50 кг"},
    {"Ширина:\xa0": "73 см"},
    {"Глубина:\xa0": "50 см"},
    {"Высота:\xa0": "75 см"},
    {"Макс нагрузка:\xa0": "50 кг"},
]
[
    {"Ширина:\xa0": "128 см"},
    {"Глубина:\xa0": "58 см"},
    {"Мин высота:\xa0": "59 см"},
    {"Макс высота:\xa0": "72 см"},
    {"Макс нагрузка:\xa0": "50 кг"},
    {"Ширина:\xa0": "73 см"},
    {"Глубина:\xa0": "50 см"},
    {"Высота:\xa0": "75 см"},
    {"Макс нагрузка:\xa0": "50 кг"},
]

And I want to separate this

products = [
    {
        "артикул": 12345,
        "высота": 50,
        "материал": "дерево",
    },
    {
        "артикул": 12346,
        "ширина": 30,
        "вес": 1.5,
    },
    {
        "артикул": 12347,
        "длина": 14,
        "высота": 6.2,
        "материал": "пластик",
    },
]

To get such a file as a link https://drive.google.com/file/d/1uGoW1kpsDGDA-Zh7SiiCDcg9cf2lHQUd/view?usp=sharing

Upvotes: 0

Views: 240

Answers (1)

AKX
AKX

Reputation: 169378

You don't need Pandas to write CSV to a file.

For this case, you don't need Selenium either.

import csv
import sys
from typing import List

import requests
from bs4 import BeautifulSoup

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3"
}


def get_html(url, params=None) -> str:
    r = requests.get(url, headers=HEADERS, params=params)
    r.raise_for_status()
    return r.text


def get_specifications_parameter(url: str) -> dict:
    html = get_html(url)
    soup = BeautifulSoup(html, "html.parser")

    param = soup.find_all("dd", class_="range-revamp-product-dimensions__list-item-measure")
    titles = soup.find_all("dt", class_="range-revamp-product-dimensions__list-item-name")

    content = {}

    for title, value in zip(titles, param):
        content[title.text.strip("\xA0:")] = value.text
    return content


def scrape_urls(urls: List[str]) -> List[dict]:
    contents = []
    for url in urls:
        content = get_specifications_parameter(url)
        content["url"] = url
        print(content, file=sys.stderr)  # progress printing
        contents.append(content)
    return contents


def write_output(contents: List[dict]):
    # Figure out all keys in the content for CSV writer
    all_keys = set()
    for content in contents:
        all_keys |= set(content)
    # Write to standard output (could be a file too)
    w = csv.DictWriter(sys.stdout, all_keys)
    w.writeheader()
    for content in contents:
        w.writerow(content)


def main():
    urls = [
        "https://www.ikea.com/ru/ru/p/pahl-pol-pismennyy-stol-belyy-s29278422/",
        "https://www.ikea.com/ru/ru/p/micke-mikke-pismennyy-stol-belyy-20373923/",
    ]
    contents = scrape_urls(urls)
    write_output(contents)


if __name__ == "__main__":
    main()

outputs

{'Ширина': '128 см', 'Глубина': '58 см', 'Мин высота': '59 см', 'Макс высота': '72 см', 'Макс нагрузка': '50 кг', 'url': 'https://www.ikea.com/ru/ru/p/pahl-pol-pismennyy-stol-belyy-s29278422/'}
{'Ширина': '73 см', 'Глубина': '50 см', 'Высота': '75 см', 'Макс нагрузка': '50 кг', 'url': 'https://www.ikea.com/ru/ru/p/micke-mikke-pismennyy-stol-belyy-20373923/'}

as debug information, followed by

url,Макс нагрузка,Ширина,Высота,Мин высота,Макс высота,Глубина
https://www.ikea.com/ru/ru/p/pahl-pol-pismennyy-stol-belyy-s29278422/,50 кг,128 см,,59 см,72 см,58 см
https://www.ikea.com/ru/ru/p/micke-mikke-pismennyy-stol-belyy-20373923/,50 кг,73 см,75 см,,,50 см

Upvotes: 1

Related Questions