Max
Max

Reputation: 442

Using cheerio to scrape data from links extracted using cheerio

I am using cheerio and Node.js to get data from the Allegro website, to create endpoints in an API that returns CSV data; this data will be studied later on as part of a data science project:

https://allegro.pl/kategoria/samochody-osobowe-4029?bmatch=baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605

To get the cars' information, I managed to scrape the links from the first page. Each link sends you to an individual car's page, where the full information about that car is shown. I need to scrape more data from each link — how do I do that?

And how do I make the JSON data show up as CSV instead?

Here is the code used:

// Allegro listing page for the passenger-cars category ("samochody-osobowe").
// The `bmatch` query param is a ranking/experiment token copied from the browser URL.
const url =
  "https://allegro.pl/kategoria/samochody-osobowe-4029?bmatch=baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605";

// Commented-out CSV output: writes a header row, then (see getCars below)
// one row per scraped car. Kept for reference while the API returns JSON.
//const writeStream = fs.createWriteStream("allegro.csv");
// Write Headers
//writeStream.write(`Price,Link \n`);

/**
 * Scrape an Allegro listing page and extract basic data for each car offer.
 *
 * Note: the `_9c44d_*` class names are build-generated and may change when
 * Allegro redeploys; selectors will need updating if they do.
 *
 * @param {string} [pageUrl=url] - Listing page to scrape. Defaults to the
 *   module-level `url`, so existing `getCars()` callers are unaffected; the
 *   endpoint can now pass its own URL (previously the argument was ignored).
 * @returns {Promise<Array<{price: string, link: (string|undefined), year: string, make: string}>>}
 * @throws {Error} If the HTTP request does not return a 2xx status.
 */
async function getCars(pageUrl = url) {
  const response = await fetch(pageUrl);
  if (!response.ok) {
    throw new Error(`Request for ${pageUrl} failed with status ${response.status}`);
  }
  const body = await response.text();

  const $ = cheerio.load(body);
  const cars = [];
  $("._9c44d_2H7Kt").each(function (i, el) {
    const $el = $(el);
    cars.push({
      // Collapse runs of whitespace left over from the markup.
      price: $el.find("._9c44d_1zemI").text().replace(/\s\s+/g, ""),
      // `attr` is undefined when the card has no anchor.
      link: $el.find("a").attr("href"),
      // The year lives two <dd> siblings after the first one — fragile,
      // but mirrors the page's current definition-list layout.
      year: $el.find("dd").first().next().next().text(),
      make: $el.find("h2").text(),
    });
  });
  return cars;
}

The code used for the Node.js endpoint:

// GET /scraping/:allegro — scrape the listing and return the cars as JSON.
app.get("/scraping/:allegro", (req, res) => {
  // `req.params` (plural): the original `req.param` is not the route-params
  // object, so the :allegro segment was silently lost.
  scraper
    .getCars(req.params.allegro)
    .then((cars) => res.json(cars))
    .catch((err) => {
      // Without this, a scraping failure left the request hanging forever.
      res.status(500).json({ error: err.message });
    });
});

The data to get from each link is the following : date added,model,phone number, city,vin

Upvotes: 1

Views: 568

Answers (1)

Bertrand Martel
Bertrand Martel

Reputation: 45362

There is a convenient thing about these pages: you can get the data back as JSON instead of HTML just by requesting the application/json media type, i.e. by setting the Accept header.

For instance to get the list :

curl "https://allegro.pl/kategoria/samochody-osobowe-4029?bmatch=baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605&order=dd" \
     -H "Accept: application/json"

To get a specific item :

curl "https://allegro.pl/ogloszenie/mercedes-ml320-9341716141" -H "Accept: application/json"

So you don't need web-scraping tools, just JSON parsing. Pagination is done by adding a query param &p=PAGE_NUM, which is convenient too.

I've made a small example in Python that can easily be ported to JS. It requests the list of cars, then requests the first element:

import requests 
import json
import pandas as pd

# Allegro serves JSON instead of HTML when this Accept header is sent.
JSON_HEADERS = {"Accept": "application/json"}


def _offer_row(offer):
    # One flat record per offer: basic fields plus every listed parameter.
    row = {
        "name": offer["name"],
        "url": offer["url"],
        "price": offer["sellingMode"]["advertisement"]["price"]["amount"],
    }
    row.update((p["name"], p["values"][0]) for p in offer["parameters"])
    return row


# Request the car listing (first page) as JSON.
listing_resp = requests.get(
    "https://allegro.pl/kategoria/samochody-osobowe-4029",
    headers=JSON_HEADERS,
    params={
        "bmatch": "baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605",
        "order": "dd",
    },
)
promoted = listing_resp.json()["pagination bottom"]["collection"]["items"]["promoted"]
data = [_offer_row(offer) for offer in promoted]
df = pd.DataFrame(data)
print(df)

print("get data for first element")
# Fetch the full detail page for the first offer, again as JSON.
detail_resp = requests.get(data[0]["url"], headers=JSON_HEADERS)
offer = detail_resp.json()["summary"]["offer"]
item_data = {
    "phone": offer["contact"]["phones"][0]["number"],
    "delivery": offer["delivery"]["summary"][0]["value"]["text"],
    "startingAt": offer["publication"]["startingAt"],
    "endingAt": offer["publication"]["endingAt"],
}
item_data.update(
    (p["name"], p["values"][0]["valueLabel"])
    for p in offer["parametersGroups"]["groups"][0]["parameters"]
)

print(item_data)

An implementation in JavaScript using axios:

const axios = require("axios");

/**
 * Fetch the promoted car offers from the Allegro listing as JSON, print them,
 * then fetch and print the full details of the first offer.
 *
 * Allegro returns JSON (instead of HTML) only when the request carries an
 * `Accept: application/json` header, so every request sends it explicitly —
 * `responseType` alone does not set any request header.
 */
async function process() {
  const jsonConfig = {
    headers: { Accept: "application/json" },
    responseType: "json",
  };

  // `params` is the axios option for query-string values; the original used
  // `query`, which axios ignores, so the bmatch/order params were dropped.
  let response = await axios.get(
    "https://allegro.pl/kategoria/samochody-osobowe-4029",
    {
      ...jsonConfig,
      params: {
        bmatch: "baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605",
        order: "dd",
      },
    }
  );

  const promoted = response.data["pagination bottom"].collection.items.promoted;
  // Flatten each offer: basic fields plus every listed parameter.
  // (`const list` also fixes the original's implicit global.)
  const list = promoted.map((offer) => {
    const item = {
      name: offer.name,
      url: offer.url,
      price: offer.sellingMode.advertisement.price.amount,
    };
    for (const p of offer.parameters) {
      item[p.name] = p.values[0];
    }
    return item;
  });
  console.log(list);

  console.log("fetching : " + list[0].url);
  response = await axios.get(list[0].url, jsonConfig);

  const offer = response.data.summary.offer;
  const entry = {
    phone: offer.contact.phones[0].number,
    delivery: offer.delivery.summary[0].value.text,
    startingAt: offer.publication.startingAt,
    endingAt: offer.publication.endingAt,
  };
  for (const p of offer.parametersGroups.groups[0].parameters) {
    entry[p.name] = p.values[0].valueLabel;
  }
  console.log(entry);
}

// Don't leave the promise floating — report failures instead of an
// unhandled-rejection crash.
process().catch((err) => console.error(err.message));

Upvotes: 1

Related Questions