Reputation: 442
I am using cheerio and Node.js to get data from the Allegro website, in order to build API endpoints that return CSV data; this data will later be analyzed as part of a data science project.
To get the car information, I managed to scrape the links from the first page. Each link leads to a car listing with the full details of that car. I need to scrape more data from each of those links; how do I do that?
And how do I make the endpoint return the data as CSV instead of JSON?
Here is the code used:
const url =
  "https://allegro.pl/kategoria/samochody-osobowe-4029?bmatch=baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605";

// const writeStream = fs.createWriteStream("allegro.csv");
// Write headers
// writeStream.write(`Price,Link \n`);

function getCars() {
  return fetch(url)
    .then((response) => response.text())
    .then((body) => {
      const cars = [];
      const $ = cheerio.load(body);
      $("._9c44d_2H7Kt").each(function (i, el) {
        const $price = $(el).find("._9c44d_1zemI");
        const $link = $(el).find("a");
        const $year = $(el).find("dd");
        const $make = $(el).find("h2");
        const car = {
          price: $price.text().replace(/\s\s+/g, ""),
          link: $link.attr("href"),
          year: $year.first().next().next().text(),
          make: $make.text(),
        };
        cars.push(car);
      });
      // Write row to CSV
      // writeStream.write(`${price},${link} \n`);
      return cars;
    });
}
The code used for the Node.js endpoint:
app.get("/scraping/:allegro", (req, res) => {
  // Note: the route parameter lives on req.params (not req.param)
  scraper.getCars(req.params.allegro).then((cars) => {
    // console.log(cars);
    res.json(cars);
  });
});
The data to get from each link is the following: date added, model, phone number, city, VIN.
Upvotes: 1
Views: 568
Reputation: 45362
A convenient thing about these pages is that you can get the data back as JSON instead of HTML just by asking for the application/json media type, i.e. by setting the Accept header.
For instance, to get the list:
curl "https://allegro.pl/kategoria/samochody-osobowe-4029?bmatch=baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605&order=dd" \
-H "Accept: application/json"
To get a specific item:
curl "https://allegro.pl/ogloszenie/mercedes-ml320-9341716141" -H "Accept: application/json"
So you don't need web-scraping tools at all, just JSON parsing. Pagination is done by adding the query param &p=PAGE_NUM, which is convenient too.
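Here is a minimal sketch of those two tricks in Node.js (this assumes the global fetch available since Node 18; the URL and params are the ones from the question):

// Fetch page `page` of the car listing as JSON by setting the Accept header.
// Assumes Node 18+ (global fetch); URL and params copied from the question.
async function getListingPage(page = 1) {
  const url = new URL("https://allegro.pl/kategoria/samochody-osobowe-4029");
  url.searchParams.set("order", "dd");
  url.searchParams.set("p", page); // pagination: &p=PAGE_NUM
  const response = await fetch(url, {
    headers: { Accept: "application/json" }, // ask for JSON instead of HTML
  });
  return response.json();
}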
I've made a small example in Python that can easily be ported to JS. It requests the list of cars, then requests the first element:
import requests
import pandas as pd

r = requests.get("https://allegro.pl/kategoria/samochody-osobowe-4029",
    headers={
        "Accept": "application/json"
    },
    params={
        "bmatch": "baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605",
        "order": "dd"
    })

data = [{
        "name": t["name"],
        "url": t["url"],
        "price": t["sellingMode"]["advertisement"]["price"]["amount"],
        **dict([(j["name"], j["values"][0]) for j in t["parameters"]]),
    }
    for t in r.json()["pagination bottom"]["collection"]["items"]["promoted"]
]
df = pd.DataFrame(data)
print(df)

print("get data for first element")
r = requests.get(data[0]["url"],
    headers={
        "Accept": "application/json"
    })
item = r.json()

item_data = {
    "phone": item["summary"]["offer"]["contact"]["phones"][0]["number"],
    "delivery": item["summary"]["offer"]["delivery"]["summary"][0]["value"]["text"],
    "startingAt": item["summary"]["offer"]["publication"]["startingAt"],
    "endingAt": item["summary"]["offer"]["publication"]["endingAt"],
    **dict([(j["name"], j["values"][0]["valueLabel"]) for j in item["summary"]["offer"]["parametersGroups"]["groups"][0]["parameters"]])
}
print(item_data)
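As a side note on the CSV part of the question: the pandas DataFrame above can be written straight to a file with df.to_csv("cars.csv", index=False).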
An implementation in Node.js using axios:
const axios = require("axios");

async function process() {
  // Axios takes query parameters via `params` (not `query`);
  // the Accept header is what makes the server return JSON.
  let response = await axios.get("https://allegro.pl/kategoria/samochody-osobowe-4029", {
    params: {
      bmatch: "baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605",
      order: "dd",
    },
    headers: { Accept: "application/json" },
    responseType: "json",
  });
  let promoted = response.data["pagination bottom"].collection.items.promoted;
  let list = [];
  for (let i = 0; i < promoted.length; i++) {
    let item = {
      name: promoted[i].name,
      url: promoted[i].url,
      price: promoted[i].sellingMode.advertisement.price.amount,
    };
    let params = promoted[i].parameters;
    for (let j = 0; j < params.length; j++) {
      item[params[j].name] = params[j].values[0];
    }
    list.push(item);
  }
  console.log(list);

  console.log("fetching : " + list[0].url);
  response = await axios.get(list[0].url, {
    headers: { Accept: "application/json" },
    responseType: "json",
  });
  let entryData = response.data;
  let entry = {
    phone: entryData.summary.offer.contact.phones[0].number,
    delivery: entryData.summary.offer.delivery.summary[0].value.text,
    startingAt: entryData.summary.offer.publication.startingAt,
    endingAt: entryData.summary.offer.publication.endingAt,
  };
  let parameters = entryData.summary.offer.parametersGroups.groups[0].parameters;
  for (let i = 0; i < parameters.length; i++) {
    entry[parameters[i].name] = parameters[i].values[0].valueLabel;
  }
  console.log(entry);
}

process();
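Regarding the CSV part of the question: since each car ends up as a flat object, converting the array to CSV is just a matter of joining keys and values. Below is a minimal hand-rolled sketch (the endpoint shape is copied from the question; for real data, a library such as csv-stringify is safer about quoting and encoding edge cases):

// Minimal JSON-array -> CSV conversion (assumes all objects share the same keys).
function toCsv(rows) {
  if (rows.length === 0) return "";
  const headers = Object.keys(rows[0]);
  // Quote every field and escape embedded double quotes per CSV rules.
  const escape = (v) => `"${String(v ?? "").replace(/"/g, '""')}"`;
  const lines = rows.map((row) => headers.map((h) => escape(row[h])).join(","));
  return [headers.join(","), ...lines].join("\n");
}

// In the Express endpoint, send it with a CSV content type instead of res.json:
app.get("/scraping/:allegro", (req, res) => {
  scraper.getCars(req.params.allegro).then((cars) => {
    res.type("text/csv").send(toCsv(cars));
  });
});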
Upvotes: 1