Reputation: 327
I'm fiddling around with some scraping, and need to manipulate some of the data before writing it to my JSON file.
// Scrape each matched search-result link, collecting its title and price.
var Xray = require('x-ray');
var x = Xray();

// Output field name -> CSS selector, evaluated within each matched link.
var resultFields = {
  title: '.responsive_search_name_combined .search_name .title',
  price: '.col.search_price.responsive_secondrow',
};

var scrape = x('http://myUrl.com', '#search_results div div a', [resultFields]);

// Paginate via the href of the last page button, apply limit(10),
// and write the collected results to data.json.
scrape
  .paginate('.search_pagination_right a.pagebtn:last-child@href')
  .limit(10)
  .write('data.json');
When saved, price looks like this: "price": "\r\n\t\t\t\t\t\t\t\t13,99€\t\t\t\t\t\t\t".
I guess it's because there's a lot of whitespace inside div.col.search_price.responsive_secondrow:
<div class="col search_price responsive_secondrow">
9,99€ </div>
So my question is: Would it be possible to manipulate the data before .write?
Upvotes: 1
Views: 781
Reputation:
You could use X-ray's natively supported approach, called filter functions, which completely covers the case you described.
Filters are custom-defined functions that let you implement your own logic while processing scraped data.
See the code sample below. It defines a custom filter function named cleanUpText and applies it to the scraped price field.
var Xray = require('x-ray');
var x = Xray({
  filters: {
    /**
     * Remove leading and trailing whitespace (spaces, tabs, CR/LF) from a
     * scraped value.
     *
     * Uses trim() rather than replacing a fixed run of tab characters, so it
     * keeps working when the amount of whitespace in the markup changes.
     *
     * @param {*} value - Scraped value passed in by the filter pipeline.
     * @returns {*} The trimmed string, or the value unchanged if it is not a
     *   string (e.g. when the selector matched nothing).
     */
    cleanUpText: function (value) {
      return typeof value === 'string' ? value.trim() : value;
    },
  },
});
x('http://store.steampowered.com/search/?filter=topsellers', '#search_results div div a', [{
  title: '.responsive_search_name_combined .search_name .title ',
  price: '.col.search_price.responsive_secondrow | cleanUpText', // calling filter function 'cleanUpText'
}])
  .paginate('.search_pagination_right a.pagebtn:last-child@href')
  .limit(10)
  .write('data.json');
data.json then looks like this:
{"title": "PLAYERUNKNOWN'S BATTLEGROUNDS",
"price": "$29.99"},
{"title": "PAYDAY 2: Ultimate Edition",
"price": "$44.98"}
Upvotes: 0
Reputation: 2061
Yes, you can simply provide a callback function that takes an object which is the result of your scrape. In this function you can take full control of any post-processing you want to do.
So your code would end up something like:
var fs = require('fs');

// Scrape the search results, then post-process them in a callback instead of
// letting x-ray write the raw data straight to disk.
x('http://myUrl.com', '#search_results div div a', [{
  title: '.responsive_search_name_combined .search_name .title',
  price: '.col.search_price.responsive_secondrow',
}])(function (err, products) {
  // The callback is error-first: the scraped data is the SECOND argument.
  if (err) {
    console.error(err);
    return;
  }

  var cleanedProducts = products.map(function (product) {
    return {
      title: product.title,
      // Guard against entries where the price selector matched nothing.
      price: typeof product.price === 'string' ? product.price.trim() : product.price,
      //etc
    };
  });

  //write out results.json 'manually'
  // fs.writeFile is asynchronous and needs a callback to report failures.
  fs.writeFile('results.json', JSON.stringify(cleanedProducts), function (writeErr) {
    if (writeErr) {
      console.error(writeErr);
    }
  });
});
Upvotes: 3