Reputation: 15
I need to be able to open and screenshot image URLs for a web-scraping project. My script works fine when given the initial page URL; however, after retrieving the desired image URL, it does not work.
Here is the function in the main script:
/**
 * Opens `pageURL`, reads the `src` attribute of the element matching
 * `selector`, navigates to that image URL and saves a screenshot of the image
 * to `./image-test/<partName>.png`.
 *
 * @param {string} pageURL - URL of the page that contains the image element.
 * @param {string} partName - Used as the screenshot file name and in log output.
 * @param {{host: string}} urlHostName - Object whose `host` is used in log output.
 * @param {string} selector - CSS selector of the element carrying the `src`.
 * @returns {Promise<void>} Resolves once the screenshot has been written.
 * @throws Rethrows any navigation/selector/screenshot error after logging it,
 *   so that awaiting callers are notified instead of waiting forever.
 */
async function ImageFetcher(pageURL, partName, urlHostName, selector) {
  let browser;
  try {
    browser = await puppeteer.launch({
      headless: false,
    });
    const page = await browser.newPage();
    await page.goto(pageURL);

    const rawSrc = await page.evaluate((sel) => {
      return document.querySelector(sel).getAttribute('src');
    }, selector);

    // Protocol-relative sources ("//host/path") are forced to https, as before.
    // Everything else is resolved against the current page URL, so sources that
    // are already absolute stay intact and relative paths become absolute.
    const imageHref = rawSrc.startsWith('//')
      ? `https:${rawSrc}`
      : new URL(rawSrc, page.url()).href;
    console.log(imageHref);

    // Reuse the same page for the second navigation: a closed page cannot be
    // navigated, so it must not be closed before this goto().
    await page.goto(imageHref);
    await page.waitForSelector("body > img");
    const image = await page.$("body > img");
    await image.screenshot({path: `./image-test/${partName}.png`});
    console.log(`${urlHostName.host} Image Captured`);
  } catch (e) {
    console.log(`Error ${urlHostName.host}! Part Name: ${partName}`);
    // Propagate the failure; swallowing it here would leave callers hanging.
    throw e;
  } finally {
    // Closing the browser also closes its pages; do it on success and failure
    // alike so no browser process is leaked.
    if (browser) {
      await browser.close();
    }
  }
}
var index = 0;
var array = json.Part;

/**
 * Runs `urlSorter` sequentially over the entries of `array`.
 *
 * A failure for one entry is logged together with the entry and the error,
 * and the loop then continues with the next entry.
 *
 * @returns {Promise<void>} Resolves when the loop has finished.
 */
async function start() {
  // NOTE(review): the bound `index < 1` processes only the first entry —
  // presumably a debugging limit; confirm before widening it to array.length.
  for (let index = 0; index < 1; index++) {
    const element = array[index];
    try {
      await urlSorter(element);
    } catch (e) {
      // Only `element`, `index` and `e` are in scope here; the part name and
      // host are not visible in this function, so log what is available.
      console.log(`URL Sorter Error, index ${index}:`, element, e);
    }
  }
}

// Attach a handler so that an unexpected failure (e.g. `array` being undefined)
// is reported instead of becoming an unhandled rejection.
start().catch((e) => {
  console.log('start() failed:', e);
});
Here is a more isolated test of the issue (this does not work either; the opened page is blank):
const puppeteer = require('puppeteer');
let pageURL = "https://static.grainger.com/rp/s/is/image/Grainger/1RVB9_AS01?hei=536&wid=536";

/**
 * Launches a visible browser, navigates a new page to `pageURL`, then closes
 * the page and the browser again.
 *
 * @param {string} pageURL - URL to open.
 * @returns {Promise<void>} Resolves after the browser has been closed.
 * @throws Rethrows any launch/navigation error after logging it.
 */
async function ImageFinder(pageURL) {
  let browser;
  try {
    browser = await puppeteer.launch({
      headless: false,
    });
    const page = await browser.newPage();
    await page.goto(pageURL);
    await page.close();
  } catch (e) {
    // Include the error itself; a bare "ERR" hides the actual cause.
    console.log(`ERR`, e);
    throw e;
  } finally {
    // Always release the browser, including when navigation failed.
    if (browser) {
      await browser.close();
    }
  }
}

ImageFinder(pageURL).catch(() => {
  // The error was already logged inside ImageFinder; just signal the failure.
  process.exitCode = 1;
});
This is the result of both versions of the above code
However, if I manually paste the URL into the browser, the image displays.
Upvotes: 1
Views: 1180
Reputation: 13802
Your URL lacks the protocol part. It is added automatically when you paste it manually into the browser address bar, but you need to specify it explicitly when using Puppeteer.
So try:
// The URL is written with its scheme ("http://") spelled out, since
// Puppeteer's page.goto() is documented to expect a URL that includes it.
pageURL = "http://static.grainger.com/rp/s/is/image/Grainger/1RVB9_AS01?hei=536&wid=536";
Upvotes: 1