I'm working on a Puppeteer-based scraper that extracts product details (image, title, and price) from a webpage. The behavior I observe: the scraper works perfectly in non-headless mode, but as soon as I switch to headless mode, the .stdImg selector (used to extract the image) consistently returns null.

I need headless mode because this scraper will eventually run as part of a Chrome extension, and I don't want a browser window opening during the process.
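To check whether the element is missing from the headless DOM entirely, or present but with an empty src, a quick probe right after the safeGoto call helps (a minimal sketch; the data-src check is only an assumption about how the site might lazy-load images):

// Probe (sketch): is .stdImg in the DOM at all, and what does it carry?
// NOTE: "data-src" is an assumption; many lazy-load libraries stash the real URL there.
const probe = await page.evaluate(() => {
  const el = document.querySelector(".stdImg");
  if (!el) return { found: false };
  return { found: true, src: el.src, dataSrc: el.getAttribute("data-src") };
});
console.log("stdImg probe:", probe);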
I've tried:
const { connect } = require("puppeteer-real-browser");
let browser = null;
let page = null;
// Configs
const connectOptions = {
  headless: false,
  devtools: true,
  args: [
    "--disable-features=site-per-process",
    "--no-sandbox",
    "--disable-setuid-sandbox",
    "--disable-gpu",
    "--hide-scrollbars",
    "--disable-extensions",
    "--disable-blink-features=AutomationControlled",
  ],
  customConfig: {},
  turnstile: true,
  connectOption: {},
  disableXvfb: false,
  ignoreAllFlags: false,
};
const viewPortOptions = { width: 1024, height: 768 };
const ExtraHTTPHeadersOptions = { "accept-language": "tr-TR,tr;q=0.9,en-US,en;q=0.8" };
const userAgent =
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
// Load the page, retrying on navigation errors
async function safeGoto(page, url, retries = 3) {
  for (let attempt = 1; attempt <= retries; attempt++) {
    try {
      await page.goto(url, { waitUntil: "domcontentloaded", timeout: 60000 });
      return;
    } catch (error) {
      console.error(`Attempt ${attempt} failed:`, error.message);
      if (attempt === retries) throw error;
    }
  }
}
// Extract data: wait for the selector, then evaluate against the element
async function safeEvaluate(page, selector, evaluateFn, errorMessage) {
  try {
    const element = await page.waitForSelector(selector, { timeout: 10000 });
    return await element.evaluate(evaluateFn);
  } catch (error) {
    console.error(errorMessage, error.message);
    return null;
  }
}
async function scrapeWebsite(url, selectors) {
  try {
    const { browser: connectedBrowser, page: connectedPage } = await connect(connectOptions);
    browser = connectedBrowser;
    // Use the page returned by connect(), assigned to the shared variable so the
    // finally block below can close it
    page = connectedPage;
    // Apply the spoofed Chrome user agent defined above
    await page.setUserAgent(userAgent);
    await page.setViewport(viewPortOptions);
    await page.setExtraHTTPHeaders(ExtraHTTPHeadersOptions);
    await page.waitForSelector("body", { timeout: 10000 });
    await safeGoto(page, url);
    // Scroll every image into view to trigger lazy-loading
    await page.evaluate(() => {
      const imgs = document.querySelectorAll("img");
      imgs.forEach((img) => img.scrollIntoView({ behavior: "smooth", block: "center" }));
    });
    // Extract data
    const src = await safeEvaluate(
      page,
      selectors.image,
      (el) => el.src,
      `Failed to extract image from selector: ${selectors.image}`
    );
    const title = await safeEvaluate(
      page,
      selectors.title,
      (el) => el.textContent.trim().toLowerCase(),
      `Failed to extract title from selector: ${selectors.title}`
    );
    const value = await safeEvaluate(
      page,
      selectors.price,
      (el) => el.textContent.trim(),
      `Failed to extract price from selector: ${selectors.price}`
    );
    // Return extracted data
    const color = selectors.color;
    return { src, value, title, color, url };
  } catch (error) {
    console.error("Scraping failed:", error.message);
    throw error;
  } finally {
    if (page && !page.isClosed()) {
      try {
        await page.close();
      } catch (closeError) {
        console.error("Failed to close page:", closeError.message);
      }
    }
    if (browser && browser.isConnected()) {
      try {
        await browser.close();
      } catch (closeError) {
        console.error("Failed to close browser:", closeError.message);
      }
    }
    if (browser && browser.process() != null) browser.process().kill("SIGINT");
  }
}
const getFromDomain = (url) =>
  scrapeWebsite(url, {
    image: ".stdImg",
    title: ".classifiedDetailTitle h1",
    price: ".classified-price-wrapper",
    color: "#FFE800",
  });
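For reference, the call site looks like this (the URL is a placeholder), and the only delta between the working and failing runs is the headless flag in connectOptions (sketch):

// Sketch: placeholder URL; flipping connectOptions.headless to true reproduces the failure.
getFromDomain("https://example.com/some-listing")
  .then((result) => console.log("Scraped:", result))
  .catch((err) => console.error("Scrape error:", err.message));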
The selectors I'm targeting are:
- .stdImg for the image
- .classifiedDetailTitle h1 for the title
- .classified-price-wrapper for the price
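One thing I suspect is that the smooth scrollIntoView calls don't trigger lazy-loading the same way in headless mode, so I'm also considering explicitly waiting until the image's src is populated before extracting it (sketch, assuming the image is in fact lazy-loaded):

// Sketch: block until .stdImg exists AND has a non-empty src (assumes lazy-loading).
await page.waitForFunction(
  (sel) => {
    const el = document.querySelector(sel);
    return !!(el && el.src);
  },
  { timeout: 10000 },
  ".stdImg"
);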