mr.sun
mr.sun

Reputation: 31

<img> Selector Returns Null in Headless Mode but Works in Non-Headless Mode

I'm working on a puppeteer-based scraper that extracts product details (image, title, and price) from a webpage. The scraper works perfectly in non-headless mode, but when I switch to headless mode, the .stdImg selector (used to extract the image) consistently returns null.

Here are the behaviors I observed:

In my application, I need to use headless mode because this scraper will eventually run as part of a Chrome extension, and I don't want to open a browser window during the process.

I've tried:

const { connect } = require("puppeteer-real-browser");

let browser = null;
let page = null;

//Configs
const connectOptions = {
    headless: false,
    devtools: true,
    args: [
        "--disable-features=site-per-process",
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-gpu",
        "--hide-scrollbars",
        "--disable-extensions",
        "--disable-blink-features=AutomationControlled",
    ],
    customConfig: {},
    turnstile: true,
    connectOption: {},
    disableXvfb: false,
    ignoreAllFlags: false,
};

const viewPortOptions = { width: 1024, height: 768 };
const ExtraHTTPHeadersOptions = { "accept-language": "tr-TR,tr;q=0.9,en-US,en;q=0.8" };
const userAgent =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";

//Load page
async function safeGoto(page, url, retries = 3) {
    for (let attempt = 1; attempt <= retries; attempt++) {
        try {
            await page.goto(url, { waitUntil: "domcontentloaded", timeout: 60000 });
            return;
        } catch (error) {
            console.error(`Attempt ${attempt} failed:`, error.message);
            if (attempt === retries) throw error;
        }
    }
}

//Extract data
async function safeEvaluate(page, selector, evaluateFn, errorMessage) {
    try {
        const element = await page.waitForSelector(selector, { timeout: 10000 });
        return await element.evaluate(evaluateFn);
    } catch (error) {
        console.error(errorMessage, error.message);
        return null;
    }
}

async function scrapeWebsite(url, selectors) {
    try {
        const { browser: connectedBrowser, page: connectedPage } = await connect(connectOptions);
        browser = connectedBrowser;

        const [page] = await browser.pages();
        await page.setUserAgent("");
        await page.setViewport(viewPortOptions);
        await page.setExtraHTTPHeaders(ExtraHTTPHeadersOptions);

        await page.waitForSelector("body", { timeout: 10000 });
        await safeGoto(page, url);
        await page.evaluate(() => {
            const imgs = document.querySelectorAll("img");
            imgs.forEach((img) => img.scrollIntoView({ behavior: "smooth", block: "center" }));
        });

        // Extract data
        const src = await safeEvaluate(
            page,
            selectors.image,
            (el) => el.src,
            `Failed to extract image from selector: ${selectors.image}`
        );

        const title = await safeEvaluate(
            page,
            selectors.title,
            (el) => el.textContent.trim().toLowerCase(),
            `Failed to extract title from selector: ${selectors.title}`
        );

        const value = await safeEvaluate(
            page,
            selectors.price,
            (el) => el.textContent.trim(),
            `Failed to extract price from selector: ${selectors.price}`
        );

        // Return extracted data
        const color = selectors.color;
        return { src, value, title, color, url };
    } catch (error) {
        console.error("Scraping failed:", error.message);
        throw error;
    } finally {
        if (page && !page.isClosed()) {
            try {
                await page.close();
            } catch (closeError) {
                console.error("Failed to close page:", closeError.message);
            }
        }
        if (browser && browser.isConnected()) {
            try {
                await browser.close();
            } catch (closeError) {
                console.error("Failed to close browser:", closeError.message);
            }
        }
        if (browser && browser.process() != null) browser.process().kill("SIGINT");
    }
}

const getFromDomain = (url) =>
    scrapeWebsite(url, {
        image: ".stdImg",
        title: ".classifiedDetailTitle h1",
        price: ".classified-price-wrapper",
        color: "#FFE800",
    });

.stdImg for the image .classifiedDetailTitle h1 for the title .classified-price-wrapper for the price

Upvotes: 0

Views: 44

Answers (0)

Related Questions