Jimmy
Jimmy

Reputation: 71

How to scrape Google News results with Puppeteer in JS?

I am currently working on scraping Google News pages. I am trying to scrape them with Puppeteer, but every attempt returns an empty result.

Here is my code:

const puppeteer = require('puppeteer')
const cheerio = require('cheerio')

// Connects to a remote browser over a WebSocket endpoint, opens the Google
// News search page for `query`, tries to read one result title, and logs it.
// As written, the returned promise always resolves to an empty array.
const getNewsData = async (query) => {
  // Only `title` is used below; url, snippet, imgSrc, lastUpdated and source
  // are declared but never read or written in this function.
  let title = [] , url = [] , snippet = [] , imgSrc = [] , lastUpdated = [] , source = []; 
  const browser = await puppeteer.connect({
    browserWSEndpoint: `wss://chrome-us.browsercloud.io?token=hided`,
});
    const page = await browser.newPage();


try {
  // `query` is concatenated into the URL as-is (no URL encoding is applied).
  await page.goto("https://www.google.com/search?q="+query+"&tbm=nws&gl=us")
  // NOTE(review): page.$ presumably resolves to a Puppeteer ElementHandle
  // (or null when nothing matches), not a DOM element — confirm against the
  // Puppeteer docs. If so, `.textContent` below is undefined (or throws on
  // null), which would explain the empty result.
  const elmHandle = await page.$("div.iRPxbe > div.mCBkyc");

  title.push(elmHandle.textContent)
  
  // Only reached when every statement above succeeds; if one of them throws,
  // control jumps to `catch` and the browser connection is never closed.
  await browser.close();
  console.log(title);
} catch (error) {
  console.log("Error : " +error)
}
// Always returns a fresh empty array; nothing collected in `title` is returned.
return [];
// Remember to catch errors and close!
};

getNewsData("football");

Please also help me scrape the news source, thumbnail, and date.

Upvotes: 1

Views: 1177

Answers (2)

Darshan
Darshan

Reputation: 122

Check this answer to get Google News results:

const unirest = require("unirest");
const cheerio = require("cheerio");

/**
 * Request the Google News results page for a search term and extract one
 * record per `.BGxR7d` result card.
 *
 * @param {string} [query="football"] - Search terms. They are URL-encoded
 *   before being placed in the query string, so spaces and `&` are safe.
 * @returns {Promise<Array<{link: (string|undefined), title: string,
 *   snippet: string, date: string, thumbnail: (string|undefined)}>>}
 *   Resolves with the parsed records (also printed to the console). The array
 *   is empty when no element matches the `.BGxR7d` selector.
 */
const getNewsData = (query = "football") => {
  return unirest
    .get(`https://www.google.com/search?q=${encodeURIComponent(query)}&gl=us&tbm=nws`)
    .headers({
      "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36",
    })
    .then((response) => {
      const $ = cheerio.load(response.body);

      const news_results = [];

      $(".BGxR7d").each((i, el) => {
        const card = $(el);
        news_results.push({
          link: card.find("a").attr("href"),
          title: card.find("div.mCBkyc").text(),
          snippet: card.find(".GI74Re").text(),
          date: card.find(".ZE0LJd span").text(),
          thumbnail: card.find(".NUnG9d img").attr("src"),
        });
      });

      console.log(news_results);
      // Return the data as well, so callers can use it instead of only
      // seeing it in the console.
      return news_results;
    });
};

// Log failures instead of leaving the promise without a rejection handler.
getNewsData().catch((error) => console.error(error));

If you need an explanation of this code, I have also written a blog post on how to scrape Google News results: https://serpdog.io/blog/web-scraping-google-news-results-with-node-js.html

Alternative:

You can use the Google News API by Serpdog. Serpdog also offers 100 free credits on first signup.

Scraping can be time-consuming, but this API returns ready-made, structured JSON data, which makes your work easier. You also don't have to keep updating the Google CSS selectors whenever they change, which is a big headache.

How to use:

const axios = require('axios');

// Call the Serpdog news endpoint and print the parsed response body.
// Any failure (request or logging) is printed rather than thrown.
(async () => {
  try {
    const { data } = await axios.get('https://api.serpdog.io/news?api_key=APIKEY&q=football&gl=us');
    console.log(data);
  } catch (error) {
    console.log(error);
  }
})();

Results:

"news_results": [
{
  "title": "Martin Bengtsson: football’s Swedish wonderkid whose dream died at Inter",
  "snippet": "If Martin Bengtsson feels stressed he kicks a football around on his own and, almost immediately, the tension begins to ebb away.",
  "source": "The Guardian",
  "imgSrc": "data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==",
  "lastUpdated": "3 hours ago",
  "rank": "1"
},
.....

Disclaimer: I am the founder of serpdog.io

Upvotes: 0

Mikhail Zub
Mikhail Zub

Reputation: 474

You don't need any browser automation to get this information: it can be fetched with a simple HTTP request, which uses fewer resources. Check how to do this in the online IDE:

const cheerio = require("cheerio");
const axios = require("axios");

// The raw search terms. They are passed to axios unencoded on purpose: axios
// URL-encodes everything under `params` itself, so encoding here first would
// double-encode multi-word queries (a space would arrive as "%2520").
const searchString = "football";

// Request configuration shared by every search request.
const AXIOS_OPTIONS = {
    headers: {
        // Adding a User-Agent header is one way to keep the request from being blocked.
        "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36",
    },
    params: {
        q: searchString, // the search terms (encoded by axios when the URL is built)
        tbm: "nws",      // type of search to run ("nws" means news)
        hl: "en",        // language to use for the Google search
        gl: "us",        // country to use for the Google search
    },
};

/**
 * Download the Google search page described by AXIOS_OPTIONS and turn every
 * `.WlydOe` result link into a plain record.
 *
 * @returns {Promise<Array<{link: (string|undefined), source: string,
 *   title: string, snippet: string, image: string, date: string}>>}
 *   Resolves with one record per result. `image` is the inline image data
 *   found for the card's <img> id, or the string "No image" when none matches.
 *   Rejects if the HTTP request fails.
 */
function getNewsInfo() {
    return axios
        .get("https://www.google.com/search", AXIOS_OPTIONS)
        .then(function ({ data }) {
            const $ = cheerio.load(data);

            // Pull (image data, element id) pairs out of the raw HTML; they
            // are matched to each card's <img id="..."> further down.
            const pattern = /s='(?<img>[^']+)';\w+\s\w+=\['(?<id>\w+_\d+)'];/gm;

            // Index the images by id once, instead of scanning the whole list
            // for every card. The first occurrence of an id wins.
            const imageById = new Map();
            for (const { groups } of data.matchAll(pattern)) {
                if (!imageById.has(groups.id)) {
                    // `\x3d` is an escaped "=". Strip every occurrence, not
                    // just the first, so no literal escape is left behind.
                    imageById.set(groups.id, groups.img.replace(/\\x3d/g, ""));
                }
            }

            return Array.from($(".WlydOe")).map((el) => {
                const card = $(el);
                const imageId = card.find(".uhHOwf img").attr("id");

                return {
                    link: card.attr("href"),
                    source: card.find(".CEMjEf span").text().trim(),
                    // Remove all embedded line breaks, not only the first one.
                    title: card.find(".mCBkyc").text().trim().replace(/\n/g, ""),
                    snippet: card.find(".GI74Re").text().trim().replace(/\n/g, ""),
                    image: imageById.get(imageId) || "No image",
                    date: card.find(".ZE0LJd span").text().trim(),
                };
            });
        });
}

// Print the parsed results and log any failure, so the call neither discards
// its result nor leaves the promise without a rejection handler.
getNewsInfo()
    .then((news) => console.log(JSON.stringify(news, null, 2)))
    .catch((error) => console.error(error));

Output:

[
   {
      "link":"https://www.cardchronicle.com/2022/7/11/23077819/madden-sanker-commits-to-louisville-football",
      "source":"Card Chronicle",
      "title":"Madden Sanker Commits to Louisville Football",
      "snippet":"Louisville lands their highest rated offensive line recruit in program history.",
      "image":"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAkGBwgHBgkIBwgKCgkLDRYPDQwMDRsUFRAWIB0iIiAdHx8kKDQsJCYxJx8fLT0tMTU3Ojo6Iys/RD84QzQ5OjcBCgoKDQwNGg8PGjclHyU3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3N//AABEIAFwAXAMBIgACEQEDEQH/xAAbAAACAwEBAQAAAAAAAAAAAAAEBQMGBwIBAP/EADoQAAEDAwIEBAUBBgUFAAAAAAECAwQFESEAEgYTMUEiUWFxFDKBkaEHFSMkQlLRkrHh8PFyorLB0v/EABkBAAMBAQEAAAAAAAAAAAAAAAIDBAABBf/EACURAAICAQQBAwUAAAAAAAAAAAABAhEDEiExUQQTQaEiYXHh8P/aAAwDAQACEQMRAD8AoXD1BqCnnlQWVLaEcrcCVblOWKeiRnG78HViic5+W0wlt74nbsS1y1XP9vroCVXXJVSalLLqH2idjsdQSsC+CbAi48hjscaPd43rUmK60Zm8upKN/LAUkH+kpHX/AHbS5Y1LkohncE0iqR6HUCqoiOuMuQ04VKSHwlYIPUA9vW+n9Ghl4uu1ea5EhpshxxTgT8wvYX6m2LDPiB6DXfAcFVSrpihlaQ4lan5G4bmmR8wAKbZO0Dpk6vnF/A8LiKK0qEkQpcRrlxUhR5O0dEFPa/8AUM3yb6J473QMcj2+xjIkcpKgslY3EIXawUm/Ujt276lSl+VHfeTlDYu4oqAtf3630vqTUmJOehyUKbdZWW3UHqhQNiNXDgqitVGMFuUOdNVuV+8uG27Y6KKhe1v+4+Q0l402Xw8qah9Qo4RckQ61GKGkPx0ub1MunwIwQVHNuh6/TTenS6C/WZPIirj0xccoDTrouy4T8yO5tm1/Lti0/FlPiUiIlyXw3KiEkpSeZubWf5fGFkDyI6+Wqvw7DlS1OtxEJUtKQpalWGPLOjp1uTZJY+YIb13hiOlTIfqTUTegqTzmSkrT2IyL6KovDsOHEmxX6kh9b6CFIAsEXGFWvfv/AJasdNjLqlFco3ECUJSyCuBNISvlH+gjPhOq5SKFUYyqsp/et2SwpCVlwK3KsbZ6/fRJkrBGOE0MMK5dRYWsjwrWggD10OOFHbZrMQ+ytL1cOcUgbAxJKBgD4gWt/i0VTqFWGGVJkQ3N5VfLiTiw9dZpLeh0Mkm0m6RxKIacIdBUq5sFC1/9NetPkqBcIKB1SlW0Eaf08PT2RGrzK5AQSG1c1PMSP+onTqDwnTYcM1uS8Vw2iUtxHACp53sFKHVPcgeXlfQrKvcH0W+A/wDTcRosV51uOGHpqgUgqPiaT0IvmxUVZ72Hlqz8W1GoQeGZUyksIdkMJ3qSeuzuR6gZ+msin1SpsV5mfA3SJJNi2hJVvvYbbDtgAAdLDWx0OoCS2lmU18PJ2AuR1qClI97YOmYsqnEKeFwddGD02BU+LKsOSwhSzdTy0AIFr5UpR6qz1661quPVGisRVU4yFoa5TXJbZ2pUSBm2Li4INsi4GNK6jSTwOqUlmEmTSKg6Qp1zdaO2r5myE5GL7SOvQ9NaBGlMQ6W2826pbCGgULLm8uADHiJ8RPqc6FcmmnS6IKI0mtcPusVduPIS6pbbyEIISTfxjJ67rjFrEG2sPlQnaLWqhTWHl82O8WkqGVLTfw3HmRb763hFfirjpecJbT3ByR7gZGsK4xbqjfEsitux3I0eXMKo61kDdttY26/y99ZgJMkiLqamlB7n7km2WrY+2n78oVKjtD4UsTYiFBxxDe0PIsNt8ZUM6Vpf4pVFS4mPIKyOvw97/jRER/iRxhxT8WQFJQSndHtn7aBVfJnfQn+HrjkbnMolFAG4XXa49ATkY1O026+2l1c+Q2pQ8SCtWD9jokzOKAT/AAL31inQC61xAVqCYr
q9p2qKYqjYjsbd9C43ww4y08odFxwkq3C6TcG1tc1LiF2UpqnukIjspG1s5uq2VfU6rKajk3cmX7ZB/wDerJw7+wJdNqEqtIkSXGWjtS45y+VbIUFpNxfA6HN8aF43JUMhkUXdElMNHZD8l5u1RbQTFc62VbpYdb9M+eq41VFqlqmMylpkoVdK0natB8vb005rFAREhQp8GW4/HktIebS6gJWhKhcXIwT9B01WKpPROQ0G4iG3E/NIPzn0x299Csbjsy3F5F3pVpmm8GMq4ojtSK+W3oFNSWm2luKsV/1FIIFrdbgknyGDzUYz9ImR4PDtVQ5DfIWmJLRvDd0lYyRlBA7G4PvlBwNxPFpEB6JNyl15TityCsHwADAFhcjJ8vppjVpw4p4jp8LhtKg40jl/FXKAEAA5FrgJ8We97d9Pk1oT9xUYNZWmqh8A1Xp9ZqBNkRkpdvzGWXDZxShm+8+Q7dMeWKzUWZphGHIElaYZBRZJKGslKgTbH+h0+fqk2PUJTUGoNVJMRK7SEtpCFpSncuxIv0TbB6gWOtIkVqKxQ3H5EhD8VbSlqJQm3KUk2CUgG4yMZv3J0OOLk22znkShHGoxS391+xbRnZDdKhpeG5xMdsKzm4SL30UZLySbJT37apDfGEMI8VRnBQthMZOfPtrw8Vstbi7UZbiHFbmtkdBsnAzjGQcaU1LoR9PZczJev0Tb66r0/iCXDluMNJZ2pPdPW+dAo4pgLur9ryUHyXGT/wDOkM+rMPy3HEywsG3iUixOPK2stRm4i5iC8pD6Sw4XQgctAQSSSodAOuL6dUNl9mLKZmQ22GJDJb5zyChN8/PnpYkXt30FS6gYVKkttuKbdlLCN4J+UC5APa5OfPQ01M2XDWVl5bAV43M7Untc9PLRyy1KirD4WrH6je5q9HhwJfAVKblx96kQU2WkG4UBbrrE5TXKfcS2bgKIsda/wXNKv09Qp1XNcjIeacyDneop/Ck6yep7TMcKbgE30eV20bxYVCV9nEUXueh8r60X9PYbsaIicw2DMqM1MBouJ3JDITzXiR38KCPcDWaIUpC7pN/bVipPE1Yp9LegwpiWWFkmxaBW2SLKKCehI9/ppUajK2PzKWTD6cORhVAylEydAioYj1SQ4zDaaG1Ijt25ih2G9WzHkVarTrj/ACw04pwNJUSltRO1Jvmw6ddFy6iZlPgQw2lliE0W0NJN7lRupfT+Yj6WA1FzpKWgvnObVEpF1E9AP7jTo9nn5pOlB/3XwRx2N/O3ggJYU4Lj2sdRbHC0XA2S2nBV2GimZ0hoq/eKWFBWFZyQf+dQF97mB0ur3gWCgcgaIQQjJsBm/TTalU5qXFLriiDuIGe2g3X5TS1IW6oKHU9/v11NHqbjbe1ad5v1wNcZ1ULnnw2y2APHvUbnIIsMW094eVFn05bVQrTkKPvP8PzbBRGyytpx3Vm3YeWp/wBSqTCpleZTATaPJZ+ISAfCkqJB2+nhB+vlpHwitSeIoIUGlNqf2lLqbpINwbjv10LSsfiyzUWk9i7QJMKlUafCpk0y2ZLg3lVvAUg5TYWN8A+WOnTVFmupfkEpAGevnqx8OUh2r8UookBSBdUpb7im7oQi5sdtxb5UDBFrjTCZ+l9cQ+5y3oLovizqkk/cH/PQyhJu0VeP5MNDUnvd7lHEdZVYAK0UptaYu2183KFZt/v006kcCcRxiVKpinUj+ZlaHPxe/wCNLH2X46/h5LTzDiRlt5G0j6G1tLlqXJXBwlelgDBCZqi42VN7QLBVrn3z56aOKiKgtpC3QEOKO3aNxuPfpgaDUUsuBSxjJI9Oh/GpXoMhp1aOUtW1RF0pv09tOi7R4+babZ4Vxl/MytBGBsVf6m/U/bXI+E+Uqfyfm2jw/S+fuNTsQng3JK2lXDQ2WF/FuGPewONQCG+R8vjOeWcKt528tEK3J5646pS3hucKwFBFrJFwOpvfQp+Ev0f9gpOPxr74WRfbyXL+qcaYQoqTGQ
XUELN7gjIydY6txTUZDzkeGh18rDLakNBRvsRuOPa99dQnP2RVYUkBSg2GJBSDYnchKyB/iNtBTT47eSBphW0bBGG4kohsAE9bctJt9L/bQydMo8eOq/wat+kscmBWayW1IVUZpS2pR8RbTn/yWb+o9NXq9tCU6nR6TRaZAhpIZZjJtfqSrKifUkk/XRKsJxqiK2JT3cAb64kx4k9rlTozMhv+l5AUPzoZxxSeh1Ew+4p0pJxrprrgyfjyDCjcWy4cFpLTYZbdS2noDtyB7gX+uklSbO2LJ3X57CCfdKQk/kHRPF0hb3FUuSqwdMpbZIvlKTtH4A1GEpci2cTuDCiEAk4Bzb76n2vYNttADLy2t3LJAUCk29RbXBOfXRbDiZKkB1pslR27kjbYWv2xoxmHHM1mOW/CvxFVzcWI/vroNCuSlbLrjKlE8tRSc+WpY1UkxmyhCgoXv4xfRtWbTG/ikgKdcIJ35AJFzYaGDLJJ/cp7dFK7gHz9dY7R/9k",
      "date":"8 hours ago"
   },
   ...and other results
]

You can also check my blog post Web Scraping Google News with Nodejs if you want to know more about this topic.

Upvotes: 3

Related Questions