Markus W

Reputation: 11

Using an iterator in conjunction with a scraper of multiple URLs to avoid lengthy repetitive script

Greetings all, and thanks in advance for any help.

My style of coding (I'm not a coder at all) is to borrow what's out there and try to make it work for my situation.

In this case, I've borrowed a scraper that scrapes the price and item #, then writes them to a Google Sheet. It works, but after entering 25 items out of hundreds, I am wondering if it is possible to iterate the process (a for loop?) to avoid thousands of lines of script.

I've attached the original scraper (which works); it also has the beginnings of my changes (functions for url1, name1, price1, etc.).

function runGraingerScraperRemoveOld(){
     priceScraper();
     outWithTheOldInWithTheNew();
}

function priceScraper() {
    var url1 = SpreadsheetApp.getActiveSpreadsheet().getSheetByName('TotalList').getRange('H9').getValue();
//    const url2 = 'https://www.grainger.com/product/CONDOR-Sorbent-Boom-35ZR66' 
//    const url3 = 'https://www.grainger.com/product/EAGLE-95-gal-Yellow-Polyethylene-5PW10'
//    const url4 = 'https://www.grainger.com/product/GRAINGER-APPROVED-55-gal-Yellow-Steel-Open-Head-4GY35'
//    const url5 = 'https://www.grainger.com/product/5EET0'

    
    // parse the data, return the price     
    const price1 = getData(url1).replace("$","").replace(/\s/g, "")
//    const price2 = getData(url2).replace("$","").replace(/\s/g, "")
//    const price3 = getData(url3).replace("$","").replace(/\s/g, "")
//    const price4 = getData(url4).replace("$","").replace(/\s/g, "")
//    const price5 = getData(url5).replace("$","").replace(/\s/g, "")

    
    // reference name (Item #)
    const name1 = getItemNumber(url1);
//    var name2 = '35ZR66'
//    var name3 = '5PW10'
//    var name4 = '4GY35'
//    var name5 = '5EET0'

    
    // input data into spreadsheet
    var sheet1 = SpreadsheetApp.getActiveSpreadsheet().getSheetByName('Graingerscrape');
    sheet1.appendRow([name1, new Date(), price1]);
//    var sheet2 = SpreadsheetApp.getActiveSpreadsheet().getSheetByName('Graingerscrape');
//    sheet1.appendRow([name2, new Date(), price2]);
//    var sheet3 = SpreadsheetApp.getActiveSpreadsheet().getSheetByName('Graingerscrape');
//    sheet1.appendRow([name3, new Date(), price3]);
//    var sheet4 = SpreadsheetApp.getActiveSpreadsheet().getSheetByName('Graingerscrape');
//    sheet1.appendRow([name4, new Date(), price4]);
//    var sheet5 = SpreadsheetApp.getActiveSpreadsheet().getSheetByName('Graingerscrape');
}

function outWithTheOldInWithTheNew() {
  // Keeps only the newest row per item # on 'Graingerscrape'.
  var sh = SpreadsheetApp.getActiveSpreadsheet().getSheetByName('Graingerscrape');
  var eA = sh.getRange(2, 1, sh.getLastRow() - 1, sh.getLastColumn()).getValues(); // entire rows
  var tA = sh.getRange(2, 1, sh.getLastRow() - 1, 1).getValues();  // item numbers (column A)
  var dA = sh.getRange(2, 2, sh.getLastRow() - 1, 10).getValues(); // dates (column B onward)
  var uA = []; // unique item numbers
  var uB = []; // newest date seen per item
  var uC = []; // newest entire row per item
  for (var i = 0; i < tA.length; i++) {
    var idx = uA.indexOf(tA[i][0]);
    if (idx == -1) {
      // first time we see this item #
      uA.push(tA[i][0]);
      uB.push(dA[i][0]);
      uC.push(eA[i]);
    } else if (new Date(dA[i][0]).valueOf() > new Date(uB[idx]).valueOf()) {
      // a newer entry for the same item # replaces the older one
      uB.splice(idx, 1, dA[i][0]);
      uC.splice(idx, 1, eA[i]);
    }
  }
  sh.getRange(2, 1, sh.getLastRow() - 1, sh.getLastColumn()).clearContent();
  sh.getRange(2, 1, uC.length, uC[0].length).setValues(uC); // write back newest rows
}

function getData(url1){
    // Fetch the product page and parse out the price. Relies on the
    // community 'Parser' library, which must be added to the project.
    var fromText = '<span class="pricing__price">';
    var toText = '</span>';
  
    var content = UrlFetchApp.fetch(url1).getContentText();
    var scraped = Parser
                      .data(content)
                      .from(fromText)
                      .to(toText)
                      .build();
    Logger.log(scraped);
    return scraped;
}

function getItemNumber(url1){
    // Same approach, but parses out the item # instead of the price.
    var fromText = '<span class="product-detail__product-identifiers-description">';
    var toText = '</span>';
  
    var content = UrlFetchApp.fetch(url1).getContentText();
    var scraped = Parser
                      .data(content)
                      .from(fromText)
                      .to(toText)
                      .build();
    Logger.log(scraped);
    return scraped;
}
//function getData(url2){
//    var fromText = '<span class="pricing__price">';
//    var toText = '</span>';
//  
//    var content = UrlFetchApp.fetch(url2).getContentText();
//    var scraped = Parser
//                      .data(content)
//                      .from(fromText)
//                      .to(toText)
//                      .build();
//    Logger.log(scraped);
//    return scraped;
//}
//function getData(url3){
//    var fromText = '<span class="pricing__price">';
//    var toText = '</span>';
//  
//    var content = UrlFetchApp.fetch(url3).getContentText();
//    var scraped = Parser
//                      .data(content)
//                      .from(fromText)
//                      .to(toText)
//                      .build();
//    Logger.log(scraped);
//    return scraped;
//}
//function getData(url4){
//    var fromText = '<span class="pricing__price">';
//    var toText = '</span>';
//  
//    var content = UrlFetchApp.fetch(url4).getContentText();
//    var scraped = Parser
//                      .data(content)
//                      .from(fromText)
//                      .to(toText)
//                      .build();
//    Logger.log(scraped);
//    return scraped;
//}  
//function getData(url5){
//    var fromText = '<span class="pricing__price">';
//    var toText = '</span>';
//  
//    var content = UrlFetchApp.fetch(url5).getContentText();
//    var scraped = Parser
//                      .data(content)
//                      .from(fromText)
//                      .to(toText)
//                      .build();
//    Logger.log(scraped);
//    return scraped;
//}  
  

As you can see, the original code required each URL to be typed in (e.g. const url2, const url3, etc.). I have since been able to extract the URL from each hyperlink on my sheet, and am now using cell references (e.g. var url1), roughly as sketched below.
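
One way to do that extraction (a minimal sketch, assuming the links are rich-text hyperlinks on the cells rather than =HYPERLINK() formulas) is via getRichTextValue():

function getUrlFromCell(a1) {
    // Returns the link URL behind a cell on 'TotalList', or null if none.
    var sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName('TotalList');
    var rich = sheet.getRange(a1).getRichTextValue();
    return rich ? rich.getLinkUrl() : null;
}

// e.g. getUrlFromCell('H9') returns the product URL behind cell H9.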

Where I am stuck is creating an iterator for the URL retrieval, as well as iterating the parsing of the corresponding item # (function getItemNumber(url1)) and the corresponding price (function getData(url1)).

From there (since this is all in a data table), I will use offset to write the name and price back based on the corresponding URL, along the lines of the sketch below.
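
A minimal sketch of that write-back, assuming the URLs start in H9 of 'TotalList' and the item # and price belong two and three columns to the right (adjust the offsets to your table):

function scrapeIntoTable() {
    var sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName('TotalList');
    var firstRow = 9;
    var urls = sheet.getRange(firstRow, 8, sheet.getLastRow() - firstRow + 1, 1)
        .getValues()
        .flat();
    urls.forEach(function(url, i) {
        if (url === '') return; // skip blank rows
        var urlCell = sheet.getRange(firstRow + i, 8); // column H
        // offset(rowOffset, columnOffset) is relative to the URL cell
        urlCell.offset(0, 2).setValue(getItemNumber(url));
        urlCell.offset(0, 3).setValue(getData(url).replace('$', '').replace(/\s/g, ''));
    });
}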

I've added a screenshot of the Google Sheet TotalList.

Hope this is enough information. Thanks again.

Upvotes: 1

Views: 83

Answers (1)

Emel

Reputation: 2452

By iterating over the URLs column, in this case column H, you can easily reuse the code from priceScraper.

This can be achieved in many ways; here is one approach.

Sample code

const loopedScraper = () => {
  const ss = SpreadsheetApp.openById(ID_SHEET) // ID_SHEET = your spreadsheet's ID
  const sheet = ss.getSheetByName('TotalList')
  // Get the list of all URLs, filtering out the empty cells.
  const listUrl = sheet.getRange('H1:H' + sheet.getLastRow())
    .getValues()
    .flat() // getValues() returns a two-dimensional array
    .filter(i => i != "") // clear empty ones
  const out = ss.getSheetByName('Graingerscrape')
  // Iterate over them, reusing your getData and getItemNumber functions.
  for (let url of listUrl) {
    let price = getData(url).replace("$", "").replace(/\s/g, "")
    let name = getItemNumber(url)
    out.appendRow([name, new Date(), price])
  }
}
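
If the list grows into the hundreds, calling appendRow() once per URL can get slow. A variant of the same loop (a sketch, reusing the getData and getItemNumber helpers above) that builds every row in memory and writes them with a single setValues() call:

const loopedScraperBatched = () => {
  const ss = SpreadsheetApp.openById(ID_SHEET)
  const sheet = ss.getSheetByName('TotalList')
  const listUrl = sheet.getRange('H1:H' + sheet.getLastRow())
    .getValues()
    .flat()
    .filter(i => i != "")
  // Build all output rows in memory first...
  const rows = listUrl.map(url => [
    getItemNumber(url),
    new Date(),
    getData(url).replace("$", "").replace(/\s/g, "")
  ])
  // ...then append them in one call, which is much faster than
  // one appendRow() per URL.
  const out = ss.getSheetByName('Graingerscrape')
  if (rows.length) {
    out.getRange(out.getLastRow() + 1, 1, rows.length, rows[0].length).setValues(rows)
  }
}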

Upvotes: 0
