Reputation: 1
I'm crawling a large number of URLs (20,000), in order to scrape some html from each page. However my code is slow because urlfetchapp is wrapped within a for loop. To overcome the 30 minute execution limit, I thought about using urlfetchapp.fetchAll to pass over 100 urls at a time in order to speed up execution and then create an array of the returned html code and then lookup the substrings within them.
The function urlReferrer is the initial code, and test (below) is what I am working on in order to speed up the run time
However (in function - test) urlFetchApp.fetchAll doesn't appear to return an array of getContentText for each url, it only returns getContentText for the first url in the range. Any idea on how to fix this?
function urlReferrer() {
var urlFetchOptions = {muteHttpExceptions: true};
var ss = SpreadsheetApp.getActiveSpreadsheet();
var importSheet = ss.getSheetByName("deduplicated");
var importSheetLastRow = importSheet.getLastRow();
var importRange = importSheet.getRange(2,1,importSheetLastRow-1,1).getValues();
// crawls urls
for(var z = 0; z < importRange.length; z++) {
try {
var response = UrlFetchApp.fetch(importRange[z],urlFetchOptions);
}
catch(e) {
Logger.log(e);
}
// if response code is equal to 200
if (response.getResponseCode() == 200) {
var html = response.getContentText();
}
// get first string. Split string if true, else continue
var string1 = html.split('@type":"ListItem","position":2,"item":{"@type":"Thing","@id":"')[1];
if( string1 == null) {
categoryURLs.push(["invalid html"]);
continue;
}
// Logger.log(string1);
var string2 = string1.split('</script><!-- HTML_TAG_END -->')[0];
// split the string depending on whether there are 3 or 4 breadcrumbs
// if contains position 5, then 4, then 3, then 2
if (string2.indexOf("\x22position\x22:6") >= 0) {
// Logger.log("yes");
var subString = string2.substring(
string2.indexOf('"position":6,"item":{"@type":"Thing","@id":"') + 44,
string2.lastIndexOf('","name'));
// Logger.log(subString);
categoryURLs.push([subString]);
} else if (string2.indexOf("\x22position\x22:5") >= 0) {
// Logger.log("yes");
var subString = string2.substring(
string2.indexOf('"position":5,"item":{"@type":"Thing","@id":"') + 44,
string2.lastIndexOf('","name'));
// Logger.log(subString);
categoryURLs.push([subString]);
} else if (string2.indexOf("\x22position\x22:4") >= 0) {
// Logger.log("yes");
var subString = string2.substring(
string2.indexOf('"position":4,"item":{"@type":"Thing","@id":"') + 44,
string2.lastIndexOf('","name'));
// Logger.log(subString);
categoryURLs.push([subString]);
} else if (string2.indexOf("\x22position\x22:3") >= 0) {
// Logger.log("yes");
var subString = string2.substring(
string2.indexOf('"position":3,"item":{"@type":"Thing","@id":"') + 44,
string2.lastIndexOf('","name'));
// Logger.log(subString);
categoryURLs.push([subString]);
} else {
var subString = string2.substring(
string2.indexOf('"http"'),
string2.lastIndexOf('","name'));
// Logger.log(subString);
categoryURLs.push([subString]);
}
}
Logger.log(categoryURLs);
var range = importSheet.getRange(2, 2, categoryURLs.length,1);
range.setValues(categoryURLs);
}
function test () {
var ss = SpreadsheetApp.getActiveSpreadsheet();
var importSheet = ss.getSheetByName("deduplicated");
var importSheetLastRow = importSheet.getLastRow();
var importRange = importSheet.getRange(2,1,importSheetLastRow-1,1).getValues();
var merged = [].concat.apply([], importRange);
console.log(merged);
// fetch all urls
var responses = UrlFetchApp.fetchAll(merged);
var responseMapped = JSON.stringify(responses.map(function(e) {return e.getContentText()}));
console.log(responseMapped[0]);
}
Upvotes: 0
Views: 236
Reputation: 14537
As a guess. Try to change this line:
var responseMapped = JSON.stringify(responses.map(function(e) {return e.getContentText()}));
with:
var responseMapped = responses.map(e => e.getContentText());
JSON.stringify
returns a string. But you want to get an array. So I think you don't want to convert the array into a string.
Upvotes: 2