Jake Guy
Jake Guy

Reputation: 11

Counting Words Between Two Variable Strings

Total newbie + first time poster here with very little experience though I feel this problem is one I could solve with the help of some generous strangers.

I am querying a GDoc and attempting to create a function to count words between two strings for two possible end strings, for example:

Example #1

Definitive Title

*Count these words*

===============

OR Example #2

Definitive Title

*Count these words*

Other words that are in a table

Definitive Title
    
*Count these other different words*
    
===============

In both of the above examples I am looking to count the words between a pre-defined string and an end string. If I ran the function that I am trying to create on Example #1, I would hope it returns 3 words. For Example #2, I would hope that my function returns 8 words.

So far my function looks like this:

/**
 * Web-app POST handler.
 *
 * Reads the `docUrl` request parameter, opens that document, splits its body
 * text on single spaces and replies with the number of resulting pieces as
 * plain text.
 *
 * @param {Object} e Request event; `e.parameter.docUrl` is the document URL.
 * @return {Object} The text output produced by ContentService.
 */
function doPost(e) {
  const { docUrl } = e.parameter;
  const documentBody = DocumentApp.openByUrl(docUrl).getBody();
  const pieces = documentBody.getText().split(" ");
  const reply = ContentService.createTextOutput(String(pieces.length));
  return reply.setMimeType(ContentService.MimeType.TEXT);
}

This returns a word count for the entire document. Any advice to point me in the right direction?

Upvotes: 1

Views: 105

Answers (4)

xGeo
xGeo

Reputation: 2139

As @Rishabh K said in his answer, you should definitely replace trailing spaces and multiple spaces to avoid inaccurate results.

However on the other hand, I don't think it answers the OP's question. Correct me if I'm wrong but I think this is what you want:

// Sample 1: one start/end identifier pair, followed by text after the end
// identifier that must not show up in the result.
var sample1 = `This is the start identifier

These words should be included

As well As these ones

Even this

Until it ends
now
Ending identifier

These words shouldn't be included

If any of these appears, the logic is wrong`;

// Sample 2: sample 1 plus a second, complete start/end section.
var sample2 = sample1 + `

This is the start identifier
    
These some few words

should also be included in the result set

Ending identifier`;

// Sample 3: sample 2 plus a start identifier that has no matching end
// identifier, so its trailing words are expected to be left out.
var sample3 = sample2 + `
This is the start identifier

Although we have the start identifier above
These words shouldn't be included

because there is no corresponding end identifier`;

/**
 * Collects the words that lie between each start marker and the end marker
 * that follows it.
 *
 * The text is scanned once, left to right. A start marker opens a section; the
 * next end marker closes it and its words are added to the result. A section
 * that is never closed contributes nothing. A start marker that appears while
 * a section is already open is skipped (its own text is not reported and the
 * words around it are reported once).
 *
 * Markers are matched literally, so characters such as `(`, `[` or `.` in them
 * have no special meaning.
 *
 * @param {string} source Text to scan.
 * @param {string} str1 Start marker.
 * @param {string} str2 End marker. A run of two or more `=` characters always
 *   counts as an end marker as well.
 * @returns {string[]} Whitespace-separated words from every closed section, in
 *   the order they appear in `source`.
 * @throws {TypeError} If any of the three arguments is not a string.
 */
function getWordDiffBetween(source, str1, str2) {
  [source, str1, str2].forEach(function (value, idx) {
    if (typeof value !== 'string') {
      throw new TypeError(`Argument ${idx + 1} is not a string.`);
    }
  });

  // An empty start marker can never open a section.
  if (str1 === '') {
    return [];
  }

  // Markers are plain text, not patterns: neutralise regex metacharacters.
  const escapeForRegExp = function (text) {
    return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  };

  // One combined pattern, scanned in a single pass. Only the start marker is
  // captured, so match[1] tells the two kinds of marker apart. Doing this in
  // one pass (instead of inserting sentinel strings and re-scanning) keeps the
  // `={2,}` alternative from ever matching text that we inserted ourselves.
  const alternatives = ['(' + escapeForRegExp(str1) + ')'];
  if (str2 !== '') {
    alternatives.push(escapeForRegExp(str2));
  }
  alternatives.push('={2,}');
  const markerPattern = new RegExp(alternatives.join('|'), 'g');

  let results = [];
  let pending = []; // words of the section that is currently open
  let resumeAt = -1; // offset of the open section's unread text; -1 = no open section
  let match;

  while ((match = markerPattern.exec(source)) !== null) {
    if (resumeAt !== -1) {
      const words = source.slice(resumeAt, match.index).match(/\S+/g) || [];
      pending = pending.concat(words);
    }

    if (match[1] !== undefined) {
      // Start marker: open a section, or keep reading past a repeated marker.
      resumeAt = markerPattern.lastIndex;
    } else if (resumeAt !== -1) {
      // End marker closing an open section: commit its words.
      results = results.concat(pending);
      pending = [];
      resumeAt = -1;
    }
  }

  // Anything still pending belongs to a section without an end marker.
  return results;
}

/**
 * Lists every position in `arr` whose entry is strictly equal (`===`) to `val`.
 *
 * @param {Array} arr Indexed collection to search.
 * @param {*} val Value to look for.
 * @returns {number[]} Matching positions in ascending order; empty when none match.
 */
function getAllIndexes(arr, val) {
  const positions = Array.from({ length: arr.length }, (_, position) => position);
  return positions.filter((position) => arr[position] === val);
}
// Markers that open and close a section in the sample texts.
var startIdentifier = 'This is the start identifier';
var endIdentifier = 'Ending identifier';

// Run the extraction once per sample, keyed by the sample's name.
var wordResults = {};
wordResults.sample1 = getWordDiffBetween(sample1, startIdentifier, endIdentifier);
wordResults.sample2 = getWordDiffBetween(sample2, startIdentifier, endIdentifier);
// should be equal to sample2
wordResults.sample3 = getWordDiffBetween(sample3, startIdentifier, endIdentifier);

console.log(wordResults);

We have 2 functions - getWordDiffBetween and getAllIndexes. For explanation, check the comments I added in noteworthy lines.

Edit (updated snippet above):

It seems like you also want "====================" included as your end identifier. This can be done by changing the code:

.replace(new RegExp(str2, 'g'), endId) // replace the end identifier with our own

into

.replace(new RegExp(str2 + '|={2,}', 'g'), endId) // replace the end identifier with our own

which means: match an occurrence of your <end string>, or a run of 2 or more = characters. You can also change the number 2 in {2,} to your desired minimum count.

Upvotes: 0

Jake Guy
Jake Guy

Reputation: 11

The below code seems to have worked! Was able to sit down with someone and solve it with them:

/**
 * Web-app POST handler.
 *
 * Passes the `docUrl` request parameter to countScenario2 and replies with the
 * resulting count as plain text. (An earlier version counted every
 * space-separated piece of the whole document instead.)
 *
 * @param {Object} e Request event; `e.parameter.docUrl` is the document URL.
 * @return {Object} The text output produced by ContentService.
 */
function doPost(e) {
  const { docUrl } = e.parameter;
  const total = countScenario2(docUrl);
  const reply = ContentService.createTextOutput(total.toString());
  return reply.setMimeType(ContentService.MimeType.TEXT);
}



/**
 * Counts words in the sections that follow each 'Start Text' match.
 *
 * For every table after the first one returned by body.getTables(), the body
 * children strictly between the current 'Start Text' child and that table are
 * counted, then the search moves on to the next 'Start Text' match. A last
 * section runs from the current 'Start Text' child up to the child holding the
 * long '=' rule. The running total is logged after each section.
 *
 * @param {string} docUrl URL handed to DocumentApp.openByUrl.
 * @return {number} Total number of \w+ runs found in the counted children.
 */
function countScenario2(docUrl) {
  const body = DocumentApp.openByUrl(docUrl).getBody();

  // Number of \w+ runs in the children strictly between two child indexes.
  const wordsBetween = (fromIndex, toIndex) => {
    let subtotal = 0;
    for (let position = fromIndex + 1; position < toIndex; position += 1) {
      const found = body.getChild(position).getText().match(/\b(\w+)\b/g);
      if (found) {
        subtotal += found.length;
      }
    }
    return subtotal;
  };

  let marker = body.findText('Start Text');
  let sectionStart = getIndex('Start Text', body);
  const tables = body.getTables();
  let total = 0;

  // The first table is deliberately skipped, exactly as the loop starting at 1 did.
  for (const table of tables.slice(1)) {
    const sectionEnd = body.getChildIndex(table);
    total += wordsBetween(sectionStart, sectionEnd);
    console.log(total);

    // Advance to the next 'Start Text' match and restart from its parent child.
    marker = body.findText('Start Text', marker);
    sectionStart = body.getChildIndex(marker.getElement().getParent());
  }

  const ruleIndex = getIndex('=========================================================', body);
  total += wordsBetween(sectionStart, ruleIndex);
  console.log(total);

  return total;
}


/**
 * Looks `keyword` up in the body and reports which body child contains the match.
 *
 * @param {string} keyword The text to be found
 * @param {Body} body The body of the document to search
 * @param {RangeElement} [previous] Optional earlier match, forwarded to
 *   body.findText as its second argument
 * @return {number} Result of body.getChildIndex for the parent of the matched element
 */
function getIndex(keyword, body, previous) {
  const hit = body.findText(keyword, previous);
  const container = hit.getElement().getParent();
  return body.getChildIndex(container);
}

/************ */


  /**
   * Manual check: calls doPost with a hand-built event and logs the returned
   * content as JSON.
   */
  function testPost() {
    const fakeEvent = { parameter: { docUrl: 'https://docs.google.com/document/d/' } };
    const response = doPost(fakeEvent);
    const serialized = JSON.stringify(response.getContent());
    console.log(serialized);
  }


/**
 * Counts the words in the body children located strictly between the child
 * holding the first 'Start Text' match and the child holding the long '=' rule.
 * The total is logged before it is returned.
 *
 * @param {string} docUrl URL handed to DocumentApp.openByUrl.
 * @return {number} Total number of \w+ runs found in those children.
 */
function countScenario1(docUrl) {
  const body = DocumentApp.openByUrl(docUrl).getBody();
  const startIndex = getIndex('Start Text', body);
  const endIndex = getIndex('=========================================================', body);

  let total = 0;
  let position = startIndex + 1;
  while (position < endIndex) {
    const words = body.getChild(position).getText().match(/\b(\w+)\b/g);
    if (words) {
      total += words.length;
    }
    position += 1;
  }

  console.log(total);
  return total;
}

/**
 * Manual check: runs countScenario2 against the placeholder document URL and
 * logs the count it returns.
 */
function test() {
  const total = countScenario2('https://docs.google.com/document/d/');
  console.log(total);
}

Upvotes: 0

Caleb Gross
Caleb Gross

Reputation: 385

Here is a solution to your problem: you can log the difference in characters and words, or you can log the total number of words or characters in the two sentences. You will also want to pass the longer sentence first; otherwise it will give you a negative number.

// Longer sample sentence (passed first so the differences come out positive).
var x = 'count these words';
// Shorter sample sentence.
var y = 'count words';

/**
 * Logs two numbers for a pair of strings: first the length of `word1` minus
 * the length of `word2`, then the sum of both lengths.
 *
 * @param {string} word1 First string.
 * @param {string} word2 Second string.
 */
function findCharDif(word1, word2) {
  const [firstLength, secondLength] = [word1.length, word2.length];
  console.log(firstLength - secondLength);
  console.log(firstLength + secondLength);
}

/**
 * Logs two numbers for a pair of sentences: first the word count of
 * `sentence1` minus the word count of `sentence2`, then the sum of both counts.
 *
 * A word is a run of non-whitespace characters, so repeated, leading or
 * trailing whitespace does not inflate the count and an empty sentence has
 * zero words (counting spaces and adding one got both of these wrong).
 *
 * @param {string} sentence1 First sentence.
 * @param {string} sentence2 Second sentence.
 */
function findWordDif(sentence1, sentence2) {
  // match() returns null when there are no words at all.
  const countWords = (sentence) => (sentence.match(/\S+/g) || []).length;

  const words1 = countWords(sentence1);
  const words2 = countWords(sentence2);

  const difference = words1 - words2; // difference of words between the sentences
  const totalWords = words1 + words2; // total amount of words

  console.log(difference);
  console.log(totalWords);
}

// Demo: log the character-length difference and total for the two sample strings.
findCharDif(x, y);

// Demo: log the word-count difference and total for the same two strings.
findWordDif(x, y);


 

Upvotes: 0

Rishabh K
Rishabh K

Reputation: 21

For a more dynamic, appropriate and accurate solution, execute the following snippet before the split() function. Regular expressions are often used to provide dynamic solutions. It is a must-have skill.

text = text.trim();  // remove the start and end whitespace of the string
text = text.replace(/[ ]{2,}/g, " ");  // collapse every run of two or more spaces into a single space
text = text.replace(/\n /g, "\n");  // drop the space that directly follows a newline, on every line (the g flag is required for that)
wordCount = text === "" ? 0 : text.split(/\s+/).length;  // split on any whitespace so words on separate lines are counted separately; empty text has 0 words

Upvotes: 2

Related Questions