petomalina
petomalina

Reputation: 2150

Getting line number from index of character in file

I have a string input which consists of words. I am using regex.exec (g) to get all the words by function getWord(input)

So my input may look like this: word word2 someword blah

What I get from exec is an object containing the index of the match. So it is an array like: [ 'word', index: 0, input: "..."] ... [ 'someword', index: 11, input: "..."] ...

What I need is to easily calculate that word "someword" is on line 2 by using the index(11) (as I don't have any other value telling me what is the number of lines)

Here is what I came up with: match '\n's until you match a \n with a higher index than the index of the word. I am not sure whether this could be problematic in a 10k-line file.

Snippet for idea:

# Returns the 1-based line number of the character at `index` in `input`.
# Walks the "\n" matches from the start of `input` and counts each one
# until a match is found past `index` or no match is left.
getLineFromIndex: (index, input) ->
  newlinePattern = /\n/g
  lineNumber = 1

  hit = newlinePattern.exec(input)
  until not hit? or hit.index > index
    lineNumber += 1
    hit = newlinePattern.exec(input)

  lineNumber

A fairly big optimization can be done here. I can save the regex and the last match, so I won't iterate over the whole input every time I want to check for a line number. The regex will then be executed only when the last match has a lower index than the current index.

This is the final idea with optimization:

  ###
    Returns the 1-based line number of the character at `index` in @content.

    The newline regex, the last newline match and the current line number are
    cached on the instance, so the content is scanned only once across calls.
    Because the cursor only moves forward, `index` values must be passed in
    non-decreasing order.

    @variable content [String] is input content
    @param index [Number] character offset into @content
    @return [Number] 1-based line number containing `index`
  ###
  getLineFromIndex: (index) ->
    @lineMatcher = @lineMatcher || /\n/g
    @lastLine = @lastLine || 1

    # Prime the cursor with the first line break. Content without any line
    # break is a single line, so flag end-of-input instead of keeping a null match.
    if @eof isnt true and not @lastMatch?
      @lastMatch = @lineMatcher.exec(@content)
      @eof = true unless @lastMatch?

    # Step over every line break located at or before `index`, not just one,
    # so an index that is several lines ahead still gets the right line.
    until @eof or index < @lastMatch.index
      @lastLine++
      match = @lineMatcher.exec(@content)
      if match is null
        @eof = true
      else
        @lastMatch = match

    return @lastLine

Upvotes: 5

Views: 3632

Answers (3)

peterjwest
peterjwest

Reputation: 4452

The original proposed solution by gwer is this:

/**
 * Returns the 1-based line number of the character at `index` in `text`.
 * Cuts the text at `index` and counts the "\n"-separated pieces before it.
 *
 * @param {string} text - Full input text.
 * @param {number} index - Character offset into `text`.
 * @returns {number} Line number, starting at 1.
 */
function getLineNumber(text, index) {
  const precedingText = text.slice(0, index);
  const precedingLines = precedingText.split('\n');
  return precedingLines.length;
}

However there are faster solutions:

/**
 * Returns the 1-based line number of the character at `index` in `text`
 * by counting the "\n" characters that occur before `index`.
 *
 * @param {string} text - Full input text.
 * @param {number} index - Character offset into `text`.
 * @returns {number} Line number, starting at 1.
 */
function getLineNumber(text, index) {
  const newlines = text.slice(0, index).match(/\n/g);
  if (newlines === null) {
    // No line break before `index`: still on the first line.
    return 1;
  }
  return newlines.length + 1;
}

This one is the fastest according to my limited benchmarking, and should also use the least memory since it does not manipulate the input text at all:

/**
 * Returns the 1-based line number of the character at `index` in `text`.
 * Scans the characters before `index` one by one and counts the "\n" ones,
 * without creating any intermediate string or array.
 *
 * @param {string} text - Full input text.
 * @param {number} index - Character offset into `text`.
 * @returns {number} Line number, starting at 1.
 */
function getLineNumberB(text, index) {
  let lineNumber = 1;
  let position = 0;
  while (position < index) {
    lineNumber += text[position] === '\n' ? 1 : 0;
    position += 1;
  }
  return lineNumber;
}

If you want to cope with different possible line endings, you can either preprocess the text (recommended):

// Rewrite every CRLF pair and every lone CR as a single LF, so that the
// "\n"-only line counters see exactly one "\n" per line break.
text = text.replace(/\r\n|\r/g, '\n');

Or you can use these, more complex, solutions:

/**
 * Returns the 1-based line number of the character at `index` in `text`,
 * treating "\r\n", a lone "\r" and a lone "\n" each as one line break.
 *
 * @param {string} text - Full input text.
 * @param {number} index - Character offset into `text`.
 * @returns {number} Line number, starting at 1.
 */
function getLineNumber(text, index) {
  const head = text.slice(0, index);
  const lineBreaks = head.match(/\r\n|\r|\n/g);
  return lineBreaks === null ? 1 : lineBreaks.length + 1;
}
/**
 * Returns the 1-based line number of the character at `index` in `text`,
 * treating "\r\n", a lone "\r" and a lone "\n" each as one line break.
 * Scans character by character without building intermediate strings.
 *
 * @param {string} text - Full input text.
 * @param {number} index - Character offset into `text`.
 * @returns {number} Line number, starting at 1.
 */
function getLineNumber(text, index) {
  let lineCount = 1;
  let pos = 0;
  while (pos < index) {
    const ch = text[pos];
    if (ch === '\r') {
      // Swallow the "\n" of a "\r\n" pair so the pair counts only once.
      if (text[pos + 1] === '\n') {
        pos += 1;
      }
      lineCount += 1;
    } else if (ch === '\n') {
      lineCount += 1;
    }
    pos += 1;
  }
  return lineCount;
}

Upvotes: 0

Gaël Barbin
Gaël Barbin

Reputation: 3939

Your pseudo-code seems to do the job. But I do not see how you can infer the line number by the offset of the searched word. I would split the input text by lines, then look over the array for the searched word, and if found return the line index.

// Sample text: two lines, with the searched word on the second one.
const input = 'word word2 \n' + 'someword blah';

/**
 * Collects the 0-based indexes of every line of `input` that contains `word`.
 *
 * @param {string} input - Text to search, with lines separated by "\n".
 * @param {string} word - Substring to look for in each line.
 * @returns {number[]} Indexes (starting at 0) of the matching lines, in order.
 */
function getLinesNumberOf(input, word) {
  const lines = input.split('\n');
  const matchingLines = [];
  for (let lineIndex = 0; lineIndex < lines.length; lineIndex += 1) {
    if (lines[lineIndex].indexOf(word) !== -1) {
      matchingLines.push(lineIndex);
    }
  }
  return matchingLines;
}

console.log(getLinesNumberOf(input, 'someword'));

I have added support for multiple occurrences of the searched word.

edit

To avoid excessive memory consumption with large inputs, you can parse the text sequentially (for the same advantages that SAX has over DOM):

/**
 * Finds every line of `input` that contains `word`, scanning the text line by
 * line instead of splitting it into an array of all lines up front.
 *
 * The resulting list is logged to the console and also returned.
 *
 * @param {string} word - Substring to look for in each line.
 * @param {string} input - Text to search, with lines separated by "\n".
 * @returns {number[]} Indexes (starting at 0) of the matching lines, in order.
 */
function getLinesNumberOf(word, input) {
  const lineNumbers = [];
  let currentLine = 0;
  let lineStart = 0;

  // `<=` so that the (possibly empty) line after the final "\n" is visited too.
  while (lineStart <= input.length) {
    // Offset of the next line break; the last line ends at the end of the input.
    const newlineOffset = input.indexOf('\n', lineStart);
    const lineEnd = newlineOffset === -1 ? input.length : newlineOffset;

    // Only the current line is searched, so a hit always belongs to currentLine.
    if (input.substring(lineStart, lineEnd).includes(word)) {
      lineNumbers.push(currentLine);
    }

    // The next line starts just after the line break.
    lineStart = lineEnd + 1;
    currentLine += 1;
  }

  console.log(lineNumbers);
  return lineNumbers;
}

Upvotes: 0

gwer
gwer

Reputation: 614

  1. Cut input (a.substr(0, 11)).
  2. Split it (a.substr(0, 11).split('\n')).
  3. Count it (a.substr(0, 11).split('\n').length).

Upvotes: 3

Related Questions