Reputation: 66488

Split text into equal length strings keeping words intact

I have this code that breaks longer lines into an array of equal-length strings while keeping words intact. It also takes into account formatting like [[u;#fff;]some text], and it splits the text so that each string can be converted into HTML independently:

// Matches one formatting span such as [[u;#fff;]some text]. Group 1 is the
// format specification found between "[[" and the first "]"; group 2 is the
// text that follows it. The final "]" is optional, so spans that are not yet
// closed also match. Global and case-insensitive.
var format_re = /\[\[([!gbiuso]*;[^;\]]*;[^;\]]*(?:;|[^\]()]*);?[^\]]*)\]([^\]]*\\\][^\]]*|[^\]]*|[^\[]*\[[^\]]*)\]?/gi;
// Captures only the opening part of a formatting span: "[[" up to and
// including the "]" that ends the format specification.
var format_begin_re = /(\[\[[!gbiuso]*;[^;]*;[^\]]*\])/i;
// Matches an opening part of a formatting span (its "]" is optional) that
// sits at the very end of the string, i.e. an opening with no text after it.
var format_last_re = /\[\[[!gbiuso]*;[^;]*;[^\]]*\]?$/i;
/**
 * Splits `str` into chunks whose counted (visible) text aims to be `length`
 * characters long, re-opening and closing [[...]text] formatting so that
 * every chunk is a self-contained formatted string.
 *
 * Characters that belong to a format specification are not counted, an html
 * entity contributes one counted character, and the input is first split on
 * newlines (an empty input line produces an empty string in the result).
 *
 * @param {string} str - text that may contain formatting such as
 *   [[u;#fff;]some text] and html entities.
 * @param {number} length - number of counted characters per chunk.
 * @param {boolean} words - when truthy, break at the last space seen instead
 *   of in the middle of a word and trim spaces/&nbsp; from both chunk ends.
 * @returns {string[]} the chunks, in order.
 * @throws {Error} when a "&" is found that is not followed by a
 *   ";"-terminated entity.
 */
$.terminal.split_equal = function(str, length, words) {
  // Parser state. These are declared outside the per-line loop, so their
  // values carry over from one input line to the next.
  // `formatting`: set when "[[" is seen, cleared on the "]" closing the span.
  var formatting = false;
  // `in_text`: set on the "]" ending the format specification, cleared on
  // the "]" that closes the span.
  var in_text = false;
  // `prev_format`: opening "[[...]" that has to be re-applied to the next chunk.
  var prev_format = '';
  var result = [];
  // Append a copy of each span's text to its format specification as the
  // 5th parameter; it's used for the data attribute in the format function.
  var array = str.replace(format_re, function(_, format, text) {
    var semicolons = format.match(/;/g).length;
    // Work out how many semicolons are missing so that the copied text
    // lands in the 5th position of the format specification.
    if (semicolons == 2) {
      semicolons = ';;';
    } else if (semicolons == 3) {
      semicolons = ';';
    } else {
      semicolons = '';
    }
    // return '[[' + format + ']' + text + ']';
    // A closing bracket would break the formatting, so escaped brackets in
    // the copied text are replaced with their html entity equivalent and
    // newlines are written as a literal "\n".
    return '[[' + format + semicolons +
      text.replace(/\\\]/g, '&#93;').replace(/\n/g, '\\n') + ']' +
      text + ']';
  }).split(/\n/g);
  for (var i = 0, len = array.length; i < len; ++i) {
    // Empty input lines are preserved as empty chunks.
    if (array[i] === '') {
      result.push('');
      continue;
    }
    var line = array[i];
    // Index in `line` where the chunk currently being built starts.
    var first_index = 0;
    // Number of counted characters in the chunk currently being built.
    var count = 0;
    // Index just after the last space seen in counted text, or -1 if none.
    var space = -1;
    for (var j=0, jlen=line.length; j<jlen; ++j) {
      if (line[j] === '[' && line[j+1] === '[') {
        // Start of a formatting span.
        formatting = true;
      } else if (formatting && line[j] === ']') {
        // The first "]" ends the format specification (text begins); the
        // second "]" closes the whole span.
        if (in_text) {
          formatting = false;
          in_text = false;
        } else {
          in_text = true;
        }
      } else if ((formatting && in_text) || !formatting) {
        // Only characters of plain text or of a span's text are counted.
        if (line[j] === '&') { // treat entity as one character
          var m = line.substring(j).match(/^(&[^;]+;)/);
          if (!m) {
            // should never happen if used by terminal,
            // because it always calls $.terminal.encode
            // before this function
            throw new Error("Unclosed html entity in line " +
                            (i+1) + ' at char ' + (j+1));
          }
          j+=m[1].length-2; // because continue adds 1 to j
          // if entity is at the end there is no next loop
          // issue #77
          // NOTE(review): after the adjustment above j is at most jlen-2,
          // so this condition looks unreachable; `output` is also only
          // assigned further down in the loop body - verify.
          if (j === jlen-1) {
            result.push(output + m[1]);
          }
          continue;
        } else if (line[j] === ']' && line[j-1] === '\\') {
          // escape \] counts as one character
          --count;
        } else {
          ++count;
        }
      }
      // True when the text right before index j is "&nbsp;" or a space.
      function is_space() {
        return line.substring(j-6, j) == '&nbsp;' ||
          line.substring(j-1, j) == ' ';
      }
      // Remember the latest break candidate found in counted text.
      if (is_space() && ((formatting && in_text) || !formatting)) {
        space = j;
      }
      // Emit a chunk once enough characters were counted or the line ends,
      // but only while positioned in counted text.
      if ((count === length || j === jlen-1) &&
          ((formatting && in_text) || !formatting)) {
        // `var` is function scoped, so this declaration does not reset the
        // value left over from a previous iteration.
        var output;
        // Look at the text from the last space up to one chunk ahead;
        // $ (presumably jQuery) is used to reduce it to plain text.
        var after = line.substring(space, j+length+1);
        var text = $('<span>' + after + '</span>').text();
        var can_break = text.match(/\s/);
        if (words && space != -1 && j !== jlen-1 && can_break) {
          // get text to last space
          output = line.substring(first_index, space);
          // Rewind so that scanning resumes at the remembered position.
          j = space-1;
          space = -1;
        } else {
          output = line.substring(first_index, j+1);
        }
        if (words) {
          // Trim spaces and &nbsp; entities from both ends of the chunk.
          output = output.replace(/^(&nbsp;|\s)+|(&nbsp;|\s)+$/g, '');
        }
        first_index = j+1;
        count = 0;
        if (prev_format) {
          // Re-open the formatting left open by the previous chunk; it is
          // forgotten as soon as the chunk contains a "]".
          output = prev_format + output;
          if (output.match(']')) {
            prev_format = '';
          }
        }
        // Fix output if formatting not closed
        var matched = output.match(format_re);
        if (matched) {
          var last = matched[matched.length-1];
          if (last[last.length-1] !== ']') {
            // The last span is still open: remember its opening for the
            // next chunk and close it here.
            prev_format = last.match(format_begin_re)[1];
            output += ']';
          } else if (output.match(format_last_re)) {
            // The chunk ends with an opening that has no text: move that
            // opening to the next chunk.
            // NOTE(review): line_len is not read anywhere in this function.
            var line_len = output.length;
            // why this line ???
            //var f_len = line_len-last[last.length-1].length;
            output = output.replace(format_last_re, '');
            prev_format = last.match(format_begin_re)[1];
          }
        }
        result.push(output);
      }
    }
  }
  return result;
};

It works almost right, but some lines are shorter than they should be, like:

is cracker.The term

in this FIDDLE. It works correctly when you strip the formatting by checking the checkbox. I have worked on this for a couple of hours and have no clue why that line is shorter; any help will be much appreciated.

Upvotes: 7

Answers (3)

João Pimentel Ferreira

Reputation: 16233

The npm package paragraph-builder splits continuous text into so-called paragraphs that are evenly distributed and all approximately the same size in number of words. This concept of a paragraph seems to be what you are looking for.

You can define the number of words for the paragraphs. You can extend the principle of paragraphs to pages, considering that a page has on average approximately the same number of characters, space included.

This paragraph builder node script generates paragraphs from continuous text. It outputs a text wherein the size of each paragraph is approximately the same, providing an even distribution of paragraphs within the text. It doesn't split the text on numbers such as "1.2".

There is an option to define the break character between paragraphs or you can fetch the paragraphs into an array of strings from which you can apply the html tag <p>. Check its documentation for further clarification.

Upvotes: 1

heenenee

Reputation: 20125

Here's how to fix the original code:

Add the following after line 40 (the `formatting = true;` statement that runs when `[[` is found):

in_text = false;

The in_text flag is used by the code to determine if the current position is in regular text. However, it was not clearing the flag when it entered a region of formatting markup. This was the cause of the main issue described in the question with the ultra-short line.

Change the if statement at line 76/77 to:

if (is_space() && ((formatting && in_text) || !formatting || (line[j] === '[' && line[j+1] === '['))) {

This takes care of a lesser problem where line breaks were not happening on spaces between regular text and formatted text.

Working fiddle here: https://jsfiddle.net/2w10xp3m/1/

Upvotes: 5

huysentruitw

Reputation: 28091

I think I've solved the problem using a much simpler approach. First break up all words, then re-assemble the lines while keeping track of the current format. See JsFiddle.

JavaScript

/**
 * Splits `str` into lines of at most `length` visible characters without
 * breaking words, keeping track of [[...]text] formatting so that every
 * produced line is a self-contained formatted string.
 *
 * The input is tokenised on "&nbsp;" only; tokens are joined with a regular
 * space. A single token longer than `length` is never broken and is placed
 * on a line of its own, so such a line can be longer than `length`.
 *
 * @param {string} str - text whose words are separated by "&nbsp;" and that
 *   may contain formatting such as [[u;#fff;]some text].
 * @param {number} length - maximum number of visible characters per line
 *   (formatting markup is not counted, separators are).
 * @param {boolean} words - accepted for signature compatibility; this
 *   implementation always keeps words intact and does not read the value.
 * @returns {string[]} the assembled lines, in order.
 */
$.terminal.split_equal = function(str, length, words) {
  var result = [],
    currentFormat = null,
    currentLine = '',
    currentLineLengthWithoutFormatting = 0;

  // 1. Split words on &nbsp; (kept in its own variable so the `words`
  //    parameter is not overwritten)
  var tokens = str.split(/&nbsp;/g);

  // 2. Re-assemble lines while keeping track of current formats
  tokens.forEach(function(word) {
    // Keep track of current format: a token starting with "[[...]" opens
    // a new format that stays active until a token containing "]" is seen
    var format = word.match(/^\[\[([^\]]+)\]/g),
      wordWithFormatting, wordLength, separatorLength;
    if (format !== null && format[0]) {
      currentFormat = format[0];
      word = word.slice(format[0].length);
    }
    // Apply current format to each word separately
    wordLength = word.length;
    wordWithFormatting = (currentFormat || '') + word;
    if (currentFormat) {
      if (word.indexOf(']') !== -1) {
        // The closing bracket is markup, not a visible character
        wordLength--;
        currentFormat = null;
      } else {
        // Format continues in the next word, so close it for this one
        wordWithFormatting += ']';
      }
    }
    // Assemble line. The separating space is part of the visible line, so
    // it has to be included when checking whether the word still fits.
    separatorLength = currentLineLengthWithoutFormatting > 0 ? 1 : 0;
    if (currentLineLengthWithoutFormatting + separatorLength + wordLength <= length) {
      // Word still fits on current line
      if (separatorLength > 0) {
        currentLine += ' ';
        currentLineLengthWithoutFormatting++;
      }
    } else {
      // Need to start new line; nothing is emitted when the current line
      // is still empty (e.g. the very first word is longer than `length`)
      if (currentLine !== '') {
        result.push(currentLine);
      }
      currentLine = '';
      currentLineLengthWithoutFormatting = 0;
    }

    currentLine += wordWithFormatting;
    currentLineLengthWithoutFormatting += wordLength;
  });

  if (currentLineLengthWithoutFormatting > 0)
    result.push(currentLine);

  return result;
};

Upvotes: 4

Split text into equal length strings keeping words intact

Answers (3)

Related Questions