user14067569
user14067569

Reputation:

Recursively iterate to validate pattern

I am trying to do regex validation on a string to ensure it matches a pattern.

Pattern is <n>...</n> however my recursive validation breaks as we go deeper into the recursive tree.

var is_valid = true;

function is_string_valid(input_string) {
  var regex_tag_set = /\<([0-9])+\>.*?\<\/\1+\>|\{[a-zA-z0-9\_]*?\}/g;
  var regex_tags = /^(\<[0-9]+\>)|<\/?([0-9]+)>(?!.*<\/?[0-9]+>)/g;
  var regex_tag_single = /\<\/?[0-9]+\>/g;
  console.log("start function");
  console.log(input_string);
  
  var matches = input_string.match(regex_tag_set);
  var matches_count = (matches || []).length;
  console.log(matches)
  console.log(matches_count)

  if (matches_count > 0) {
    for (var i = 0; i < matches_count; i++) {
      if (is_valid) {
        matches[i] = matches[i].replace(regex_tags, "");
        console.log(matches[i]);
        is_string_valid(matches[i]);
      }
    }
  }
  else {
    var final_matches = input_string.match(regex_tag_single);
    var final_count = (final_matches || []).length;
    if (final_count > 0) {
      console.log("Invalid; internal tags do not match <n>...</n> pattern."); //this is not being checked on nests.
      is_valid = false;
    }
    return is_valid;
  }
  return is_valid;
}

var str = "Life <0>is <1>amazingly <2>beau<3>tif</3>ul</2></1></0> and <3>progra<4>mm</4>ing</3> is <4>fun</4>.";
console.log(is_string_valid(str));

var str = "Life <0>is <1>amazi</3>ngly <2>beau<3>tif</3>ul</2></1></0> and <3>progra<4>mm</4>ing</3> is <4>fun</4>.";
console.log(is_string_valid(str));

More Detail: I'm creating a custom tag parser in Javascript.

the pattern is <n>...</n>.

Examples of strings:

//This will be invalid because it doesn't follow patter in the nest.
Life <0>is </1>amazingly beautiful<1></0> and <2>programming</2> is <3>fun</3>.

//This will be invalid because it was an close tag nested.
Life <0>is <1>amazi</3>ngly <2>beau<3>tif</3>ul</2></1></0> and <3>progra<4>mm</4>ing</3> is <4>fun</4>.

//This should be valid
Life <0>is <1>amazingly beautiful</1></0> and <2>programming</2> is <3>fun</3>.

//This should be valid
Life <0>is <1>amazingly <2>beautiful</2></1></0> and <3>programming</3> is <4>fun</4>.

//This should be valid
Life <0>is <1>amazingly <2>beautiful</2></1></0> and <3>progr<4>amming</4></3> is <4>fun</4>.

Upvotes: 1

Views: 212

Answers (1)

georg
georg

Reputation: 214949

One useful technique is to have a regexp <start-tag> no tags <end-tag> and eliminate it in a loop until there are no more matches. Then test if what's left contains any tags:

function test(s) {
    let re = /<(\d+)>[^<>]*<\/\1>/g

    while (true) {
        let s2 = s.replace(re, '');
        if (s2 === s)
            return !/[<>]/.test(s)
        s = s2;
    }
}

//

strs = [
    `Life <0>is </1>amazingly beautiful<1></0> and <2>programming</2> is <3>fun</3>.`,
    `Life <0>is <1>amazi</3>ngly <2>beau<3>tif</3>ul</2></1></0> and <3>progra<4>mm</4>ing</3> is <4>fun</4>.`,
    `Life <0>is <1>amazingly beautiful</1></0> and <2>programming</2> is <3>fun</3>.`,
    `Life <0>is <1>amazingly <2>beautiful</2></1></0> and <3>programming</3> is <4>fun</4>.`,
    `Life <0>is <1>amazingly <2>beautiful</2></1></0> and <3>progr<4>amming</4></3> is <4>fun</4>.`,
]




console.log(...strs.map(test))

That said, a stack-based parser would be more readable and robust:

let lex = String.raw`
    < (?<start> \w+) >
    |
    </ (?<end> \w+) >
    |
    .
`.replace(/\s+/g, '')

function test2(s) {
    let stack = [];

    for (let m of s.matchAll(lex)) {
        let g = m.groups;

        if (g.start)
            stack.push(g.start)
        else if (g.end && stack.pop() !== g.end)
            return false
    }

    return stack.length === 0
}

With two kinds of tags, for example, <n> and {...}, this is more complicated. You cannot simply OR them together in the 'cleaner' regexp, like

 /<(\d+)> [^<>]* <\/\1> | { [^{}]* }/  // wrong!

because it would accept improperly nested things:

 fail <0> { fail </0> fail }  // accepted??

You'll have to do the cleanup in two steps: first, remove everything that doesn't look like markup, second, repeat the cleanup loop, but this time with a 'hollow' regexp <begin><end>|<begin2><end2>

Again, a stack-based option would probably work better, I've adapted it to ES3 (I hope).

function test_cleanup(s) {

    let filter = /(<\/?\d+>)|([{}])|./g

    s = s.replace(filter, function (_, tag, paren) {
        return tag|| paren || '';
    });

    let bracket = /<(\d+)><\/\1>|{}/g

    while (true) {
        let s2 = s.replace(bracket, '');
        if (s2 === s)
            return s.length === 0;
        s = s2;
    }
}

function test_stack(s) {

    let lex = /<(\d+)>|<\/(\d+)>|([{}])|./g;
    let stack = [];
    let valid = true;

    s.replace(lex, function (_, open, close, paren) {
        if (!valid)
            return
        if (open)
            return stack.push(open)
        if (close && stack.pop() !== close)
            return valid = false;
        if (paren === '{')
            return stack.push(paren)
        if (paren === '}' && stack.pop() !== '{')
            return valid = false
    });

    return valid && stack.length === 0;
}


strs = [
    'fail <0> fail </1>',
    'fail { fail {',
    'fail <0> { fail } </0> fail<1>',
    'fail <0> { fail </0> fail }',
    '{ok <0> { ok } </0> ok }',
    'ok <0> { ok <1> ok <2> { ok { ok} ok } </2> ok </1> ok } </0>',
]


console.log(...strs.map(test_cleanup))
console.log(...strs.map(test_stack))

Upvotes: 1

Related Questions