Reputation:
I am trying to do regex validation on a string to ensure it matches a pattern.
Pattern is <n>...</n>
however my recursive validation breaks as we go deeper into the recursive tree.
var is_valid = true;
function is_string_valid(input_string) {
var regex_tag_set = /\<([0-9])+\>.*?\<\/\1+\>|\{[a-zA-z0-9\_]*?\}/g;
var regex_tags = /^(\<[0-9]+\>)|<\/?([0-9]+)>(?!.*<\/?[0-9]+>)/g;
var regex_tag_single = /\<\/?[0-9]+\>/g;
console.log("start function");
console.log(input_string);
var matches = input_string.match(regex_tag_set);
var matches_count = (matches || []).length;
console.log(matches)
console.log(matches_count)
if (matches_count > 0) {
for (var i = 0; i < matches_count; i++) {
if (is_valid) {
matches[i] = matches[i].replace(regex_tags, "");
console.log(matches[i]);
is_string_valid(matches[i]);
}
}
}
else {
var final_matches = input_string.match(regex_tag_single);
var final_count = (final_matches || []).length;
if (final_count > 0) {
console.log("Invalid; internal tags do not match <n>...</n> pattern."); //this is not being checked on nests.
is_valid = false;
}
return is_valid;
}
return is_valid;
}
var str = "Life <0>is <1>amazingly <2>beau<3>tif</3>ul</2></1></0> and <3>progra<4>mm</4>ing</3> is <4>fun</4>.";
console.log(is_string_valid(str));
var str = "Life <0>is <1>amazi</3>ngly <2>beau<3>tif</3>ul</2></1></0> and <3>progra<4>mm</4>ing</3> is <4>fun</4>.";
console.log(is_string_valid(str));
More Detail: I'm creating a custom tag parser in Javascript.
the pattern is <n>...</n>
.
Examples of strings:
//This will be invalid because it doesn't follow patter in the nest.
Life <0>is </1>amazingly beautiful<1></0> and <2>programming</2> is <3>fun</3>.
//This will be invalid because it was an close tag nested.
Life <0>is <1>amazi</3>ngly <2>beau<3>tif</3>ul</2></1></0> and <3>progra<4>mm</4>ing</3> is <4>fun</4>.
//This should be valid
Life <0>is <1>amazingly beautiful</1></0> and <2>programming</2> is <3>fun</3>.
//This should be valid
Life <0>is <1>amazingly <2>beautiful</2></1></0> and <3>programming</3> is <4>fun</4>.
//This should be valid
Life <0>is <1>amazingly <2>beautiful</2></1></0> and <3>progr<4>amming</4></3> is <4>fun</4>.
Upvotes: 1
Views: 212
Reputation: 214949
One useful technique is to have a regexp <start-tag> no tags <end-tag>
and eliminate it in a loop until there are no more matches. Then test if what's left contains any tags:
function test(s) {
let re = /<(\d+)>[^<>]*<\/\1>/g
while (true) {
let s2 = s.replace(re, '');
if (s2 === s)
return !/[<>]/.test(s)
s = s2;
}
}
//
strs = [
`Life <0>is </1>amazingly beautiful<1></0> and <2>programming</2> is <3>fun</3>.`,
`Life <0>is <1>amazi</3>ngly <2>beau<3>tif</3>ul</2></1></0> and <3>progra<4>mm</4>ing</3> is <4>fun</4>.`,
`Life <0>is <1>amazingly beautiful</1></0> and <2>programming</2> is <3>fun</3>.`,
`Life <0>is <1>amazingly <2>beautiful</2></1></0> and <3>programming</3> is <4>fun</4>.`,
`Life <0>is <1>amazingly <2>beautiful</2></1></0> and <3>progr<4>amming</4></3> is <4>fun</4>.`,
]
console.log(...strs.map(test))
That said, a stack-based parser would be more readable and robust:
let lex = String.raw`
< (?<start> \w+) >
|
</ (?<end> \w+) >
|
.
`.replace(/\s+/g, '')
function test2(s) {
let stack = [];
for (let m of s.matchAll(lex)) {
let g = m.groups;
if (g.start)
stack.push(g.start)
else if (g.end && stack.pop() !== g.end)
return false
}
return stack.length === 0
}
With two kinds of tags, for example, <n>
and {...}
, this is more complicated. You cannot simply OR them together in the 'cleaner' regexp, like
/<(\d+)> [^<>]* <\/\1> | { [^{}]* }/ // wrong!
because it would accept improperly nested things:
fail <0> { fail </0> fail } // accepted??
You'll have to do the cleanup in two steps: first, remove everything that doesn't look like markup, second, repeat the cleanup loop, but this time with a 'hollow' regexp <begin><end>|<begin2><end2>
Again, a stack-based option would probably work better, I've adapted it to ES3 (I hope).
function test_cleanup(s) {
let filter = /(<\/?\d+>)|([{}])|./g
s = s.replace(filter, function (_, tag, paren) {
return tag|| paren || '';
});
let bracket = /<(\d+)><\/\1>|{}/g
while (true) {
let s2 = s.replace(bracket, '');
if (s2 === s)
return s.length === 0;
s = s2;
}
}
function test_stack(s) {
let lex = /<(\d+)>|<\/(\d+)>|([{}])|./g;
let stack = [];
let valid = true;
s.replace(lex, function (_, open, close, paren) {
if (!valid)
return
if (open)
return stack.push(open)
if (close && stack.pop() !== close)
return valid = false;
if (paren === '{')
return stack.push(paren)
if (paren === '}' && stack.pop() !== '{')
return valid = false
});
return valid && stack.length === 0;
}
strs = [
'fail <0> fail </1>',
'fail { fail {',
'fail <0> { fail } </0> fail<1>',
'fail <0> { fail </0> fail }',
'{ok <0> { ok } </0> ok }',
'ok <0> { ok <1> ok <2> { ok { ok} ok } </2> ok </1> ok } </0>',
]
console.log(...strs.map(test_cleanup))
console.log(...strs.map(test_stack))
Upvotes: 1