Reputation: 1302
I'm looking for an algorithm that will allow me to parse inline markdown elements (so strong emphasis between **
or __
, regular emphasis between *
or _
, etc.) that avoids self-nesting these elements.
The CommonMark algorithm for this does not prevent self-nesting, so an input like **some **random** text**
will get interpreted as <strong>some <strong>random</strong> text</strong>
.
Since nesting <strong>
or <em>
tags does not make the text even more bold or italic, I don't think it makes sense to parse the input in this way and would prefer to leave the asterisks that do not contribute to any additional styling, so the output would look like <strong>some **random** text</strong>
.
I know I could fix this in the code generation stage instead of the parsing stage, by letting the AST visitor check if we're already emitting into a <strong>
tag and then based on that deciding to emit asterisks instead, but since underscores can be used as well, the delimiter character used in the source text now also has to be added to the token, and this overall just feels like the wrong thing to do anyway. The text shouldn't have been parsed this way from the beginning.
The issue with the CommonMark algorithm is, that since it uses backtracking to find matching opening delimiters for closing delimiters (instead of the other way around which seems to be more common in other markdown-variant parsers that I have found), it will wrap nodes that were already marked as emphasis earlier into a new emphasis node.
One thought I had was to search for the opening delimiter furthest from the closing delimiter instead of the closest, but that would mean inputs such as **some** random **text**
would get interpreted as <strong>some** random **text</strong>
, effectively allowing only a single emphasis in the whole line of text.
So the question is:
Does there already exists an algorithm that does this? Or is there a way to modify the CommonMark algorithm to prevent this from happening?
Upvotes: 2
Views: 326
Reputation: 1480
This way you can also remove unallowed special characters.
Tokens can be words, special characters or emphasis tags (like **)
The idea behind the below code is:
parsedText
to be returned)And here is a JavaScript implementation of this:
function tokenise(str, specCharsDict) {
const matches = str.matchAll(/[^\w\*\_]/g);
for(const [match] of matches) {
str = str.replace(match, specCharsDict[match] || "&qmk#");
}
const tokenArr = str.split(/(\*{1,2})|(\_{1,2})|(\&\w{3}\#)/).filter(Boolean);
return tokenArr;
}
function parseMarkdown(str) {
const specCharsDict = {
" ": "&spc#",
"!": "&xcl#",
"?": "&qmk#",
".": "&prd#",
",": "&cma#",
"#": "&hsh#",
"&": "&aps#"
// todo - add more allowed characters here, like ":", ";", etc
// unallowed characters are replaced by a "?"
};
const emphasisTags = {
open: {
single: "<em>",
double: "<strong>"
},
close: {
single: "</em>",
double: "</strong>"
}
};
const tokenArr = tokenise(str, specCharsDict);
let parsedText = [];
const tokenDataArr = [];
let emphasisCounter = {
single: 0,
double: 0
};
for (let i = 0; i < tokenArr.length; i++) {
const tokenData = {
token: tokenArr[i],
type: "word",
emphasisType: null, // single (*) or double (**)
emphasisTagType: null,
index: i
};
tokenDataArr.push(tokenData);
if(tokenData.token.includes("*") || tokenData.token.includes("_")) {
tokenData.type = "emphasis";
tokenData.emphasisType = tokenData.token.length === 1 ? "single" : "double";
} else if(tokenData.token.includes("&")) {
tokenData.type = "spec char";
}
switch(tokenData.type) {
case "word":
parsedText.push(tokenData.token);
break;
case "spec char":
const specCharIndex = Object.values(specCharsDict).indexOf(tokenData.token);
parsedText.push(Object.keys(specCharsDict)[specCharIndex]);
break;
case "emphasis":
const prevTokenData = tokenDataArr[i-1];
const prevWasSpace = prevTokenData === undefined || prevTokenData.token === specCharsDict[" "];
if(prevWasSpace) {
tokenData.emphasisTagType = "opening";
if(emphasisCounter[tokenData.emphasisType] === 0) {
emphasisCounter[tokenData.emphasisType]++;
parsedText.push(emphasisTags.open[tokenData.emphasisType]);
} else {
parsedText.push(tokenData.token);
emphasisCounter[tokenData.emphasisType]++;
}
} else {
tokenData.emphasisTagType = "closing";
if(emphasisCounter[tokenData.emphasisType] === 1) {
emphasisCounter[tokenData.emphasisType] = 0;
parsedText.push(emphasisTags.close[tokenData.emphasisType]);
} else {
parsedText.push(tokenData.token);
emphasisCounter[tokenData.emphasisType]--;
}
}
break;
}
}
// todo parse emphasisCounter
// if either double or single is not 0, then there is an unclosed or unopened tag
// if < 0, then there's an unopened tag
// if > 0, then there's an unclosed tag
let firstOpenIndex = Infinity;
let numOpened = 0;
for(const emType of ["single", "double"]) {
if(emphasisCounter[emType] < 0) {
for (const tokenData of tokenDataArr) {
if(tokenData.emphasisTagType !== "opening") {
continue;
}
parsedText[tokenData.index] = emphasisTags.open[tokenData.emphasisType];
if(tokenData.index < firstOpenIndex) {
firstOpenIndex = tokenData.index;
}
emphasisCounter[emType]++;
numOpened++;
if(emphasisCounter[emType] === 0) {
break;
}
}
}
if(emphasisCounter[emType] > 0) {
for (let i = tokenDataArr.length-1; i >=0 ; i--) {
const tokenData = tokenDataArr[i];
if(tokenData.emphasisTagType !== "closing") {
continue;
}
parsedText[tokenData.index] = emphasisTags.close[tokenData.emphasisType];
emphasisCounter[emType]--;
if(emphasisCounter[emType] === 0) {
break;
}
}
}
if(firstOpenIndex !== Infinity) {
for (let i = firstOpenIndex; i < tokenDataArr.length; i++) {
const tokenData = tokenDataArr[i];
if(tokenData.emphasisTagType !== "closing") {
continue;
}
parsedText[tokenData.index] = emphasisTags.close[tokenData.emphasisType];
numOpened--;
if(numOpened === 0) {
break;
}
}
}
}
return parsedText.join("");
}
const testStrings = [
"Weak** weak** **strong**",
"Weak** **strong**",
"**Strong** and **strong**",
"**Strong **not nested** endstrong**",
"**strong** *italic* __strong, meanwhile *italic*__"
];
testStrings.forEach(testStr => {
const parsed = parseMarkdown(testStr);
console.log(parsed);
document.write(parsed + "<br>");
});
Any problems or misunderstood requriements, let me know.
Upvotes: 1