Reputation: 3900
I'm using Javascript to extract parts of a string using a regex.
I have a string like:
lorem ipsum !bang #hash #hash2 ^caret word @at sym
I am trying to pull out the words beginning with the various characters - sometimes they can have a space in them, and there can be multiple of each type. So I want to convert this string to a set of values such as:
text: "lorem ipsum"
!: "bang"
#: ["hash", "hash2"]
^: "caret word"
@: "at sym"
My current regex is `/ ([!#^@>\/*-]\w+)/gm`. This sort of works, but it does not match spaces - so from the sample above it only produces `caret` for `^`, and not the whole `caret word`.
My code to do this is:
// Collects the tagged values of inputString (expected to be defined before this
// snippet runs) into `result`: the leading free text goes under "text", and each
// marker character maps to an array of the values that followed it.
var result = {};
// Matches a space, then captures one marker character plus the single run of
// word characters after it. \w+ stops at the first space, so only the first
// word of a multi-word value is captured.
var re = / ([!#^@>\/*-]\w+)/gm;
var m;
var firstSpecialCharIndex = inputString.search(/ [!#^@>\/*-]/);
// search() returns -1 when no marker is present; substring(0, -1) would then
// yield "" and drop the text, so keep the whole string in that case.
result["text"] = firstSpecialCharIndex === -1
  ? inputString
  : inputString.substring(0, firstSpecialCharIndex);
while ((m = re.exec(inputString)) !== null) {
  // m[1] is the marker immediately followed by its value, e.g. "#hash".
  var index = m[1].substring(0, 1);
  if (result[index] == null) {
    result[index] = [];
  }
  result[index].push(m[1].substring(1));
}
Does anyone know how I can match up to the next special character, including the space between multiple words (but not the space after it to the next special char)? Many thanks
Upvotes: 2
Views: 81
Reputation: 12544
Just a suggestion for an alternative: if a regex is used to split on the special chars (each preceded by a whitespace character in a non-capturing group and followed, via lookahead, by a word character) while keeping the captured special char, the logic could be rewritten to:
// Split-based parsing: because the delimiter regex has a capturing group,
// split() keeps each captured marker in the output, giving
// [text, marker, value, marker, value, ...].
var inputString = 'lorem ipsum !bang #hash #hash2 ^caret word @at sym';
// Delimiter: one whitespace character (not captured) followed by a marker
// character that is itself directly followed by a word character.
var rx = /(?:\s)([!#^@>\/*-](?=\w))/;
var arr = inputString.split(rx);
// The first element is whatever precedes the first marker.
var result = { text: arr[0] };
// Walk the remaining elements as (marker, value) pairs.
for (var pos = 1; pos < arr.length; pos += 2) {
  var marker = arr[pos];
  var value = arr[pos + 1];
  if (!result[marker]) {
    result[marker] = [];
  }
  result[marker].push(value);
}
console.log(JSON.stringify(result));
The main advantage is that the special chars are not repeated in the expression. A small secondary one is that the search is executed only once (the 'text' part is simply the first element in the results).
It will also work with multiple words and/or special chars in the middle of a word such as 'lorem ipsum !bang #ha/sh adfa #ha3sh2 ^caret word asdf @at sym'
Upvotes: 0
Reputation: 11032
I am leaving out the `text` part for simplicity. You can use a lookahead:
([!#^@>\/*-])(.*?)(?=\s[!#^@>\/*-]|$)
Group 1 contains symbol and group 2 contains text and you can trim the result if you want.
JS Demo
// Demo: parses the sample string into `result` (leading free text under "text",
// and an array of values per marker character) and writes it to the page as JSON.
var inputString = "lorem ipsum !bang #hash #hash2 ^caret word @at sym";
var result = {};
// Group 1 is the marker character. Group 2 lazily takes what follows, stopping
// just before a whitespace character plus another marker, or at the end of a
// line ($ with the m flag).
var re = /([!#^@>\/*-])(.*?)(?=\s[!#^@>\/*-]|$)/gm;
var m;
var firstSpecialCharIndex = inputString.search(/ [!#^@>\/*-]/);
// search() returns -1 when no marker is present; substring(0, -1) would then
// yield "" and drop the text, so keep the whole string in that case.
result["text"] = firstSpecialCharIndex === -1
  ? inputString
  : inputString.substring(0, firstSpecialCharIndex);
while ((m = re.exec(inputString)) !== null) {
  var index = m[1];
  if (result[index] == null) {
    result[index] = [];
  }
  // Trim the value, since the lazy group can include surrounding whitespace.
  result[index].push(m[2].trim());
}
document.writeln("<pre>" + JSON.stringify(result) + "</pre>");
Upvotes: 1
Reputation: 3415
Try this:
/ ((?:[!#^@>\/*-]\w+)(?: [^!#^@>\/*-]\w+)*)/gm
The `?:` inside the parentheses makes the subpattern non-capturing. Remove it and see what changes.
Tested this code in Chrome:
/**
 * Splits a tagged string into its leading free text and the values that follow
 * each marker character (one of ! # ^ @ > / * -).
 *
 * A value is the word directly after the marker plus any following
 * space-separated words that do not start with a marker character.
 * Limitation of the pattern: each continuation word must be at least two
 * characters long (`[^...]\w+`), so a one-character word ends the value.
 *
 * @param {string} input - The string to parse.
 * @returns {Object} An object whose "text" property holds everything before the
 *   first " <marker>" (or the whole input when there is none), plus one array
 *   of values per marker character that occurred.
 */
function parseTaggedString(input) {
  // A fresh /g literal per call, so lastIndex state never carries over between calls.
  var re = / ((?:[!#^@>\/*-]\w+)(?: [^!#^@>\/*-]\w+)*)/gm;
  var parsed = {};
  var firstSpecialCharIndex = input.search(/ [!#^@>\/*-]/);
  // search() returns -1 when no marker is present; substring(0, -1) would then
  // yield "" and drop the text, so keep the whole string in that case.
  parsed["text"] = firstSpecialCharIndex === -1
    ? input
    : input.substring(0, firstSpecialCharIndex);
  var m;
  while ((m = re.exec(input)) !== null) {
    // m[1] is the marker immediately followed by its value, e.g. "^long caret word".
    var index = m[1].substring(0, 1);
    if (parsed[index] == null) {
      parsed[index] = [];
    }
    parsed[index].push(m[1].substring(1));
  }
  return parsed;
}

var inputString = "lorem ipsum !bang #hash #hash2 ^long caret word @at sym";
var result = parseTaggedString(inputString);
console.log(result);
it works well.
Upvotes: 0