Reputation: 901
I am creating a simple OCR app that reads information from a card, and I have succeeded in converting the image to text. Here is a sample of the converted text:
Name :Hulk Color + Green Type Hero
Name :Venom Color :Black Type Anti Hero
We can see that the colon is sometimes converted to + or another symbol. The question is how to convert such a string into an object with predefined keys (in our case name, color, and type). What is the best approach to do that?
Upvotes: 0
Views: 89
Reputation: 690
Something like this. I assumed your output has multiple lines and that the names or colors don't include non-alphanumeric characters; if they do, you will have to exclude those characters from the regex.
// Parsed cards, one object per non-blank OCR line; filled by transform().
const myArray = [];
const string = `Name :Hulk Color + Green Type Hero
Name :Venom Color :Black Type Anti Hero`;
// split by newline so we get the lines
const lines = string.split("\n");

/**
 * Normalises one raw piece of an OCR line.
 *
 * Drops every character that is not a letter, digit or space (this removes
 * the misread separators such as ':' or '+'), removes the first occurrence
 * of `label` if one is given, collapses runs of whitespace to a single
 * space and trims the result.
 *
 * @param {string} raw - Text between two labels.
 * @param {string} [label=''] - Label text to remove from the piece.
 * @returns {string} The cleaned value.
 */
function cleanField(raw, label = '') {
  return raw
    .replace(/[^0-9a-z ]/gi, '')
    .replace(label, '')
    .replace(/\s\s+/g, ' ')
    .trim();
}

/**
 * Parses one OCR line of the form "Name <x> Color <y> Type <z>" and appends
 * the resulting { name, color, type } object to `myArray`.
 *
 * Blank lines are ignored. A field whose label is missing from the line is
 * stored as an empty string instead of raising a TypeError.
 *
 * @param {string} line - A single line of OCR output.
 * @returns {void}
 */
function transform(line) {
  // A blank line (for example after a trailing newline) carries no card data.
  if (line.trim() === '') {
    return;
  }
  // Splitting on the labels yields [name piece, color piece, type piece];
  // pieces after a missing label are undefined, so default them to ''.
  const [namePart = '', colorPart = '', typePart = ''] = line.split(/Color|Type/);
  myArray.push({
    // The first piece still contains the "Name" label itself.
    name: cleanField(namePart, 'Name'),
    color: cleanField(colorPart),
    type: cleanField(typePart),
  });
}

lines.forEach(transform);
// =============================== use array to split
let splitBy = ['name', 'color', 'type'];
let hero = 'Name :Hulk Color + Green Type Hero';

/**
 * Converts one OCR string into an object keyed by the given labels.
 *
 * Each entry of `splitBy` is searched for case-insensitively as a literal
 * label. The text following a label, up to the next label or the end of the
 * string, becomes that key's value after removing every character that is
 * not a letter, digit or space, collapsing repeated whitespace and trimming.
 *
 * Values are assigned to the label that actually precedes them, so labels
 * may appear in any order. A label that is absent from the string yields an
 * empty string, and if a label occurs more than once the first occurrence
 * wins.
 *
 * @param {string} hero - OCR text such as 'Name :Hulk Color + Green Type Hero'.
 * @param {string[]} splitBy - Labels to look for; also used as result keys.
 * @returns {Object<string, string>} One property per entry of `splitBy`.
 */
function heroToObj(hero, splitBy) {
  // Labels are literal text, so neutralise regex metacharacters in them.
  const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  const obj = {};
  for (const key of splitBy) {
    obj[key] = '';
  }
  // An empty alternation would match between every character.
  if (splitBy.length === 0) {
    return obj;
  }

  // The capturing group makes split() keep the matched labels, producing
  // [text, label, text, label, text, ...]: labels sit at the odd indexes.
  const labelPattern = new RegExp(`(${splitBy.map(escapeRegExp).join('|')})`, 'i');
  const parts = hero.split(labelPattern);
  const seen = new Set();

  for (let i = 1; i < parts.length; i += 2) {
    const label = parts[i].toLowerCase();
    const key = splitBy.find((candidate) => candidate.toLowerCase() === label);
    if (key === undefined || seen.has(key)) {
      continue;
    }
    seen.add(key);
    obj[key] = parts[i + 1]
      .replace(/[^0-9a-z ]/gi, '')
      .replace(/\s\s+/g, ' ')
      .trim();
  }
  return obj;
}

const heroObject = heroToObj(hero, splitBy);
console.log(heroObject);
=========================== another example ==================
const hero = 'Name :Hulk\nColor + Green\nType Hero\nAnother text \nanother some texts';
// Not used by this example; kept so the snippet's top-level names are unchanged.
const myArray = [];
// split by newline so we get the lines
const lines = hero.split("\n");
// Result object, filled in by heroToObj() as matching lines are seen.
const obj = {};
const splitBy = ['name', 'color', 'type'];

/**
 * Inspects one OCR line and, if it starts with one of the `splitBy` labels
 * (compared case-insensitively), stores the rest of the line under that key
 * in `obj`. Lines that start with none of the labels are ignored.
 *
 * Only the leading label is stripped, so a value that happens to contain
 * the label text (for example "Type Prototype") is preserved intact.
 *
 * @param {string} line - A single line of OCR output.
 * @returns {void}
 */
function heroToObj(line) {
  // Remove OCR noise first so that inputs such as '+Name :Hulk' or
  // '-Type Hero' still start with their label.
  const cleaned = line.replace(/[^0-9a-z ]/gi, '').trim();
  const lowered = cleaned.toLowerCase();

  for (const prop of splitBy) {
    if (lowered.startsWith(prop.toLowerCase())) {
      obj[prop] = cleaned
        .slice(prop.length)
        .replace(/\s\s+/g, ' ')
        .trim();
    }
  }
}

lines.forEach((line) => heroToObj(line));
console.log(obj);
Upvotes: 1
Reputation: 12919
Here's a quick example using named capturing groups. It uses the category/property names ['Name', 'Color', 'Type'] as delimiters within the string, captures all characters between those words into appropriately named groups [(?<name>.*), (?<color>.*), (?<type>.*)], and sets the i flag for case-insensitive matching.
// Captures the text after "Name", "Color" and "Type" (matched
// case-insensitively, in that order, on a single line) into the named
// groups `name`, `color` and `type`.
const regex = /Name(?<name>.*)Color(?<color>.*)Type(?<type>.*)/i;
When calling exec()
on this regular expression the named matches will be available within the groups
property of the returned matches array.
From there you simply need to access each named group and clean its associated string, here using a regex to replace non-word (\W
which matches [^A-Za-z0-9_]) characters at either the beginning or end of the string.
// Strips non-word characters (\W) from both ends of `s`. The `g` flag is
// required: without it only the first matching run (leading OR trailing)
// would be removed.
const clean = (s) => s.replace(/^\W+|\W+$/g, '');
Finally, combine them into a single object, here using a for...of
loop to iterate the Object.keys()
of the groups
object, clean each matched string and assign it to the result object.
/**
 * Parses an OCR string of the form "Name <x> Color <y> Type <z>" into an
 * object with `name`, `color` and `type` properties.
 *
 * The labels are matched case-insensitively and must appear in that order
 * on a single line. Non-word characters at either end of each captured
 * value (stray ':' or '+' separators, spaces) are stripped.
 *
 * @param {string} str - The OCR text to parse.
 * @returns {{name: string, color: string, type: string} | null} The parsed
 *   fields, or null when `str` does not contain the three labels.
 */
const parse_ocr_string = (str) => {
  const clean = (s) => s.replace(/^\W+|\W+$/g, '');
  const regex = /Name(?<name>.*)Color(?<color>.*)Type(?<type>.*)/i;

  // exec() returns null on no match; report that explicitly rather than
  // failing while destructuring `groups` from null.
  const match = regex.exec(str);
  if (match === null) {
    return null;
  }

  const result = {};
  for (const [key, value] of Object.entries(match.groups)) {
    result[key] = clean(value);
  }
  return result;
};

const s1 = 'Name :Hulk Color + Green Type Hero';
const s2 = 'Name :Venom Color :Black Type Anti Hero';
const obj1 = parse_ocr_string(s1);
const obj2 = parse_ocr_string(s2);
console.log(obj1);
console.log(obj2);
But you could also generalize this by accepting a delimiters
parameter and passing ['String', 'key']
pairs to use in declaring a regex via the new RegExp()
constructor. You'll still need to adjust for other artifacts/case-sensitivity problems, but perhaps this gives you an idea of how you might proceed.
/**
 * Parses an OCR string into an object using caller-supplied delimiters.
 *
 * A regular expression is built by concatenating, for every
 * [sequence, key] pair, the sequence followed by a named capturing group
 * `(?<key>.*)`. It is matched case-insensitively, so the sequences must
 * occur in `str` in the given order on a single line. Non-word characters
 * at either end of each captured value are stripped.
 *
 * Note that each sequence is inserted into the pattern verbatim, i.e. it is
 * interpreted as a regular-expression fragment; escape any metacharacters
 * that should match literally. Each key must be a valid capture-group name.
 *
 * @param {string} str - The OCR text to parse.
 * @param {Array<[string, string]>} delimiters - Ordered [sequence, key] pairs.
 * @returns {Object<string, string> | null} One property per key, or null
 *   when `str` does not match the delimiters.
 */
const parse_ocr_string = (str, delimiters) => {
  const clean = (s) => s.replace(/^\W+|\W+$/g, '');
  const match_string = delimiters
    .map(([seq, key]) => `${seq}(?<${key}>.*)`)
    .join('');
  const regex = new RegExp(match_string, 'i');

  // exec() returns null on no match; report that explicitly rather than
  // failing while destructuring `groups` from null.
  const match = regex.exec(str);
  if (match === null) {
    return null;
  }

  // `groups` is undefined when the pattern has no named groups, which
  // happens for an empty `delimiters` list.
  const groups = match.groups || {};
  const result = {};
  for (const [key, value] of Object.entries(groups)) {
    result[key] = clean(value);
  }
  return result;
};

const s1 = 'Name :Hulk Color + Green Type =Hero';
// specify delimiters in some form, here tuples of ['Word', 'property']
const delimiters1 = [['Name', 'name'], ['Color', 'color'], ['Type', 'type']];
const obj1 = parse_ocr_string(s1, delimiters1);
console.log(obj1);
const s2 = 'AttaCk power: 24 Defense+ 90';
const delimiters2 = [['Attack Power', 'attack'], ['Defense', 'defense']];
const obj2 = parse_ocr_string(s2, delimiters2);
console.log(obj2);
Upvotes: 0