Matius Nugroho Aryanto
Matius Nugroho Aryanto

Reputation: 901

How to convert a given text to an object with given keys in JavaScript

I am creating a simple OCR app that reads information from a card, and I have succeeded in converting the image to text. Here is a sample of the converted text:

Name :Hulk Color + Green Type Hero

Name :Venom Color :Black Type Anti Hero

We can see that the colon is sometimes converted to + or another symbol. The question is how to convert such a string to an object with predefined keys (in our case name, color, and type). What is the best approach to do that?

Upvotes: 0

Views: 89

Answers (2)

Gabriel
Gabriel

Reputation: 690

Something like this. I assumed your output has multiple lines and that the names and colors don't include non-alphanumeric characters; if they do, you will have to exclude those characters from the regex.

// Parsed cards, one object per non-blank OCR line.
const myArray = [];
const string = `Name :Hulk Color + Green Type Hero
Name :Venom Color :Black Type Anti Hero`;
// Each card occupies one line of the OCR output.
const lines = string.split("\n");
lines.forEach(transform);

/**
 * Removes every character that is not an ASCII letter, digit or space.
 * This discards the separator symbols the OCR produces (":", "+", ...).
 *
 * @param {string} text - Raw fragment of an OCR line.
 * @returns {string} The fragment with symbols removed.
 */
function stripSymbols(text) {
  return text.replace(/[^0-9a-z ]/gi, "");
}

/**
 * Collapses runs of whitespace into one space and trims both ends.
 *
 * @param {string} text - Fragment to normalise.
 * @returns {string} The normalised fragment.
 */
function collapseSpaces(text) {
  return text.replace(/\s\s+/g, ' ').trim();
}

/**
 * Parses one OCR line of the form "Name <x> Color <y> Type <z>" and appends
 * the resulting `{ name, color, type }` object to `myArray`.
 *
 * Blank lines are skipped. A field whose label is absent from the line is
 * stored as an empty string instead of causing a TypeError.
 *
 * @param {string} line - A single line of OCR output.
 * @returns {void}
 */
function transform(line) {
  if (line.trim() === '') {
    return;
  }

  const obj = { name: '', color: '', type: '' };

  // The capturing group keeps the matched labels in the result, giving
  // [nameText, label, value, label, value, ...]; every value can then be
  // assigned to the label that was actually found rather than by position.
  const [head, ...labelled] = line.split(/(Color|Type)/);

  obj.name = collapseSpaces(stripSymbols(head).replace("Name", ""));
  for (let i = 0; i < labelled.length; i += 2) {
    obj[labelled[i].toLowerCase()] = collapseSpaces(stripSymbols(labelled[i + 1]));
  }

  myArray.push(obj);
}

// =============================== use array to split
const splitBy = ['name', 'color', 'type'];
const hero = 'Name :Hulk Color + Green Type Hero';

/**
 * Converts one OCR string into an object that has one property per key.
 *
 * Each key is searched for case-insensitively as a label inside `hero`; the
 * text that follows a label (up to the next label) becomes that key's value,
 * with symbols removed and whitespace normalised. Values are attached to the
 * label that was actually found, so a missing label yields an empty string
 * for that key instead of throwing or shifting the other values.
 *
 * @param {string} hero - OCR text, e.g. "Name :Hulk Color + Green Type Hero".
 * @param {string[]} splitBy - Keys to extract; also used as the labels.
 * @returns {Object<string, string>} Object with every key from `splitBy`.
 */
function heroToObj(hero, splitBy) {
  // Keys are treated as literal text, so regex metacharacters must be escaped.
  const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const clean = (text) =>
    text.replace(/[^0-9a-z ]/gi, "").replace(/\s\s+/g, ' ').trim();

  const obj = Object.fromEntries(splitBy.map((key) => [key, '']));

  // The capturing group makes split() keep the labels:
  // [leadingText, label, value, label, value, ...].
  const labelPattern = new RegExp(`(${splitBy.map(escapeRegExp).join('|')})`, 'i');
  const parts = hero.split(labelPattern);

  for (let i = 1; i < parts.length; i += 2) {
    const label = parts[i].toLowerCase();
    const key = splitBy.find((candidate) => candidate.toLowerCase() === label);
    if (key !== undefined) {
      obj[key] = clean(parts[i + 1]);
    }
  }
  return obj;
}
const heroObject = heroToObj(hero, splitBy);
console.log(heroObject);

=========================== another example ==================

const hero = 'Name :Hulk\nColor + Green\nType Hero\nAnother text \nanother some texts';
// Here every field sits on its own line of the OCR output.
const lines = hero.split("\n");
// Collects the recognised fields; filled in by heroToObj.
const obj = {};
const splitBy = ['name', 'color', 'type'];
lines.forEach(heroToObj);

/**
 * Inspects one OCR line and, when it starts with one of the labels listed in
 * `splitBy` (compared case-insensitively), stores the rest of the line in
 * `obj` under that label. Lines that start with no known label are ignored.
 *
 * Only the leading label is removed, so a value that happens to contain the
 * label text (e.g. "Name :Nameless") is kept intact.
 *
 * @param {string} line - A single line of OCR output.
 * @returns {void}
 */
function heroToObj(line) {
  // Drop stray symbols first so input such as "+Name :Hulk" or "-Type Hero"
  // still starts with its label once cleaned.
  const cleaned = line
    .replace(/[^0-9a-z ]/gi, "")
    .replace(/\s\s+/g, ' ')
    .trim();
  const lowered = cleaned.toLowerCase();

  for (const prop of splitBy) {
    if (lowered.startsWith(prop.toLowerCase())) {
      obj[prop] = cleaned.slice(prop.length).trim();
    }
  }
}

console.log(obj);

Upvotes: 1

pilchard
pilchard

Reputation: 12919

Here's a quick example using named capturing groups. It uses the category/property names ['Name', 'Color', 'Type'] as delimiters within the string, captures all characters between those words into appropriately named groups [(?<name>.*), (?<color>.*), (?<type>.*)], and sets the i flag for case-insensitive matching.

// Captures the text after each label into the named groups name/color/type;
// the `i` flag makes the labels match case-insensitively.
const regex = /Name(?<name>.*)Color(?<color>.*)Type(?<type>.*)/i;

When calling exec() on this regular expression the named matches will be available within the groups property of the returned matches array.

From there you simply need to access each named group and clean its associated string, here using a regex to replace non-word (\W which matches [^A-Za-z0-9_]) characters at either the beginning or end of the string.

// Strips non-word characters from both ends of `s`. The `g` flag is required:
// without it only the first match (the leading run) would be removed.
const clean = (s) => s.replace(/^\W+|\W+$/g, '');

Finally, combine them into a single object, here using a for...of loop to iterate the Object.keys() of the groups object, clean each matched string and assign it to the result object.

/**
 * Parses an OCR string of the form "Name <x> Color <y> Type <z>" (labels are
 * matched case-insensitively) into an object with `name`, `color` and `type`
 * properties. Non-word characters around each value are stripped.
 *
 * @param {string} str - Text produced by the OCR step.
 * @returns {{name: string, color: string, type: string} | null} The parsed
 *   fields, or `null` when `str` does not contain the three labels in order.
 */
const parse_ocr_string = (str) => {
  // Trim separator noise (":", "+", spaces, ...) from both ends of a value.
  const clean = (s) => s.replace(/^\W+|\W+$/g, '');

  const regex = /Name(?<name>.*)Color(?<color>.*)Type(?<type>.*)/i;
  const match = regex.exec(str);
  // exec() returns null on a non-match; report that instead of throwing.
  if (match === null) {
    return null;
  }

  return Object.fromEntries(
    Object.entries(match.groups).map(([key, raw]) => [key, clean(raw)]),
  );
};

const s1 = 'Name :Hulk Color + Green Type Hero';
const s2 = 'Name :Venom Color :Black Type Anti Hero';

const obj1 = parse_ocr_string(s1);
const obj2 = parse_ocr_string(s2);

console.log(obj1);
console.log(obj2);

But you could also generalize this by accepting a delimiters parameter and passing ['String', 'key'] pairs to use in declaring a regex via the new RegExp() constructor. You'll still need to adjust for other artifacts/case-sensitivity problems, but perhaps this gives you an idea of how you might proceed.

/**
 * Parses an OCR string using caller-supplied labels.
 *
 * Each delimiter is a `[label, key]` pair: `label` is the literal text to
 * look for (case-insensitively) and `key` is the property name that receives
 * the text following that label, up to the next label. Labels must occur in
 * `str` in the same order as in `delimiters`. Non-word characters around each
 * value are stripped.
 *
 * @param {string} str - Text produced by the OCR step.
 * @param {Array<[string, string]>} delimiters - `[label, key]` pairs; each
 *   `key` must be a valid regular-expression group name.
 * @returns {Object<string, string> | null} One property per key, or `null`
 *   when `str` does not contain all labels in order.
 */
const parse_ocr_string = (str, delimiters) => {
  // Trim separator noise (":", "+", "=", spaces, ...) from both ends.
  const clean = (s) => s.replace(/^\W+|\W+$/g, '');
  // Labels are literal text, so regex metacharacters in them must be escaped.
  const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  const match_string = delimiters
    .map(([seq, key]) => `${escapeRegExp(seq)}(?<${key}>.*)`)
    .join('');

  const regex = new RegExp(match_string, 'i');
  const match = regex.exec(str);
  // exec() returns null on a non-match; report that instead of throwing.
  if (match === null) {
    return null;
  }

  // `groups` is undefined when the pattern has no named groups
  // (i.e. an empty delimiter list).
  return Object.fromEntries(
    Object.entries(match.groups ?? {}).map(([key, raw]) => [key, clean(raw)]),
  );
};

const s1 = 'Name :Hulk Color + Green Type =Hero';
// specify delimiters in some form, here tuples of ['Word', 'property']
const delimiters1 = [['Name', 'name'], ['Color', 'color'], ['Type', 'type']];

const obj1 = parse_ocr_string(s1, delimiters1);
console.log(obj1);

const s2 = 'AttaCk power: 24 Defense+ 90';
const delimiters2 = [['Attack Power', 'attack'], ['Defense', 'defense']];

const obj2 = parse_ocr_string(s2, delimiters2);
console.log(obj2);

Upvotes: 0

Related Questions