jenna_3108
jenna_3108

Reputation: 435

JavaScript Regex to find UOM in a string

I have a list of products whose titles contain a UOM (unit of measure). I need to detect the UOM in each title automatically using a regex.

Expectations

I have this function below

/**
 * Tries to work out the metric unit mentioned in a product title.
 *
 * Takes the first single digit found in the title, splits the title on that
 * digit, and looks for a known unit name in the piece that follows the
 * digit's first occurrence.
 *
 * @param {string} title - Product title to inspect.
 * @returns {string|undefined} 'ml', 'l', 'kg' or 'g'; undefined when the
 *     title has no digit or no unit name is found in the inspected piece.
 */
const detectMetricUnit = (title) => {
    // With the `g` flag, match() returns every single-digit hit, or null.
    const digits = title.match(/(?:\d)/mg);
    const measurement = digits && digits[0];
    // Text after the first occurrence of that digit, up to its next
    // occurrence (or the end of the title).
    const matches = measurement && title.split(measurement)[1];
    let unit;

    if (matches) {
        const text = matches.toLowerCase();

        // Patterns are tried top to bottom; the first one that matches wins.
        if (/millilitre|milliliter|ml/.test(text)) {
            unit = 'ml';
        } else if (/litre|liter|l/.test(text)) {
            unit = 'l';
        } else if (/kilogram|kg/.test(text)) {
            unit = 'kg';
        } else if (/gram|g/.test(text)) {
            unit = 'g';
        }
    }

    return unit;
};

However I have some problematic strings such as

I would appreciate it if someone could point out the mistake in my regex. How do I capture the whole number and find the UOM that follows it, even when the two are separated by a space?

Upvotes: 1

Views: 330

Answers (1)

Wiktor Stribiżew
Wiktor Stribiżew

Reputation: 627219

You may define a dictionary of possible UOMs you want to detect and then build a regex similar to

/(\d+(?:\.\d+)?)\s?(millilitre|milliliter|ml|litre|liter|l|kilogram|kg|gram|g)\b/i

See the regex demo. The (\d+(?:\.\d+)?) part captures an integer or float value into Group 1, then \s? matches an optional whitespace character (change it to \s* to match zero or more whitespace characters), and then (millilitre|milliliter|ml|litre|liter|l|kilogram|kg|gram|g)\b captures the UOM into Group 2; the trailing \b word boundary ensures the unit is not immediately followed by another word character, so e.g. the "l" in "2 large" is not taken for litres.

Here is a JS implementation that gets the first UOM from a string:

const strs = ['Banana Yogurt 70ml', 'Fish Nuggets 200G', 'Potato Wedges 200 G', 'Chocolate Drink 330ML X 24'];
// Maps every accepted (lower-case) spelling to its canonical unit symbol.
const dct = {millilitre: 'ml', milliliter: 'ml', ml: 'ml', litre: 'l', liter: 'l', l: 'l', kilogram: 'kg', kg: 'kg', gram: 'g', g: 'g'};

/**
 * Finds the first "<number><optional whitespace><unit>" occurrence in a title.
 *
 * @param {string} title - Product title to inspect.
 * @returns {Array} A two-element array [value, unit]: the number as it appears
 *     in the title (string) and the canonical unit symbol from `dct`. Both
 *     elements are undefined when nothing matches.
 */
const detectMetricUnit = (title) => {
    // Group 1: integer or decimal number. Group 2: one of the dictionary keys,
    // which must end at a word boundary. Matching is case-insensitive.
    const regex = new RegExp(`(\\d+(?:\\.\\d+)?)\\s?(${Object.keys(dct).join('|')})\\b`, 'i');
    const match = title.match(regex);

    if (!match) {
        return [undefined, undefined];
    }

    // Keys in `dct` are lower-case, so normalise the captured unit before lookup.
    return [match[1], dct[match[2].toLowerCase()]];
};

strs.forEach((x) => console.log(detectMetricUnit(x)));

To get all occurrences in a string:

const strs = ['Banana Yogurt 70ml and Fish Nuggets 200G', 'Potato Wedges 200 G and Chocolate Drink 330ML X 24'];
// Maps every accepted (lower-case) spelling to its canonical unit symbol.
const dct = {millilitre: 'ml', milliliter: 'ml', ml: 'ml', litre: 'l', liter: 'l', l: 'l', kilogram: 'kg', kg: 'kg', gram: 'g', g: 'g'};

/**
 * Finds every "<number><optional whitespace><unit>" occurrence in a title.
 *
 * @param {string} title - Product title to inspect.
 * @returns {Array<Array<string>>} One [value, unit] pair per occurrence, in
 *     order of appearance: the number as it appears in the title and the
 *     canonical unit symbol from `dct`. Empty when nothing matches.
 */
const detectMetricUnit = (title) => {
    const results = [];
    // Created on every call: a regex with the `g` flag keeps its scan position
    // in `lastIndex`, so a fresh instance always starts at the beginning.
    const regex = new RegExp(`(\\d+(?:\\.\\d+)?)\\s?(${Object.keys(dct).join('|')})\\b`, 'ig');
    let match;

    // exec() returns the next match on each call and null once the title is exhausted.
    while ((match = regex.exec(title)) !== null) {
        // Keys in `dct` are lower-case, so normalise the captured unit before lookup.
        results.push([match[1], dct[match[2].toLowerCase()]]);
    }
    return results;
};

strs.forEach((x) => console.log(x, detectMetricUnit(x)));

Upvotes: 1

Related Questions