Alex
Alex

Reputation: 135

group element of an array by longest common starting substring

How to group common substring by the longest common string. It refers to Group strings by longest common starting substring

Alex Shevchenko provided a very nice piece of code but in some cases it doesn't work properly. Adding to var data the value apple iphone 65 makes the data look like:

var data = ['apple iphone 65', 'apple ipad mini 32gb', 'apple ipad mini 64gb', 'apple ipad air 64gb', 'apple ipad air 32gb', 'panasonic gh4', 'samsung s2 galaxy', 'samsung s2 galaxy red', 'samsung s3 galaxy'];

Below is the expected result.

result = {
 "apple": [
     "apple iphone 65"
 ],
 "apple ipad mini": [
     "apple ipad mini 32gb",
     "apple ipad mini 64gb"
 ], ...;

And below is what I actually got.

result = {
 "apple": [
     "apple iphone 65",
     "apple ipad mini 32gb"
 ],
 "apple ipad mini": [
     "apple ipad mini 64gb"
 ],
 "apple ipad air": [
     "apple ipad air 64gb",
     "apple ipad air 32gb"
 ],
 "panasonic gh4": [
     "panasonic gh4"
 ],
 "samsung s2 galaxy": [
     "samsung s2 galaxy",
     "samsung s2 galaxy red"
 ],
 "samsung s3 galaxy": [
     "samsung s3 galaxy"
 ]
};

I cannot figure out where the mistake is in my code.

Upvotes: 4

Views: 358

Answers (2)

Mr. Polywhirl
Mr. Polywhirl

Reputation: 48610

So for this, you need to know two things, how you determine how similar two or items are and how you create the key.

I updated my response with a function that chunks each string and calculates both a macro (% same chunks) and micro threshold (% of each chunk similarity), but it still has issues with the Samsung S2 ans S3.

On a further note, the strings are non-standard, because aren't they supposed to be "samsung galaxy s2" and "samsung galaxy s3" since your format is:

<MAKE> <SERIES> <MODEL> <CAPACITY>

var data = [
  'apple ipad air 32gb', 'apple ipad air 64gb',
  'apple ipad mini 32gb', 'apple ipad mini 64gb',
  'apple iphone 65',
  'panasonic gh4',
  'samsung s2 galaxy', 'samsung s2 galaxy red',
  'samsung s3 galaxy'
];

console.log(groupData(chunkData(data.sort())));

function chunkData(data) {
  return data.map(d => d.split(' '));
}

function groupData(list) {
  let groups = {}, i = 0;
  while (i < list.length) {
    let curr = list[i];
    //console.log('Curr:', curr);
    let similar = findSimilar(curr, list.slice(i + 1), 0.80, 0.60);
    let joined = similar.map(x => x.join(' ')).concat(curr.join(' '));
    let key = sharedStart(joined).trim();
    groups[key] = joined;
    i += joined.length || 1;
  }
  return groups;
}

function findSimilar(word, words, macroThreshold, microThreshold) {
  return words.filter(w => {
    let chunks = Math.max(word.length, w.length);
    let len = Math.min(word.length, w.length);
    let wordDiff = -1, step = 0;
    while (wordDiff !== 0 && step < len) {
      let x = word[step];
      let y = w[step];
      let z = sharedStart([x, y]);
      let p = z.length / x.length;
      let q = z.length / y.length;
      //console.log('Similar:', x, y, z, p, q);
      wordDiff = (p + q) / 2;
      step++;
    }
    let chunkDiff = step / chunks;
    //console.log('Chunk %:', chunkDiff);
    return chunkDiff >= macroThreshold || wordDiff >= microThreshold;
  });
}

// https://stackoverflow.com/a/1917041/1762224
function sharedStart(array) {
  let A = array.concat().sort(), a1 = A[0], a2 = A[A.length - 1], L = a1.length, i = 0;
  while (i < L && a1.charAt(i) === a2.charAt(i)) i++;
  return a1.substring(0, i);
}
.as-console-wrapper { top: 0; max-height: 100% !important; }

Original response

You can combine Levenshtein Distance. One issue with this is that is evaluates the entire string to return the distance if different characters. If one character if off in the string, but 99% of the other characters are the same, it will be 99% similar instead of as similar up t the point where there is a difference.

The code does not produce the desired input, but it comes very close. I used a 87% (0.87) similarity index when comparing two or more strings.

// https://stackoverflow.com/a/4070350/1762224
if (String.prototype.LevenshteinDistance === undefined) {
  String.prototype.LevenshteinDistance = function(s2) {
    var array = new Array(this.length + 1);
    for (var i = 0; i < this.length + 1; i++) array[i] = new Array(s2.length + 1);
    for (var i = 0; i < this.length + 1; i++) array[i][0] = i;
    for (var j = 0; j < s2.length + 1; j++) array[0][j] = j;
    for (var i = 1; i < this.length + 1; i++) {
      for (var j = 1; j < s2.length + 1; j++) {
        if (this[i - 1] == s2[j - 1]) array[i][j] = array[i - 1][j - 1];
        else {
          array[i][j] = Math.min(array[i][j - 1] + 1, array[i - 1][j] + 1);
          array[i][j] = Math.min(array[i][j], array[i - 1][j - 1] + 1);
        }
      }
    }
    return array[this.length][s2.length];
  };
}

if (Array.prototype.longest === undefined) {
  Array.prototype.longest = function() {
    return this.sort((a, b) => b.length - a.length)[0];
  };
}

var data = [
  'apple ipad air 32gb', 'apple ipad air 64gb',
  'apple ipad mini 32gb', 'apple ipad mini 64gb',
  'apple iphone 65',
  'panasonic gh4',
  'samsung s2 galaxy', 'samsung s2 galaxy red',
  'samsung s3 galaxy'
];

console.log(groupData(data));

function groupData(list) {
  list.sort(); // Ensure they are sorted.
  let groups = {},
    i = 0;
  while (i < list.length) {
    let curr = list[i];
    let similar = findSimilar(curr, list.slice(i + 1), 0.87);
    let joined = similar.concat(curr);
    let key = joined.length > 1 ? sharedStart(joined) : curr.split(/\s+/g)[0];
    groups[key] = joined;
    i += joined.length || 1;
  }
  return groups;
}

function findSimilar(word, words, threshold) {
  return words.filter(w => {
    let longest = [w, word].longest().length;
    let dist = 1 - (threshold * longest) / longest;
    return w.LevenshteinDistance(word) < (dist * longest);
  });
}

// https://stackoverflow.com/a/1917041/1762224
function sharedStart(array) {
  let A = array.concat().sort(), a1 = A[0], a2 = A[A.length - 1], L = a1.length, i = 0;
  while (i < L && a1.charAt(i) === a2.charAt(i)) i++;
  return a1.substring(0, i);
}
.as-console-wrapper { top: 0; max-height: 100% !important; }

Upvotes: 2

Matt Croak
Matt Croak

Reputation: 2888

I'm not sure how efficient my solution is, especially if/when the dataset size increases but I think it's pretty close to what you're looking for.

var data = ['apple iphone 65gb', 'apple ipad mini 32gb', 'apple ipad mini 64gb', 'apple ipad air 64gb', 'apple ipad air 32gb', 'panasonic gh4', 'samsung s2 galaxy', 'samsung s2 galaxy red', 'samsung s3 galaxy']

let i = 0
let obj = {}

function checkArrays(arrA, arrB) {
  let index
  for (let i = 0; i < arrA.length; i++) {
    if (arrA[i] !== arrB[i]) return index = i
  }
  return index;
}

const refineArr = (data) => {
  arr = []
  for (let i = 0; i < data.length; i++) {
    let one = data[i].split("")
    let two = data[i + 1] ? data[i + 1].split("") : data[i + 1]
    if (two) {
      let x = checkArrays(one, two)

      var index1, index2

      one.forEach((y, e) => {
        if (y === " " && e >= x) {
          return index1 = e
        }
      })

      if (!arr.includes(one.slice(0, index1).join("").trim()) &&
        !arr.includes(one.slice(0, index2).join("").trim())) {
        arr.push(one.slice(0, index1).join("").trim())
      }

      two.forEach((y, i) => {
        if (y === " " && i >= x) {
          return index2 = i
        }
      })

      if (!arr.includes(two.slice(0, index2).join("").trim()) &&
        !arr.includes(two.slice(0, index1).join("").trim())) {
        arr.push(two.slice(0, index2).join("").trim())
      }
    }
  }
  var newArr = [...new Set(arr)]
  generateObject(newArr)
}

const generateObject = (arr) => {
  for (let i = 0; i < arr.length; i++) {
    if (!obj[arr[i]]) {
      obj[arr[i]] = data.filter((x) => {
        return x.includes(arr[i])
      })
    }
  }
  console.log(obj)
}

refineArr(data)
.as-console-wrapper { top: 0; max-height: 100% !important; }

First, I tried to refine the original array to just include the keys that would be included. As I looped through data I split the string at each index (and the next index so long as the item at i+1 was defined). Then I passed those two arrays to checkArrays where I compare each character and then return the index at which they stop being the same.

Example: apple ipad mini ... as an array is

["a", "p", "p", "l", "e", " ", "i", "p", "a", "d", " ", "m", "i", "n", "i", ...]

and apple ipad air... as an array is

["a", "p", "p", "l", "e", " ", "i", "p", "a", "d", " ", "a", "i", "r",...]

And the index in which they stop being similar is 11.

Then I need to find the index (for both) at which they are different, plus the next space because I want to ensure I slice the array at a whole word. So I look for the element that is a space AND has an index greater than the difference index.

I do this for both arrays as the indices will be different.

For ["a", "p", "p", "l", "e", " ", "i", "p", "a", "d", " ", "m", "i", "n", "i", ...] it is 15.

For ["a", "p", "p", "l", "e", " ", "i", "p", "a", "d", " ", "a", "i", "r", ...] it is 14.

There was a situation where I'd end up with like apple ipad m being pushed into the arr after apple ipad mini but that was because I need to test both indices for each array (since at the first loop apple ipad mini ... was the second word but in the second loop it was the first word). I compensated for this with these lines:

  if (!arr.includes(one.slice(0, index1).join("").trim()) 
      && !arr.includes(one.slice(0, index2).join("").trim())){
    arr.push(one.slice(0, index1).join("").trim())
  }

and

 if (!arr.includes(two.slice(0, index2).join("").trim())
     && !arr.includes(two.slice(0, index1).join("").trim())){
    arr.push(two.slice(0, index2).join("").trim())
  }

After we're done with that, I returned a new array using var newArr = [... new Set(arr)] to ensure that any repeated values were omitted. At this point you'd end up with an array like ["apple iphone", "apple ipad mini", "apple ipad air", "panasonic", "samsung s2", "samsung s3"]. These will be the keys in our object.

Finally, generateObject loops through the new array and essentially assigns the keys values of a filtered collection of items that include the key. So for the key apple ipad mini you would get a filtered collection of ["apple ipad mini 32gb", "apple ipad mini 64gb"]

Again, I think this solution needs refinement for efficiency's sake but I think it could help get you started at least logic wise.

Upvotes: 4

Related Questions