Siddhanjay Godre
Siddhanjay Godre

Reputation: 151

Simple string compression: removing consecutive repeating substrings

I was asked this question recently in an Amazon interview .

Given a string, remove consecutive repeating substrings from it. If there are multiple consecutive intersecting substrings, remove the largest of those.

To make it clear, here are some examples:

(You can compress 'ab' or 'abcd', but as length of 'abcd' is larger you prefer compressing the larger one.)

I could not come up with an efficient implementation, anyone knows a good approach to this?

As this was an interview question, please refrain from using complicated library functions.

Upvotes: 7

Views: 6283

Answers (4)

Amit
Amit

Reputation: 46361

Without regex... This recursive method works:

var cases = ['aabcccddeaaa', 'abababcdeee', 'ababcdabcd'];

function compress(str) {
  var len, sub, i, n;

  // if str is shorter than 2, can't be any repeating substrings
  if(str.length < 2)
    return str;

  // max meaningful length is str.length/2 (truncated to integer)
  for(len = (str.length / 2) | 0; len > 0; len--) {
    // search for a repeating substring of "len" length starting at index i
    for(i = 0; i + (len * 2) <= str.length; i++) {
      sub = str.substr(i, len);
      // if such a substring exists...
      if(str.indexOf(sub, i + len) == i + len) {
        // "count" how many occurences (advance index till beyond repeats)
        for(n = i + len * 2; str.indexOf(sub, n) == n; n += len);
        // return a string composed of the compressed part before the match +
        // the substring + the compressed part beyond the match
        return compress(str.substr(0, i)) + sub + compress(str.substr(n));
      }
    }
  }

  // if nothing found, return original string
  return str;
}

alert(JSON.stringify(cases.map(compress)));

After the long debate in the comments regarding the complexity of the algorithm, I decided to refactor a bit and use a self implemented startsWith function, and utilize that to count inner operations (complexity...).

I took the chance to further optimize by minimizing string allocations to a minimum so now the recursion works with the entire string + start/end indexes.

The code below generates an output that includes the input string, result, n^2 (for O(n^2) comparison) and actual inner operation count. I added a few edge cases to show how it performs. I couldn't find an input that results in n^2 count, they were all below.

var cases = ['aabcccddeaaa', 'abababcdeee', 'ababcdabcd',
             'aabaaabaab', '1', '111222', '123456789', '1234567899'];

var innerCount;

function startsWith(str, start, subStart, subLen) {
  var subEnd = subStart + subLen - 1;
  while(subStart <= subEnd) {
    innerCount++;
    if(str[start++] != str[subStart++])
      return false;
  }
  return true;
}

function doCompress(str, maxLen, minIndex, maxIndex) {
  var len, sub, i, n;

  // if str is shorter than 2, can't be any repeating substrings
  if(maxIndex - minIndex + 1 < 2)
    return str.substring(minIndex, maxIndex + 1);

  for(len = maxLen; len > 0; len--) {
    // search for a repeating substring of "len" length starting at index i
    for(i = minIndex; i + (len * 2) <= maxIndex + 1; i++) {
      // if such a substring exists...
      if(startsWith(str, i + len, i, len)) {
        // "count" how many occurences (advance index till beyond repeats)
        for(n = i + len * 2; (n + len <= maxIndex + 1) && startsWith(str, n, i, len); n += len);
        // return a string composed of the compressed part before the match +
        // the substring + the compressed part beyond the match
        return (i > minIndex ? doCompress(str, len - 1, minIndex, i - 1) : '') +
          str.substr(i, len) +
          (n < maxIndex ? doCompress(str, len, n, maxIndex) : '');
      }
    }
  }

  // if nothing found, return original string
  return str.substring(minIndex, maxIndex + 1);
}

function compress(str) {
  innerCount = 0;
  // max meaningful length is str.length/2 (truncated to integer)
  return {
    source: str,
    result: doCompress(str, (str.length / 2) | 0, 0, str.length - 1),
    'n^2': str.length*str.length,
    innerCount: innerCount};
}

alert(JSON.stringify(cases.map(compress), null, '\t'));

This solution has a time complexity of O(n^2).

Upvotes: 1

Pham Trung
Pham Trung

Reputation: 11294

For a string X, we can find the largest consecutive repeating substring in O(n^2) using Z-algorithm which Given a string S of length n, the Z Algorithm produces an array Z where Z[i] is the length of the longest substring starting from pat[i] which is also a prefix of pat (Source)

For each suffix of X start at i, applying Z-algorithm for this substring:

int result = 0;
for(int i = 0; i < X.length(); i++)
   int[]z = Z-algo(X.substring(i)); //this take O(n)
   for(int j = i + result + 1; j < X.length(); j++)
       if(z[j] >= j - i + 1)
          result = (j - i + 1);

Repeating the above procedure until we cannot find any repeating substring, we can obtain a O(n^3) algorithm.

Note: after reread the question, especially the last example, I figure out that valid repeating substrings are only limited from the original's substrings. So, the time complexity can be reduced to O(n^2 log n) by using a max heap.

Upvotes: 2

vks
vks

Reputation: 67988

pos=[]
dstr={}
final=[]
x="ababcdabcdcde"

for k in re.finditer(r"(?=(.+?)\1+)",x):        #Find start of all overlapping strings
    pos.append(k.start())
i=0
for k in pos: #Find end of overlapping strings
    s=re.findall(r"^((.*)\2+)",x[k:])
    dstr[i]=(k,len(s[0][0]))
    i=i+1
#print dstr.values()
k=0
while k< len(dstr.values())-1:           #remove smaller length overlapping result
    if dstr.values()[k+1][0]<dstr.values()[k][1]<dstr.values()[k+1][1]:
        pass
    else:
        final.append(dstr.values()[k][0])
    k=k+1
if dstr.values()[k-1][0] in final:
    pass
else:
    final.append(dstr.values()[k][0])
#print final
for k in final:             #remove strings
    s=re.sub(r"(.*)\1+",r"\1",x[k:])
    x=x[:k]+s
print x

This is in python.Works fine with given input.

Upvotes: 1

user1952500
user1952500

Reputation: 6781

There is a straightforward non-recursive O(n^3). The key observation is this: suppose there is a string 'aabcbcabbc', if we are only to remove consecutive repetitions, as long as we reduce strings of length=1 first, length=2 second, and so on, we can reduce it and the reduction will be optimal. Hence

'aabcabbc' => 'abcbcabc' => 'abcabc' => 'abc'

Python code:

def strcompress(str):
   strlen = len(str)
   for size in range (1, strlen // 2):
      for i in range (0, strlen - 2 * size + 1):
            str1 = str[i:i+size]
            str2 = str[i+size:i+2*size]
            while str1 == str2:
               str = str[:i+size] + str[i+2*size:]
               strlen = len(str)
               if i + 2*size > strlen:
                  break
               str2 = str[i+size:i+2*size]
   print("The compressed string is:" + str)   
   return

Example:

>>> strcompress("ababcdabcd")
The compressed string is:abcd

EDIT: Fixed a few bugs in code. This should work for existing samples and the example I provided.

Upvotes: 0

Related Questions