Kassem
Kassem

Reputation: 8276

Substract shortest string containing all search criteria

I have a problem to solve where given a string source and a collection of search criteria criteria, the algorithm has to return the shortest possible substring of source that contains all items of criteria.

=================================

UPDATE

==================================

String source = "aaa wwwww fgffsd ththththt sss sgsgsgsghs bfbfb hhh sdfg kkk dhdhtrherhrhrthrthrt ddfhdetehehe kkk wdwd aaa vcvc hhh zxzx sss nbnbn";
List<String> criteria = new List<string> { "kkk", "aaa", "sss", "hhh" };

The input above should return the following substring: kkk wdwd aaa vcvc hhh zxzx sss

Unfortunately, I spent a lot of time trying to write such an algorithm but I couldn't get it just right. Below is the code I have got so far:

public struct Extraction
{
    public int Start { get; set; }
    public int End { get; set; }
    public int Length
    {
        get
        {
            var length = this.End - this.Start;
            return length;
        }
    }

    public Extraction(int start, int end)
    {
        this.Start = start;
        this.End = end;
    }
}

public class TextExtractor
{
    private String _source;
    private Dictionary<String, List<Int32>> _criteriaIndexes;
    private Dictionary<String, int> _entryIndex;

    public TextExtractor(String source, List<String> searchCriteria)
    {
        this._source = source;
        this._criteriaIndexes = this.ExtractIndexes(source, searchCriteria);
        this._entryIndex = _criteriaIndexes.ToDictionary(x => x.Key, v => 0);
    }

    public String Extract()
    {
        List<Extraction> possibleExtractions = new List<Extraction>();

        int index = 0;
        int min = int.MaxValue;
        int max = 0;
        bool shouldStop = false;
        while (index < _criteriaIndexes.Count && !shouldStop)
        {
            Boolean compareWithAll = index == _criteriaIndexes.Count - 1;
            if (!compareWithAll)
            {
                var current = _criteriaIndexes.ElementAt(index);
                this.CalculateMinMax(current, ref min, ref max);
                index++;
            }
            else
            {
                var entry = _criteriaIndexes.Last();
                while (_entryIndex[entry.Key] < entry.Value.Count)
                {
                    int a = min;
                    int b = max;
                    this.CalculateMinMax(entry, ref a, ref b);

                    _entryIndex[entry.Key]++;
                    Extraction ext = new Extraction(a, b);
                    possibleExtractions.Add(ext);
                }
                int k = index - 1;

                while (k >= 0)
                {
                    var prev = _criteriaIndexes.ElementAt(k);
                    if (prev.Value.Count - 1 > _entryIndex[prev.Key])
                    {
                        _entryIndex[prev.Key]++;
                        break;
                    }
                    else
                    {
                        k--;
                    }
                }
                shouldStop = _criteriaIndexes.All(x => x.Value.Count - 1 <= _entryIndex[x.Key]);
                _entryIndex[entry.Key] = 0;
                index = 0;
                min = int.MaxValue;
                max = 0;
            }
        }

        Extraction shortest = possibleExtractions.First(x => x.Length.Equals(possibleExtractions.Min(p => p.Length)));
        String result = _source.Substring(shortest.Start, shortest.Length);
        return result;
    }

    private Dictionary<String, List<Int32>> ExtractIndexes(String source, List<String> searchCriteria)
    {
        Dictionary<String, List<Int32>> result = new Dictionary<string, List<int>>();
        foreach (var criteria in searchCriteria)
        {
            Int32 i = 0;
            Int32 startingIndex = 0;
            var indexes = new List<int>();
            while (i > -1)
            {
                i = source.IndexOf(criteria, startingIndex);
                if (i > -1)
                {
                    startingIndex = i + 1;
                    indexes.Add(i);
                }
            }
            if (indexes.Any())
            {
                result.Add(criteria, indexes);
            }

        }
        return result;
    }

    private void CalculateMinMax(KeyValuePair<String, List<int>> current, ref int min, ref int max)
    {
        int j = current.Value[_entryIndex[current.Key]];
        if (j < min)
        {
            min = j;
        }
        int indexPlusWordLength = j + current.Key.Length;
        if (indexPlusWordLength > max)
        {
            max = indexPlusWordLength;
        }
    }
}

I would appreciate it if someone could point out where did I go wrong in my algorithm. Moreover, I kinda feel this is a very naive implementation. Maybe there is a better approach to solve this problem than trying to try out combinations of indexes?

Thanks!

Upvotes: 0

Views: 274

Answers (2)

Kinetic
Kinetic

Reputation: 2650

This is a much simpler algorithm that will give you the shortest substring.

void Main()
{
    String source = "aaa wwwww fgffsd ththththt sss ww sgsgsgsghs bfbfb hhh sdfg kkk " +
        "dhdhtrherhrhrthrthrt ddfhdetehehe kkk wdwd aaa vcvc hhh zxzx sss ww nbnbn";
    List<String> criteria = new List<string> { "kkk", "aaa", "sss ww", "hhh" };
    var result = GetAllSubstringContainingCriteria(source, criteria)
        .OrderBy(sub => sub.Length).FirstOrDefault();
    // result is "kkk wdwd aaa vcvc hhh zxzx sss ww"
}

private IEnumerable<string> GetAllSubstringContainingCriteria(
    string source, List<string> criteria)
{
    for (int i = 0; i < source.Length; i++)
    {
        var subString = source.Substring(i);
        if (criteria.Any(crit => subString.StartsWith(crit)))
        {
            var lastWordIndex = 
                GetLastCharacterIndexFromLastCriteriaInSubstring(subString, criteria);
            if (lastWordIndex >= 0)
                yield return string.Join(" ", subString.Substring(0, lastWordIndex));
        }
        else
            continue;
    }
}

private int GetLastCharacterIndexFromLastCriteriaInSubstring(
    string subString, List<string> criteria)
{
    var results = criteria.Select(crit => new { 
            index = subString.IndexOf(crit),
            criteria = crit});

    return results.All(result => result.index >= 0)
        ? results.Select(result => result.index + result.criteria.Length).Max()
        : -1;
}

Upvotes: 2

user6515422
user6515422

Reputation:

Let the Java built-in classes do the work. How about converting your criteria to a regular expression Pattern. If the criteria are X or Y or Z . . ., convert this into a regular expression of the form "(X)|(Y)|(Z)|...", compile it, and execute it against the source string.

This, of course, returns the leftmost match. You could code a very straightforward loop that iterates across all occurrences, caches them, and chooses the shortest--or the leftmost shortest--or, if two or more are equally short, then all of those.

Upvotes: 0

Related Questions