Reputation: 8276
I have a problem to solve where given a string source
and a collection of search criteria criteria
, the algorithm has to return the shortest possible substring of source
that contains all items of criteria
.
=================================
UPDATE
hello world
==================================
String source = "aaa wwwww fgffsd ththththt sss sgsgsgsghs bfbfb hhh sdfg kkk dhdhtrherhrhrthrthrt ddfhdetehehe kkk wdwd aaa vcvc hhh zxzx sss nbnbn";
List<String> criteria = new List<string> { "kkk", "aaa", "sss", "hhh" };
The input above should return the following substring: kkk wdwd aaa vcvc hhh zxzx sss
Unfortunately, I spent a lot of time trying to write such an algorithm but I couldn't get it just right. Below is the code I have got so far:
public struct Extraction
{
public int Start { get; set; }
public int End { get; set; }
public int Length
{
get
{
var length = this.End - this.Start;
return length;
}
}
public Extraction(int start, int end)
{
this.Start = start;
this.End = end;
}
}
public class TextExtractor
{
private String _source;
private Dictionary<String, List<Int32>> _criteriaIndexes;
private Dictionary<String, int> _entryIndex;
public TextExtractor(String source, List<String> searchCriteria)
{
this._source = source;
this._criteriaIndexes = this.ExtractIndexes(source, searchCriteria);
this._entryIndex = _criteriaIndexes.ToDictionary(x => x.Key, v => 0);
}
public String Extract()
{
List<Extraction> possibleExtractions = new List<Extraction>();
int index = 0;
int min = int.MaxValue;
int max = 0;
bool shouldStop = false;
while (index < _criteriaIndexes.Count && !shouldStop)
{
Boolean compareWithAll = index == _criteriaIndexes.Count - 1;
if (!compareWithAll)
{
var current = _criteriaIndexes.ElementAt(index);
this.CalculateMinMax(current, ref min, ref max);
index++;
}
else
{
var entry = _criteriaIndexes.Last();
while (_entryIndex[entry.Key] < entry.Value.Count)
{
int a = min;
int b = max;
this.CalculateMinMax(entry, ref a, ref b);
_entryIndex[entry.Key]++;
Extraction ext = new Extraction(a, b);
possibleExtractions.Add(ext);
}
int k = index - 1;
while (k >= 0)
{
var prev = _criteriaIndexes.ElementAt(k);
if (prev.Value.Count - 1 > _entryIndex[prev.Key])
{
_entryIndex[prev.Key]++;
break;
}
else
{
k--;
}
}
shouldStop = _criteriaIndexes.All(x => x.Value.Count - 1 <= _entryIndex[x.Key]);
_entryIndex[entry.Key] = 0;
index = 0;
min = int.MaxValue;
max = 0;
}
}
Extraction shortest = possibleExtractions.First(x => x.Length.Equals(possibleExtractions.Min(p => p.Length)));
String result = _source.Substring(shortest.Start, shortest.Length);
return result;
}
private Dictionary<String, List<Int32>> ExtractIndexes(String source, List<String> searchCriteria)
{
Dictionary<String, List<Int32>> result = new Dictionary<string, List<int>>();
foreach (var criteria in searchCriteria)
{
Int32 i = 0;
Int32 startingIndex = 0;
var indexes = new List<int>();
while (i > -1)
{
i = source.IndexOf(criteria, startingIndex);
if (i > -1)
{
startingIndex = i + 1;
indexes.Add(i);
}
}
if (indexes.Any())
{
result.Add(criteria, indexes);
}
}
return result;
}
private void CalculateMinMax(KeyValuePair<String, List<int>> current, ref int min, ref int max)
{
int j = current.Value[_entryIndex[current.Key]];
if (j < min)
{
min = j;
}
int indexPlusWordLength = j + current.Key.Length;
if (indexPlusWordLength > max)
{
max = indexPlusWordLength;
}
}
}
I would appreciate it if someone could point out where did I go wrong in my algorithm. Moreover, I kinda feel this is a very naive implementation. Maybe there is a better approach to solve this problem than trying to try out combinations of indexes?
Thanks!
Upvotes: 0
Views: 274
Reputation: 2650
This is a much simpler algorithm that will give you the shortest substring.
void Main()
{
String source = "aaa wwwww fgffsd ththththt sss ww sgsgsgsghs bfbfb hhh sdfg kkk " +
"dhdhtrherhrhrthrthrt ddfhdetehehe kkk wdwd aaa vcvc hhh zxzx sss ww nbnbn";
List<String> criteria = new List<string> { "kkk", "aaa", "sss ww", "hhh" };
var result = GetAllSubstringContainingCriteria(source, criteria)
.OrderBy(sub => sub.Length).FirstOrDefault();
// result is "kkk wdwd aaa vcvc hhh zxzx sss ww"
}
private IEnumerable<string> GetAllSubstringContainingCriteria(
string source, List<string> criteria)
{
for (int i = 0; i < source.Length; i++)
{
var subString = source.Substring(i);
if (criteria.Any(crit => subString.StartsWith(crit)))
{
var lastWordIndex =
GetLastCharacterIndexFromLastCriteriaInSubstring(subString, criteria);
if (lastWordIndex >= 0)
yield return string.Join(" ", subString.Substring(0, lastWordIndex));
}
else
continue;
}
}
private int GetLastCharacterIndexFromLastCriteriaInSubstring(
string subString, List<string> criteria)
{
var results = criteria.Select(crit => new {
index = subString.IndexOf(crit),
criteria = crit});
return results.All(result => result.index >= 0)
? results.Select(result => result.index + result.criteria.Length).Max()
: -1;
}
Upvotes: 2
Reputation:
Let the Java built-in classes do the work. How about converting your criteria to a regular expression Pattern. If the criteria are X or Y or Z . . ., convert this into a regular expression of the form "(X)|(Y)|(Z)|...", compile it, and execute it against the source string.
This, of course, returns the leftmost match. You could code a very straightforward loop that iterates across all occurrences, caches them, and chooses the shortest--or the leftmost shortest--or, if two or more are equally short, then all of those.
Upvotes: 0