Reputation: 2121
How to query inverted index list / collection if the keywords are dynamic (user can input as many as he / she wants)?
I am quite confused in building the "WHERE" clause, since the number of keywords is not a fixed amount.
Just in case if someone doesn't familiar with inverted index : http://en.wikipedia.org/wiki/Inverted_index
This is the class model of an index :
public class Index
{
public string word;
public List<int> referenceIDs;
//constructor
}
And this is the collection :
List<Index> invertedIndices = new List<Index>();
Thank you.
P.S. I prefer an answer in lambda expression, if possible, although any sql based language should be fine too.
EDITED :
Upvotes: 1
Views: 285
Reputation: 5863
var final = invertedIndices.Where(x => words.Contains(x.word))
.SelectMany(y => y.referenceIDs)
.GroupBy(z => z)
.Where(a => a.Count() == words.Count())
.Select(b => b.Key);
The unit test below demonstrates that this query retrieves only the expected results. I could have used a JOIN if I had converted the "words" list into a dictionary or some custom reference type. As it is, you can't join a list of strings with a list of reference types.
[TestMethod]
public void InvertedIndiciesSearchReturnsMatchOnAllKeywords()
{
var words = new List<string>() { "Cow", "Horse" };
var invertedIndices = new List<Index>()
{
new Index { word = "Pig", referenceIDs = new List<int>() { 1, 2, 8 }},
new Index { word = "Chicken", referenceIDs = new List<int>() { 4, 8 }},
new Index { word = "Horse", referenceIDs = new List<int>() { 1, 2, 8 }},
new Index { word = "Goat", referenceIDs = new List<int>() { 3 }},
new Index { word = "Cow", referenceIDs = new List<int>() { 1, 3 }},
new Index { word = "Coward", referenceIDs = new List<int>() { 999 }}
};
// Contains is searching for x.word _in the list_ "words", not searching
// to see if any of the _strings within words_ contains x.word.
var final = invertedIndices.Where(x => words.Contains(x.word))
.SelectMany(y => y.referenceIDs)
// now limit the results by getting only those reference IDs
// that appeared for every item in the input list
.GroupBy(z => z)
.Where(a => a.Count() == words.Count())
.Select(b => b.Key);
Assert.AreEqual(1, final.Count(), "result count");
Assert.AreEqual(1, final.First(), "value '1' is shared by Cow and Horse and should be the only result");
}
Upvotes: 1
Reputation: 6958
One way to do this would be to design your own collection. A Dictionary<string, List<int>>
would serve well as the underlying collection. This makes your lookups quite fast. Here's a partial implementation of such a class that shows how one such lookup would look like:
public class InvertedIndexCollection : IDictionary<string, List<int>>
{
public class IndexedWord
{
public string Key
{
get
{
return kvp.Key;
}
}
public List<int> Value
{
get
{
return kvp.Value;
}
}
private KeyValuePair<string, List<int>> kvp = new KeyValuePair<string, List<int>>();
public IndexedWord()
{
}
public IndexedWord(string _key, List<int> _newvalue)
{
kvp = new KeyValuePair<string, List<int>>(_key, _newvalue.OrderBy(x => x).ToList());
}
}
private Dictionary<string, List<int>> Collection = new Dictionary<string, List<int>>();
public int Count
{
get
{
return Collection.Count;
}
}
public InvertedIndexCollection()
{
}
public InvertedIndexCollection(Dictionary<string, List<int>> NewCollection)
{
Collection = NewCollection;
}
public List<int> this[string key]
{
get
{
return Collection[key];
}
set
{
Collection[key] = value;
}
}
public void Add(IndexedWord NewItem)
{
if(Collection.ContainsKey(NewItem.Key))
Collection[NewItem.Key].AddRange(NewItem.Value.Where(x => !Collection[NewItem.Key].Contains(x)));
else
Collection.Add(NewItem.Key, NewItem.Value);
}
public void Add(string Newkey, int Index)
{
if(Collection.ContainsKey(Newkey))
{
Collection[Newkey].Add(Index);
Collection[Newkey].Sort();
}
else
Collection.Add(Newkey, new List<int> { Index });
}
public List<int> FindIndices(string InputString, string Delimiter)
{
return FindIndices(InputString.Split(Delimiter.ToArray(), StringSplitOptions.RemoveEmptyEntries));
}
//This return a list of indices that contain all the search words. You will
//probably need to work out how to implement partial results, but this
//should give you a start
public List<int> FindIndices(IEnumerable<string> InputArray)
{
//Get a list of indices for each word
var templist = (from word in InputArray
where Collection.ContainsKey(word)
select Collection[word]);
//Flatten the lists and remove duplicates and return every index that is
//common to all the words.
return (from index in templist.SelectMany(x => x).Distinct()
where templist.All(x => x.Contains(index))
select index).ToList();
}
public void Add(string key, List<int> value)
{
Collection.Add(key, value);
}
public bool ContainsKey(string key)
{
return Collection.ContainsKey(key);
}
public ICollection<string> Keys
{
get
{
return Collection.Keys;
}
}
public bool Remove(string key)
{
return Collection.Remove(key);
}
public bool TryGetValue(string key, out List<int> value)
{
return Collection.TryGetValue(key, out value);
}
public ICollection<List<int>> Values
{
get
{
return Collection.Values;
}
}
public void Clear()
{
Collection.Clear();
}
public bool Contains(KeyValuePair<string, List<int>> item)
{
return Collection.Contains(item);
}
public IEnumerator<KeyValuePair<string, List<int>>> GetEnumerator()
{
return Collection.GetEnumerator();
}
System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
{
return Collection.GetEnumerator();
}
public void Add(KeyValuePair<string, List<int>> item)
{
throw new NotImplementedException();
}
public void CopyTo(KeyValuePair<string, List<int>>[] array, int arrayIndex)
{
throw new NotImplementedException();
}
public bool IsReadOnly
{
get
{
throw new NotImplementedException();
}
}
public bool Remove(KeyValuePair<string, List<int>> item)
{
throw new NotImplementedException();
}
}
Upvotes: 0