Reputation: 360
I'm writing a basic Lucene.Net application to index what are essentially forum posts. To simplify, each Post document has a URL and some Content. For each given thread I'm indexing each Post as a separate document (indexing whole threads as single documents returns too many false positives when searching).
The problem I'm having is dealing with multiple Post documents having the same URL in my result sets. When I search and return 10 results, I want each result to refer to a different URL.
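For reference, the indexing side looks roughly like this (a simplified sketch: post and writer stand in for my own types, and the field names are illustrative, with "body" matching the query parser below):
// rough sketch of how each Post is indexed
Document doc = new Document();
doc.Add(new Field("url", post.Url, Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.Add(new Field("body", post.Content, Field.Store.YES, Field.Index.ANALYZED));
writer.AddDocument(doc);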
Currently, I have something along the lines of the following:
// setup
StandardAnalyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
FSDirectory directory = FSDirectory.Open(indexLocation);
IndexSearcher searcher = new IndexSearcher(directory);
QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "body", analyzer);
// search
Query query = parser.Parse(queryString);
TopDocs topDocs = searcher.Search(query, null, 10);
However, of the ten results returned there may be only 7 unique URLs. I've looked at discarding the duplicates and searching again with a larger result set, skipping the results I've already seen (similar to pagination), until I have 10 unique URLs, but this raises questions such as: when should I stop because there are no more results?
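Roughly, that over-fetch approach would look like this (just a sketch, continuing from the setup above; "url" is the stored field holding each post's URL, and the page-size doubling is arbitrary):
// sketch of the "fetch more and discard duplicates" approach - not what I want to ship
List<Document> unique = new List<Document>();
HashSet<string> seenUrls = new HashSet<string>();
int fetch = 10;
while (unique.Count < 10)
{
    TopDocs page = searcher.Search(query, null, fetch);
    foreach (ScoreDoc sd in page.ScoreDocs)
    {
        Document doc = searcher.Doc(sd.Doc);
        if (seenUrls.Add(doc.Get("url")))   // Add returns false for a URL we've already kept
            unique.Add(doc);
        if (unique.Count == 10) break;
    }
    if (fetch >= page.TotalHits) break;     // no more results in the index - the awkward stopping case
    fetch *= 2;                             // otherwise retry with a larger page
}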
It feels like there should be a way of filtering at the TopDocs topDocs = searcher.Search() call itself, so that it returns 10 results with unique URLs. I can't find anything about this (perhaps I'm not using the correct terminology), but I'm sure a lot of other applications must have solved this before... Does anything like this already exist, or can anyone offer pointers on how to go about implementing it?
Upvotes: 2
Views: 688
Reputation: 76
I have developed this functionality as an extra filter for Lucene 2.9.x, and I have since found that it would need to be rewritten completely for 4.8.
So, in case you are using a 2.9.x version, here is a solution: write your own hit collector whose Collect method also checks whether the document is in a set of unique documents (a bit array). The bit array has to be constructed once and cached (source):
public class DistinctValuesFilter
{
    #region ctor

    public DistinctValuesFilter(IndexReader searchReader, ISearchRequest request, int docLength, Analyzer anlzr)
    {
        this.distinctBy = StringHelper.Intern(request.DistinctBy);
        this.processedMask = new OpenBitSetDISI(docLength);
        FindDuplicateTermsDirectly(searchReader);
        iireader = searchReader;
        // lazily load the per-document field values from the field cache on first use
        ivalue = new Lazy<string[]>(() => FieldCache_Fields.DEFAULT.GetStrings(iireader, distinctBy));
    }

    /// <summary>
    /// Initializes the duplicates hash set and the array of positions where duplicates are located.
    /// Code partially taken from Lucene: Lucene.Net.Search.FieldCacheImpl.StringIndexCache,
    /// protected internal override object CreateValue(IndexReader reader, Entry entryKey)
    /// </summary>
    /// <param name="ireader">index reader to process duplicates</param>
    private void FindDuplicateTermsDirectly(IndexReader ireader)
    {
        var maxLength = ireader.MaxDoc();
        duplicates = new HashSet<int>(maxLength);
        duplicatesLocations = new int[maxLength];
        var termEnum = ireader.Terms(new Term(this.distinctBy));
        var termDocs = ireader.TermDocs();
        int num = 0;
        int k, p;
        int firstDocNo = -1;
        try
        {
            do
            {
                var term = termEnum.Term();
                if (term == null || term.Field() != this.distinctBy || num >= maxLength)
                    break;
                termDocs.Seek(termEnum);
                p = 0;
                while (termDocs.Next())
                {
                    k = termDocs.Doc();
                    duplicatesLocations[k] = num + 1; // 0 indicates a document with no value at all
                    if (p > 0) duplicates.Add(firstDocNo);
                    firstDocNo = k;
                    p++;
                }
                if (p > 1) duplicates.Add(firstDocNo);
                num++;
            }
            while (termEnum.Next());
        }
        finally
        {
            termDocs.Close();
            termEnum.Close();
        }
    }

    protected IndexReader iireader;
    protected string distinctBy;
    private HashSet<int> duplicates;
    private int[] duplicatesLocations;
    private OpenBitSetDISI processedMask;
    private Lazy<string[]> ivalue;

    #endregion

    public bool IsDistinct(int docIndex)
    {
        if (this.processedMask.FastGet(docIndex)) return false; // a document with this value was already processed
        if (duplicatesLocations[docIndex] == 0) return false;   // the field value is missing entirely
        if (!duplicates.Contains(docIndex)) return true;        // the value occurs only once
        var dval = duplicatesLocations[docIndex];
        var v = ivalue.Value;
        var xv = string.Empty;
        for (int i = 0; i < duplicatesLocations.Length; i++)
        {
            if (duplicatesLocations[i] == dval)
            {
                this.processedMask.FastSet(i);
                if (!string.IsNullOrEmpty(xv) && xv != v[i])
                {
                    throw new NotSupportedException($"values are not the same ({i}): [{xv}] != [{v[i]}]");
                }
                xv = v[i];
            }
        }
        return true;
    }
}
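For completeness, here is a rough sketch of the kind of hit collector I mean, wired to the filter above (the class is mine, written against the 2.9.x-style API; member names such as AcceptsDocsOutOfOrder differ slightly between releases, so adjust for your exact version). Note it collects distinct hits in doc-id order; for relevance ordering you would combine the IsDistinct check with a score-based priority queue, as TopScoreDocCollector does.
// Sketch only: keeps the first maxHits documents whose distinct-by value has not been seen yet.
public class DistinctCollector : Collector
{
    private readonly DistinctValuesFilter filter;
    private readonly int maxHits;
    private readonly List<int> hits = new List<int>();
    private int docBase;

    public DistinctCollector(DistinctValuesFilter filter, int maxHits)
    {
        this.filter = filter;
        this.maxHits = maxHits;
    }

    public List<int> Hits { get { return hits; } }

    public override void SetScorer(Scorer scorer)
    {
        // scores are ignored in this sketch
    }

    public override void SetNextReader(IndexReader reader, int docBase)
    {
        this.docBase = docBase; // the filter was built on the top-level reader, so remember the segment offset
    }

    public override void Collect(int doc)
    {
        int topLevelDoc = docBase + doc;
        if (hits.Count < maxHits && filter.IsDistinct(topLevelDoc))
            hits.Add(topLevelDoc);
    }

    public override bool AcceptsDocsOutOfOrder()
    {
        return false; // collect in index order so the first document per value wins
    }
}

// usage sketch: build the filter once per reader, then search with the collector
// var filter = new DistinctValuesFilter(reader, request, reader.MaxDoc(), analyzer);
// var collector = new DistinctCollector(filter, 10);
// searcher.Search(query, collector);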
Upvotes: 0