Reputation: 66
I'm new to Lucene, so I don't know if this is possible, but I have an index and I would like to get the total number of occurrences of a phrase in a subset of the index (the subset is defined by a filter). I can use a FilteredQuery with my Filter and a PhraseQuery to search for the phrase, and thus I can count the documents in which the phrase occurs, but I can't seem to find a way to count the number of matches per document as well.
Upvotes: 2
Views: 969
Reputation: 25150
You can do this, see LUCENE-2590 for details.
For example code you can look at the unit tests for this feature.
I've copied the relevant code for phrase searches below.
This is the collector:
private static class CountingCollector extends Collector {
private final Collector other;
private int docBase;
public final Map<Integer, Map<Query, Float>> docCounts = new HashMap<Integer, Map<Query, Float>>();
private final Map<Query, Scorer> subScorers = new HashMap<Query, Scorer>();
private final ScorerVisitor<Query, Query, Scorer> visitor = new MockScorerVisitor();
private final EnumSet<Occur> collect;
private class MockScorerVisitor extends ScorerVisitor<Query, Query, Scorer> {
@Override
public void visitOptional(Query parent, Query child, Scorer scorer) {
if (collect.contains(Occur.SHOULD))
subScorers.put(child, scorer);
}
@Override
public void visitProhibited(Query parent, Query child, Scorer scorer) {
if (collect.contains(Occur.MUST_NOT))
subScorers.put(child, scorer);
}
@Override
public void visitRequired(Query parent, Query child, Scorer scorer) {
if (collect.contains(Occur.MUST))
subScorers.put(child, scorer);
}
}
public CountingCollector(Collector other) {
this(other, EnumSet.allOf(Occur.class));
}
public CountingCollector(Collector other, EnumSet<Occur> collect) {
this.other = other;
this.collect = collect;
}
@Override
public void setScorer(Scorer scorer) throws IOException {
other.setScorer(scorer);
scorer.visitScorers(visitor);
}
@Override
public void collect(int doc) throws IOException {
final Map<Query, Float> freqs = new HashMap<Query, Float>();
for (Map.Entry<Query, Scorer> ent : subScorers.entrySet()) {
Scorer value = ent.getValue();
int matchId = value.docID();
freqs.put(ent.getKey(), matchId == doc ? value.freq() : 0.0f);
}
docCounts.put(doc + docBase, freqs);
other.collect(doc);
}
@Override
public void setNextReader(IndexReader reader, int docBase)
throws IOException {
this.docBase = docBase;
other.setNextReader(reader, docBase);
}
@Override
public boolean acceptsDocsOutOfOrder() {
return other.acceptsDocsOutOfOrder();
}
}
The unit test is:
/**
 * Runs a two-term phrase query through a {@code CountingCollector} and checks
 * the recorded frequencies: one entry per document up to {@code s.maxDoc()},
 * with expected values alternating 2.0 (even doc ids) and 1.0 (odd doc ids).
 */
@Test
public void testPhraseQuery() throws Exception {
  // Phrase made of the terms "b" then "c" in field "f".
  PhraseQuery phrase = new PhraseQuery();
  phrase.add(new Term("f", "b"));
  phrase.add(new Term("f", "c"));

  CountingCollector counter =
      new CountingCollector(TopScoreDocCollector.create(10, true));
  s.search(phrase, null, counter);

  final int maxDocs = s.maxDoc();
  assertEquals(maxDocs, counter.docCounts.size());

  // Documents are checked in pairs: the even id first, then the following odd id.
  for (int even = 0; even < maxDocs; even += 2) {
    Map<Query, Float> evenFreqs = counter.docCounts.get(even);
    assertEquals(1, evenFreqs.size());
    assertEquals(2.0F, evenFreqs.get(phrase), FLOAT_TOLERANCE);

    Map<Query, Float> oddFreqs = counter.docCounts.get(even + 1);
    assertEquals(1, oddFreqs.size());
    assertEquals(1.0F, oddFreqs.get(phrase), FLOAT_TOLERANCE);
  }
}
Upvotes: 3