Reputation: 731
I'm trying to make Solr search phone numbers which are stored like this +79876543210
using queries like these:
+79876543210
79876543210
89876543210 <-- '+7' is replaced with region specific code '8'
9876543210 <-- '+7' entirely removed
This is just one example. Another is landline phone numbers:
+78662123456 <-- '+78662' is a specific region code
78662123456
88662123456
8662123456
123456 <-- region code entirely removed
One way I could manage this is using a separate field which is filled with these variants and used solely during search.
But this has issues with highlighting (it returns <em>123456</em>
to be highlighted whereas the real value shown to user is +78662123456
).
I thought that maybe it's best to make these indices using just Solr, but how?
My first thought was to use a managed synonyms filter and pass the synonyms along with each added record. But the docs explicitly state:
Changes made to managed resources via this REST API are not applied to the active Solr components until the Solr collection (or Solr core in single server mode) is reloaded.
So reloading a core every time after adding a record is not the way to go. Other issues involve keeping these synonyms up to date with records.
Could there be another way to solve this?
Upvotes: 0
Views: 89
Reputation: 731
Thanks to this comment (by MatsLindh), I've managed to assemble a simple filter based on the built-in EdgeNGramTokenFilter
:
package com.step4;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Token filter that expands each incoming term into phone-number search variants.
 *
 * <p>For every input token the filter tries each entry of {@link #PHONE_PATTERNS} in order and
 * emits one rewritten term per matching pattern, followed by the unmodified original term.
 * All terms produced from the same input token share one position: the first emitted term
 * carries the pending position increment and every later one is stacked on it with an
 * increment of 0.
 */
public class ReverseCustomFilter extends TokenFilter {

    /**
     * Prefix rewrites applied to each term. Patterns are anchored with {@code ^} so that only a
     * leading country/region prefix is rewritten, never a "+7" occurring later in the term.
     */
    private static final PatternReplacementPair[] PHONE_PATTERNS = {
        new PatternReplacementPair("^\\+7", "7"),
        new PatternReplacementPair("^\\+7", "8"),
        new PatternReplacementPair("^\\+7", ""),
        new PatternReplacementPair("^\\+78662", ""),
        new PatternReplacementPair("^\\+78663", ""),
    };

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncrAtt =
            addAttribute(PositionIncrementAttribute.class);

    /** 0 = need a new input token; k in 1..length = next pattern to try is k-1; length+1 = emit original. */
    private int curPatternIndex;

    /** Position increment not yet handed to an emitted token. */
    private int curPosIncr;

    /** Snapshot of the current input token's attributes, restored before each emission. */
    private State curState;

    /**
     * Creates the filter on top of the given stream.
     *
     * @param input the token stream whose terms are expanded
     */
    public ReverseCustomFilter(TokenStream input) {
        super(input);
    }

    /**
     * Advances to the next output token: a rewritten variant of the current input token, or the
     * original input token once all patterns have been tried.
     *
     * @return {@code false} when the underlying stream is exhausted, {@code true} otherwise
     * @throws IOException if the underlying stream fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        while (true) {
            if (curPatternIndex == 0) {
                if (!input.incrementToken()) {
                    return false;
                }
                curState = captureState();
                curPosIncr += posIncrAtt.getPositionIncrement();
                curPatternIndex = 1;
            }

            if (curPatternIndex > PHONE_PATTERNS.length) {
                // All patterns tried: emit the original term last.
                restoreState(curState);
                // Hand over whatever increment is still pending. This is the input's own
                // increment when no variant was emitted, and 0 otherwise, so the pending
                // value never leaks into the next input token.
                posIncrAtt.setPositionIncrement(curPosIncr);
                curPosIncr = 0;
                curPatternIndex = 0;
                return true;
            }

            PatternReplacementPair pair = PHONE_PATTERNS[curPatternIndex - 1];
            curPatternIndex++;

            // Start from the pristine input token; a previous variant may have altered the term.
            restoreState(curState);
            Matcher matcher = pair.getPattern().matcher(termAtt);
            if (!matcher.find()) {
                continue;
            }

            // Compute the replacement before the term buffer the matcher reads from is modified.
            String replaced = matcher.replaceFirst(pair.getReplacement());
            if (replaced.isEmpty()) {
                // The prefix was the whole term; an empty term is useless as a search variant.
                continue;
            }

            termAtt.setEmpty().append(replaced);
            posIncrAtt.setPositionIncrement(curPosIncr);
            curPosIncr = 0;
            return true;
        }
    }

    /**
     * Resets the filter so the stream can be consumed again.
     *
     * @throws IOException if the underlying stream fails to reset
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        curPatternIndex = 0;
        curPosIncr = 0;
        curState = null;
    }

    /**
     * Finishes the stream, adding any still-pending increment to the end-of-stream position
     * increment instead of overwriting the value set by upstream components.
     *
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void end() throws IOException {
        super.end();
        posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + curPosIncr);
    }

    /** A compiled regular expression together with the text that replaces its first match. */
    private static class PatternReplacementPair {
        private final Pattern pattern;
        private final String replacement;

        /**
         * @param pattern regular expression source; compiled once here
         * @param replacement replacement text, interpreted by {@link Matcher#replaceFirst(String)}
         */
        public PatternReplacementPair(String pattern, String replacement) {
            this.pattern = Pattern.compile(pattern);
            this.replacement = replacement;
        }

        public Pattern getPattern() {
            return pattern;
        }

        public String getReplacement() {
            return replacement;
        }
    }
}
Upvotes: 0