Reputation: 1273
I m trying to create new tokenizer by refering book tamingtext (which uses lucene 3.+ api) using new lucene api 7.3, but it is giving me error as mentioned below
java.lang.IllegalStateException: TokenStream contract violation: reset()/close() call missing, reset() called multiple times, or subclass does not call super.reset(). Please see Javadocs of TokenStream class for more information about the correct consuming workflow.
at org.apache.lucene.analysis.Tokenizer$1.read(Tokenizer.java:109)
at java.io.Reader.read(Reader.java:140)
at solr.SentenceTokenizer.fillSentences(SentenceTokenizer.java:43)
at solr.SentenceTokenizer.incrementToken(SentenceTokenizer.java:55)
at solr.NameFilter.fillSpans(NameFilter.java:56)
at solr.NameFilter.incrementToken(NameFilter.java:88)
at spec.solr.NameFilterTest.testNameFilter(NameFilterTest.java:81)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
Here is my SentenceTokenizer
class
Initializing method, in older api there was super(Reader); but in current api there is no access to Reader
class
public SentenceTokenizer(SentenceDetector detector) {
super();
setReader(reader);
this.sentenceDetector = detector;
}
Here is my reset method
@Override
public void reset() throws IOException {
super.reset();
sentenceDetector = null;
}
When i tried to access this method from custom TokenFilter, I m getting above error
public void fillSentences() throws IOException {
char[] c = new char[256];
int size = 0;
StringBuilder stringBuilder = new StringBuilder();
while ((size = input.read(c)) >= 0) {
stringBuilder.append(c, 0, size);
}
String temp = stringBuilder.toString();
inputSentence = temp.toCharArray();
sentenceSpans = sentenceDetector.sentPosDetect(temp);
tokenOffset = 0;
}
@Override
public final boolean incrementToken() throws IOException {
if (sentenceSpans == null) {
//invoking following method
fillSentences();
}
if (tokenOffset == sentenceSpans.length) {
return false;
}
Span sentenceSpan = sentenceSpans[tokenOffset];
clearAttributes();
int start = sentenceSpan.getStart();
int end = sentenceSpan.getEnd();
charTermAttribute.copyBuffer(inputSentence, start, end - start);
positionIncrementAttribute.setPositionIncrement(1);
offsetAttribute.setOffset(start, end);
tokenOffset++;
return true;
}
Here is my custom TokenFilter class
public final class NameFilter extends TokenFilter {
public static final String NE_PREFIX = "NE_";
private final Tokenizer tokenizer;
private final String[] tokenTypeNames;
private final NameFinderME[] nameFinderME;
private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private String text;
private int baseOffset;
private Span[] spans;
private String[] tokens;
private Span[][] foundNames;
private boolean[][] tokenTypes;
private int spanOffsets = 0;
private final Queue<AttributeSource.State> tokenQueue =
new LinkedList<>();
public NameFilter(TokenStream in, String[] modelNames, NameFinderME[] nameFinderME) {
super(in);
this.tokenizer = SimpleTokenizer.INSTANCE;
this.nameFinderME = nameFinderME;
this.tokenTypeNames = new String[modelNames.length];
for (int i = 0; i < modelNames.length; i++) {
this.tokenTypeNames[i] = NE_PREFIX + modelNames[i];
}
}
//consumes tokens from the upstream tokenizer and buffer them in a
//StringBuilder whose contents will be passed to opennlp
protected boolean fillSpans() throws IOException {
if (!this.input.incrementToken()) return false;
//process the next sentence from the upstream tokenizer
this.text = input.getAttribute(CharTermAttribute.class).toString();
this.baseOffset = this.input.getAttribute(OffsetAttribute.class).startOffset();
this.spans = this.tokenizer.tokenizePos(text);
this.tokens = Span.spansToStrings(spans, text);
this.foundNames = new Span[this.nameFinderME.length][];
for (int i = 0; i < nameFinderME.length; i++) {
this.foundNames[i] = nameFinderME[i].find(tokens);
}
//insize
this.tokenTypes = new boolean[this.tokens.length][this.nameFinderME.length];
for (int i = 0; i < nameFinderME.length; i++) {
Span[] spans = foundNames[i];
for (int j = 0; j < spans.length; j++) {
int start = spans[j].getStart();
int end = spans[j].getEnd();
for (int k = start; k < end; k++) {
this.tokenTypes[k][i] = true;
}
}
}
spanOffsets = 0;
return true;
}
@Override
public boolean incrementToken() throws IOException {
//if there's nothing in the queue
if(tokenQueue.peek()==null){
//no span or spans consumed
if (spans==null||spanOffsets>=spans.length){
if (!fillSpans())return false;
}
if (spanOffsets>=spans.length)return false;
//copy the token and any types
clearAttributes();
keywordAttribute.setKeyword(false);
positionIncrementAttribute.setPositionIncrement(1);
int startOffset = baseOffset +spans[spanOffsets].getStart();
int endOffset = baseOffset+spans[spanOffsets].getEnd();
offsetAttribute.setOffset(startOffset,endOffset);
charTermAttribute.setEmpty()
.append(tokens[spanOffsets]);
//determine of the current token is of a named entity type, if so
//push the current state into the queue and add a token reflecting
// any matching entity types.
boolean [] types = tokenTypes[spanOffsets];
for (int i = 0; i < nameFinderME.length; i++) {
if (types[i]){
keywordAttribute.setKeyword(true);
positionIncrementAttribute.setPositionIncrement(0);
tokenQueue.add(captureState());
positionIncrementAttribute.setPositionIncrement(1);
charTermAttribute.setEmpty().append(tokenTypeNames[i]);
}
}
}
spanOffsets++;
return true;
}
@Override
public void close() throws IOException {
super.close();
reset();
}
@Override
public void reset() throws IOException {
super.reset();
this.spanOffsets = 0;
this.spans = null;
}
@Override
public void end() throws IOException {
super.end();
reset();
}
}
here is my test case for following class
@Test
public void testNameFilter() throws IOException {
Reader in = new StringReader(input);
Tokenizer tokenizer = new SentenceTokenizer( detector);
tokenizer.reset();
NameFilter nameFilter = new NameFilter(tokenizer, modelName, nameFinderMES);
nameFilter.reset();
CharTermAttribute charTermAttribute;
PositionIncrementAttribute positionIncrementAttribute;
OffsetAttribute offsetAttribute;
int pass = 0;
while (pass < 2) {
int pos = 0;
int lastStart = 0;
int lastEnd = 0;
//error occur on below invoke
while (nameFilter.incrementToken()) {
}
}
Upvotes: 0
Views: 214
Reputation: 1273
I have added following changes in my code and it work fine but i m now sure it is correct answer
public SentenceTokenizer(Reader reader,SentenceDetector sentenceDetector) {
super();
this.input =reader;
this.sentenceDetector = sentenceDetector;
}
Upvotes: 1