Bibek Shakya
Bibek Shakya

Reputation: 1273

error is occuring while creating custom tokenizer in lucene 7.3

I m trying to create new tokenizer by refering book tamingtext (which uses lucene 3.+ api) using new lucene api 7.3, but it is giving me error as mentioned below

java.lang.IllegalStateException: TokenStream contract violation: reset()/close() call missing, reset() called multiple times, or subclass does not call super.reset(). Please see Javadocs of TokenStream class for more information about the correct consuming workflow.

at org.apache.lucene.analysis.Tokenizer$1.read(Tokenizer.java:109)
at java.io.Reader.read(Reader.java:140)
at solr.SentenceTokenizer.fillSentences(SentenceTokenizer.java:43)
at solr.SentenceTokenizer.incrementToken(SentenceTokenizer.java:55)
at solr.NameFilter.fillSpans(NameFilter.java:56)
at solr.NameFilter.incrementToken(NameFilter.java:88)
at spec.solr.NameFilterTest.testNameFilter(NameFilterTest.java:81)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)

Here is my SentenceTokenizer class

Initializing method, in older api there was super(Reader); but in current api there is no access to Reader class

 public SentenceTokenizer(SentenceDetector detector) {
    super();
    setReader(reader);
    this.sentenceDetector = detector;
 }

Here is my reset method

@Override
public void reset() throws IOException {
    super.reset();
    sentenceDetector = null;
}

When i tried to access this method from custom TokenFilter, I m getting above error

public void fillSentences() throws IOException {
    char[] c = new char[256];
    int size = 0;
    StringBuilder stringBuilder = new StringBuilder();
    while ((size = input.read(c)) >= 0) {
        stringBuilder.append(c, 0, size);
    }
    String temp = stringBuilder.toString();
    inputSentence = temp.toCharArray();
    sentenceSpans = sentenceDetector.sentPosDetect(temp);
    tokenOffset = 0;
}



@Override
public final boolean incrementToken() throws IOException {
    if (sentenceSpans == null) {
        //invoking following method
        fillSentences();
    }
    if (tokenOffset == sentenceSpans.length) {
        return false;
    }
    Span sentenceSpan = sentenceSpans[tokenOffset];
    clearAttributes();
    int start = sentenceSpan.getStart();
    int end = sentenceSpan.getEnd();
    charTermAttribute.copyBuffer(inputSentence, start, end - start);
    positionIncrementAttribute.setPositionIncrement(1);
    offsetAttribute.setOffset(start, end);
    tokenOffset++;
    return true;
}

Here is my custom TokenFilter class

public final class NameFilter extends TokenFilter {
public static final String NE_PREFIX = "NE_";
private final Tokenizer tokenizer;
private final String[] tokenTypeNames;
private final NameFinderME[] nameFinderME;
private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);

private String text;
private int baseOffset;

private Span[] spans;
private String[] tokens;
private Span[][] foundNames;

private boolean[][] tokenTypes;

private int spanOffsets = 0;
private final Queue<AttributeSource.State> tokenQueue =
        new LinkedList<>();

public NameFilter(TokenStream in, String[] modelNames, NameFinderME[] nameFinderME) {
    super(in);
    this.tokenizer = SimpleTokenizer.INSTANCE;
    this.nameFinderME = nameFinderME;
    this.tokenTypeNames = new String[modelNames.length];
    for (int i = 0; i < modelNames.length; i++) {
        this.tokenTypeNames[i] = NE_PREFIX + modelNames[i];
    }
}

//consumes tokens from the upstream tokenizer and buffer them in a
//StringBuilder whose contents will be passed to opennlp
protected boolean fillSpans() throws IOException {
    if (!this.input.incrementToken()) return false;
    //process the next sentence from the upstream tokenizer
    this.text = input.getAttribute(CharTermAttribute.class).toString();
    this.baseOffset = this.input.getAttribute(OffsetAttribute.class).startOffset();
    this.spans = this.tokenizer.tokenizePos(text);
    this.tokens = Span.spansToStrings(spans, text);
    this.foundNames = new Span[this.nameFinderME.length][];
    for (int i = 0; i < nameFinderME.length; i++) {
        this.foundNames[i] = nameFinderME[i].find(tokens);
    }
    //insize
    this.tokenTypes = new boolean[this.tokens.length][this.nameFinderME.length];
    for (int i = 0; i < nameFinderME.length; i++) {
        Span[] spans = foundNames[i];
        for (int j = 0; j < spans.length; j++) {
            int start = spans[j].getStart();
            int end = spans[j].getEnd();
            for (int k = start; k < end; k++) {
                this.tokenTypes[k][i] = true;
            }
        }
    }
    spanOffsets = 0;
    return true;
}

@Override
public boolean incrementToken() throws IOException {
    //if there's nothing in the queue
    if(tokenQueue.peek()==null){
        //no span or spans consumed
        if (spans==null||spanOffsets>=spans.length){
            if (!fillSpans())return false;
        }
        if (spanOffsets>=spans.length)return false;
        //copy the token and any types
        clearAttributes();
        keywordAttribute.setKeyword(false);
        positionIncrementAttribute.setPositionIncrement(1);
        int startOffset = baseOffset +spans[spanOffsets].getStart();
        int endOffset = baseOffset+spans[spanOffsets].getEnd();
        offsetAttribute.setOffset(startOffset,endOffset);
        charTermAttribute.setEmpty()
                .append(tokens[spanOffsets]);
        //determine of the current token is of a named entity type, if so
        //push the current state into the queue and add a token reflecting
        // any matching entity types.
        boolean [] types = tokenTypes[spanOffsets];
        for (int i = 0; i < nameFinderME.length; i++) {
            if (types[i]){
                keywordAttribute.setKeyword(true);
                positionIncrementAttribute.setPositionIncrement(0);
                tokenQueue.add(captureState());
                positionIncrementAttribute.setPositionIncrement(1);
                charTermAttribute.setEmpty().append(tokenTypeNames[i]);
            }
        }
    }
    spanOffsets++;
    return true;
}

@Override
public void close() throws IOException {
    super.close();
    reset();
}
@Override
public void reset() throws IOException {
    super.reset();
    this.spanOffsets = 0;
    this.spans = null;
}

@Override
public void end() throws IOException {
    super.end();
    reset();
}
}

here is my test case for following class

@Test
public void testNameFilter() throws IOException {
    Reader in = new StringReader(input);
    Tokenizer tokenizer = new SentenceTokenizer( detector);
    tokenizer.reset();
    NameFilter nameFilter = new NameFilter(tokenizer, modelName, nameFinderMES);
    nameFilter.reset();
    CharTermAttribute charTermAttribute;
    PositionIncrementAttribute positionIncrementAttribute;
    OffsetAttribute offsetAttribute;
    int pass = 0;
    while (pass < 2) {
        int pos = 0;
        int lastStart = 0;
        int lastEnd = 0;
        //error occur on below invoke
        while (nameFilter.incrementToken()) {

    }
}

Upvotes: 0

Views: 214

Answers (1)

Bibek Shakya
Bibek Shakya

Reputation: 1273

I have added following changes in my code and it work fine but i m now sure it is correct answer

public SentenceTokenizer(Reader reader,SentenceDetector sentenceDetector) {
    super();
    this.input =reader;
    this.sentenceDetector = sentenceDetector;
}

Upvotes: 1

Related Questions