Reputation: 1
I use pylucence 9.4.1 to index a document and I just noticed a weird problem. There are some words, e.g. 'baby', that are present in the document but pylucene is unable to find them in the index.
This is my code to index the document: (The document can be downloaded from here.
filepath = os.getcwd() + '/' + 'wiki_movie_plots_deduped.csv'
def indexDocument(title, year, plot):
ft = FieldType()
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
ft.setStored(True)
ft.setTokenized(True)
ft.setStoreTermVectors(True)
ft.setStoreTermVectorOffsets(True)
ft.setStoreTermVectorPositions(True)
doc = document.Document()
doc.add(document.Field("Title", title, ft))
doc.add(document.Field("Plot", plot, ft))
writer.addDocument(doc)
def CloseWriter():
writer.close()
def makeInvertedIndex(file_path):
df = pd.read_csv(file_path)
print(df.columns)
docid = 0
for i in df.index:
print(docid, '-', df['Title'][i])
indexDocument(df['Title'][i], df['Release Year'][i], df['Plot'][i])
docid += 1
indexPath = File('index/').toPath()
indexDir = FSDirectory.open(indexPath)
writerConfig = IndexWriterConfig(EnglishAnalyzer())
writer = IndexWriter(indexDir, writerConfig)
inverted = makeInvertedIndex(filepath)
CloseWriter()
This is the code to search the created index for a keyword:
keyword = 'baby'
fieldname = 'Title'
result = list()
indexPath = File('index/').toPath()
directory = FSDirectory.open(indexPath)
analyzer = StandardAnalyzer()
reader = DirectoryReader.open(directory)
searcher = IndexSearcher(DirectoryReader.open(directory))
query = QueryParser(fieldname, analyzer).parse(keyword)
print('query', query)
numdocs = searcher.count(query)
print("#-docs:", numdocs)
searcher.setSimilarity(BM25Similarity(1.2,0.75))
scoreDocs = searcher.search(query, 1000).scoreDocs # it returns TopDocs object containing scoreDocs and totalHits
# scoreDoc object contains docId and score
print('total hit:', searcher.search(query, 100).totalHits)
print("%s total matching documents" % (len(scoreDocs)))
Any help to understand the problem is appreciated.
Upvotes: 0
Views: 34