Reputation: 123
I have updated the Indexer and Searcher examples from the Lucene in Action 2nd edition book.
Indexer works ok but Searcher does not work.
I have indexed a bunch of txt files (Indexer filters txt files).
When I search with the Searcher class using a word I'm sure the txt files contain (it can be verified with grep) it finds 0 matching document.
There must be a problem with the code.
Here are the files pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.learning</groupId>
<artifactId>lucenebook</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>6.6.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>6.6.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
</project>
Indexer:
package lia.meetlucene;
/**
* Copyright Manning Publications Co.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific lan
*/
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.FileReader;
import java.nio.file.Paths;
// From chapter 1
/**
* This code was originally written for
* Erik's Lucene intro java.net article
*/
public class Indexer {
public static void main(String[] args) throws Exception {
if (args.length != 2) {
throw new IllegalArgumentException("Usage: java " + Indexer.class.getName()
+ " <index dir> <data dir>");
}
String indexDir = args[0]; //1
String dataDir = args[1]; //2
long start = System.currentTimeMillis();
Indexer indexer = new Indexer(indexDir);
int numIndexed;
try {
numIndexed = indexer.index(dataDir, new TextFilesFilter());
} finally {
indexer.close();
}
long end = System.currentTimeMillis();
System.out.println("Indexing " + numIndexed + " files took "
+ (end - start) + " milliseconds");
}
private IndexWriter writer;
public Indexer(String indexDir) throws IOException {
Directory dir = FSDirectory.open(Paths.get(indexDir));
writer = new IndexWriter(dir, new IndexWriterConfig()); //3
}
public void close() throws IOException {
writer.close(); //4
}
public int index(String dataDir, FileFilter filter)
throws Exception {
File[] files = new File(dataDir).listFiles();
for (File f: files) {
if (!f.isDirectory() &&
!f.isHidden() &&
f.exists() &&
f.canRead() &&
(filter == null || filter.accept(f))) {
indexFile(f);
}
}
return writer.numDocs(); //5
}
private static class TextFilesFilter implements FileFilter {
public boolean accept(File path) {
return path.getName().toLowerCase() //6
.endsWith(".txt"); //6
}
}
protected Document getDocument(File f) throws Exception {
Document doc = new Document();
doc.add(new Field("contents", new FileReader(f), new FieldType())); //7
FieldType notAnalyzed = new FieldType();
notAnalyzed.setTokenized(false);
notAnalyzed.setStored(true);
doc.add(new Field("filename", f.getName(), notAnalyzed //8
));//8
doc.add(new Field("fullpath", f.getCanonicalPath(), //9
notAnalyzed));//9
return doc;
}
private void indexFile(File f) throws Exception {
System.out.println("Indexing " + f.getCanonicalPath());
Document doc = getDocument(f);
writer.addDocument(doc); //10
}
}
/*
#1 Create index in this directory
#2 Index *.txt files from this directory
#3 Create Lucene IndexWriter
#4 Close IndexWriter
#5 Return number of documents indexed
#6 Index .txt files only, using FileFilter
#7 Index file content
#8 Index file name
#9 Index file full path
#10 Add document to Lucene index
*/
and Searcher:
package lia.meetlucene;
/**
* Copyright Manning Publications Co.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific lan
*/
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import java.io.IOException;
import java.nio.file.Paths;
// From chapter 1
/**
* This code was originally written for
* Erik's Lucene intro java.net article
*/
public class Searcher {
public static void main(String[] args) throws IllegalArgumentException,
IOException, ParseException {
if (args.length != 2) {
throw new IllegalArgumentException("Usage: java " + Searcher.class.getName()
+ " <index dir> <query>");
}
String indexDir = args[0]; //1
String q = args[1]; //2
search(indexDir, q);
}
public static void search(String indexDir, String q)
throws IOException, ParseException {
Directory dir = FSDirectory.open(Paths.get(indexDir)); //3
DirectoryReader directoryReader = DirectoryReader.open(dir);
IndexSearcher is = new IndexSearcher(directoryReader); //3
QueryParser parser = new QueryParser( // 4
"f", //4
new StandardAnalyzer( )); //4
Query query = parser.parse(q); //4
long start = System.currentTimeMillis();
TopDocs hits = is.search(query, 10); //5
long end = System.currentTimeMillis();
System.err.println("Found " + hits.totalHits + //6
" document(s) (in " + (end - start) + // 6
" milliseconds) that matched query '" + // 6
q + "':"); // 6
for(ScoreDoc scoreDoc : hits.scoreDocs) {
Document doc = is.doc(scoreDoc.doc); //7
System.out.println(doc.get("fullpath")); //8
}
//9
}
}
/*
#1 Parse provided index directory
#2 Parse provided query string
#3 Open index
#4 Parse query
#5 Search index
#6 Write search stats
#7 Retrieve matching document
#8 Display filename
#9 Close IndexSearcher
*/
Upvotes: 0
Views: 751
Reputation: 123
The problem was that the Searcher was not using the field that is indexed, which is named "contents", it was using "f" and it was matching nothing. Here is the corrected code, Searcher only changes "f" to "contents" and Indexer uses a TextField and better captures exceptions (the changes to Indexer are suggested by the answer submitted by femtoRgon). Note that it is better to use the current Lucene demo, as femtoRgon suggests, in fact comparing it with the code was the path to the solution. Corrected Indexer.java:
package lia.meetlucene;
/**
* Copyright Manning Publications Co.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific lan
*/
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.FileReader;
import java.nio.file.Paths;
// From chapter 1
/**
* This code was originally written for
* Erik's Lucene intro java.net article
*/
public class Indexer {
public static void main(String[] args) {
if (args.length != 2) {
throw new IllegalArgumentException("Usage: java " + Indexer.class.getName()
+ " <index dir> <data dir>");
}
String indexDir = args[0]; //1
String dataDir = args[1];//2
Indexer indexer = null;
long start = System.currentTimeMillis();
int numIndexed = 0;
try {
indexer = new Indexer(indexDir);
numIndexed = indexer.index(dataDir, new TextFilesFilter());
} catch(Exception e) {
e.printStackTrace();
} finally {
if (indexer != null)
try {
indexer.close();
} catch (IOException e) {
// ignored
}
}
long end = System.currentTimeMillis();
System.out.println("Indexing " + numIndexed + " files took "
+ (end - start) + " milliseconds");
}
private IndexWriter writer;
public Indexer(String indexDir) throws IOException {
Directory dir = FSDirectory.open(Paths.get(indexDir));
writer = new IndexWriter(dir, new IndexWriterConfig()); //3
}
public void close() throws IOException {
writer.close(); //4
}
public int index(String dataDir, FileFilter filter)
throws Exception {
File[] files = new File(dataDir).listFiles();
for (File f: files) {
if (!f.isDirectory() &&
!f.isHidden() &&
f.exists() &&
f.canRead() &&
(filter == null || filter.accept(f))) {
indexFile(f);
}
}
return writer.numDocs(); //5
}
private static class TextFilesFilter implements FileFilter {
public boolean accept(File path) {
return path.getName().toLowerCase() //6
.endsWith(".txt"); //6
}
}
protected Document getDocument(File f) throws Exception {
Document doc = new Document();
doc.add(new TextField("contents", new FileReader(f))); //7
FieldType notAnalyzed = new FieldType();
notAnalyzed.setTokenized(false);
notAnalyzed.setStored(true);
doc.add(new Field("filename", f.getName(), notAnalyzed //8
));//8
doc.add(new Field("fullpath", f.getCanonicalPath(), //9
notAnalyzed));//9
return doc;
}
private void indexFile(File f) throws Exception {
System.out.println("Indexing " + f.getCanonicalPath());
Document doc = getDocument(f);
writer.addDocument(doc); //10
}
}
/*
#1 Create index in this directory
#2 Index *.txt files from this directory
#3 Create Lucene IndexWriter
#4 Close IndexWriter
#5 Return number of documents indexed
#6 Index .txt files only, using FileFilter
#7 Index file content
#8 Index file name
#9 Index file full path
#10 Add document to Lucene index
*/
and Searcher.java:
package lia.meetlucene;
/**
* Copyright Manning Publications Co.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific lan
*/
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import java.io.IOException;
import java.nio.file.Paths;
// From chapter 1
/**
* This code was originally written for
* Erik's Lucene intro java.net article
*/
public class Searcher {
public static void main(String[] args) throws IllegalArgumentException,
IOException, ParseException {
if (args.length != 2) {
throw new IllegalArgumentException("Usage: java " + Searcher.class.getName()
+ " <index dir> <query>");
}
String indexDir = args[0]; //1
String q = args[1]; //2
search(indexDir, q);
}
public static void search(String indexDir, String q)
throws IOException, ParseException {
Directory dir = FSDirectory.open(Paths.get(indexDir)); //3
DirectoryReader directoryReader = DirectoryReader.open(dir);
IndexSearcher is = new IndexSearcher(directoryReader); //3
QueryParser parser = new QueryParser( // 4
"contents", //4
new StandardAnalyzer( )); //4
Query query = parser.parse(q); //4
long start = System.currentTimeMillis();
TopDocs hits = is.search(query, 10); //5
long end = System.currentTimeMillis();
System.err.println("Found " + hits.totalHits + //6
" document(s) (in " + (end - start) + // 6
" milliseconds) that matched query '" + // 6
q + "':"); // 6
for(ScoreDoc scoreDoc : hits.scoreDocs) {
Document doc = is.doc(scoreDoc.doc); //7
System.out.println(doc.get("fullpath")); //8
}
//9
}
}
/*
#1 Parse provided index directory
#2 Parse provided query string
#3 Open index
#4 Parse query
#5 Search index
#6 Write search stats
#7 Retrieve matching document
#8 Display filename
#9 Close IndexSearcher
*/
Upvotes: 0
Reputation: 33341
Your biggest problem is how you are handling Exceptions. Here is the exception you should be seeing, if you weren't throwing them all away:
Exception in thread "main" java.lang.IllegalArgumentException: it doesn't make sense to have a field that is neither indexed nor stored
at org.apache.lucene.document.Field.(Field.java:249)
If you are trying to figure out what is wrong with your program, don't just toss out all the exceptions! Those are the tools to help you find and handle problems!
What that exception is telling you is that one of your fields ("content", specifically), is neither indexed nor stored. You should never use a FieldType
with all default values. You need to set it up with the appropriate values for the field. In this case, it might make sense to just use TextField
, instead.
PS - Lucene in Action 2nd ed. is 7 years old, and it's code is for Lucene 3.0. It's quite outdated, and you would probably be better served trying the current lucene demo.
Upvotes: 2