Lucene Search with Date parameter

Question

I am fairly new to the Lucene framework. We are trying to adopt Lucene because we need to search a LARGE amount of data within a few milliseconds.

Scenario:

We have an EmployeeDto class whose instances we have indexed in Lucene. For the example below, I have hardcoded only 6 values.
I have 2 arguments which should act as input parameters to the search query.

 EmployeeDto.java
 // Fields of the DTO that is indexed into Lucene (one document per employee).
 private String firstName;
 private String lastName;
 private Long employeeId;
 private Integer salary;
 // Both dates are indexed as epoch milliseconds (Date.getTime()) by addEmployee.
 private Date startDate;
 private Date terminationDate;
 // getters and setters omitted for brevity


 EmployeeLucene.java
 /**
  * Minimal Lucene 4.0 demo: builds an in-memory index from six hard-coded employees and
  * runs a prefix query on the {@code fullName} field.
  *
  * <p>Run as {@code java EmployeeLucene <namePrefix> [date]}. Only the first argument is
  * used; the optional second argument is not read by this version.
  */
 public class EmployeeLucene {

     /**
      * Indexes the sample data, searches {@code fullName} for {@code args[0] + "*"} and
      * prints the matching employees together with simple timings.
      *
      * @param args {@code args[0]} is the name prefix to search for
      * @throws IOException if the index cannot be written or read
      * @throws ParseException if one of the hard-coded sample dates cannot be parsed
      * @throws IllegalArgumentException if no search term is given or it is not a valid query
      */
     public static void main(String[] args) throws IOException, ParseException {
         // Fail before doing any work when the search term is missing.
         if (args.length < 1) {
             throw new IllegalArgumentException("Usage: java EmployeeLucene <namePrefix> [yyyy-MM-dd]");
         }

         // 0. Specify the analyzer for tokenizing text.
         //    The same analyzer should be used for indexing and searching
         StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);

         final DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");

         // 1. create the index
         Directory index = new RAMDirectory();
         IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer);
         IndexWriter w = new IndexWriter(index, config);
         long starttimeOfLoad = System.currentTimeMillis();
         System.out.println("Data Loading started");
         try {
             addEmployee(w, new EmployeeDto("John", "Smith", Long.valueOf(101L), 10000, dateFormat.parse("2010-05-05"), dateFormat.parse("2018-05-05")));
             addEmployee(w, new EmployeeDto("Bill", "Thomas", Long.valueOf(102L), 12000, dateFormat.parse("2011-06-06"), dateFormat.parse("2015-03-10")));
             addEmployee(w, new EmployeeDto("Franklin", "Robinson", Long.valueOf(102L), 12000, dateFormat.parse("2011-04-04"), dateFormat.parse("2015-07-07")));
             addEmployee(w, new EmployeeDto("Thomas", "Boone", Long.valueOf(102L), 12000, dateFormat.parse("2011-02-02"), dateFormat.parse("2015-03-10")));
             addEmployee(w, new EmployeeDto("John", "Smith", Long.valueOf(103L), 13000, dateFormat.parse("2019-05-05"), dateFormat.parse("2099-12-31")));
             addEmployee(w, new EmployeeDto("Bill", "Thomas", Long.valueOf(102L), 14000, dateFormat.parse("2011-06-06"), dateFormat.parse("2099-12-31")));
         } finally {
             // Close the writer even when a sample date fails to parse.
             w.close();
         }
         System.out.println("Data Loaded. Completed in " + (System.currentTimeMillis() - starttimeOfLoad));

         // 2. query
         Query q;
         try {
             q = new QueryParser(Version.LUCENE_40, "fullName", analyzer).parse(args[0] + "*");
         } catch (org.apache.lucene.queryparser.classic.ParseException e) {
             // Searching with a null query would only fail later with an NPE; stop here instead.
             throw new IllegalArgumentException("Invalid search term: " + args[0], e);
         }

         // 3. search
         long starttime = System.currentTimeMillis();
         int hitsPerPage = 100;
         List<EmployeeDto> employeeDtoList = new ArrayList<EmployeeDto>();
         IndexReader reader = DirectoryReader.open(index);
         try {
             IndexSearcher searcher = new IndexSearcher(reader);
             TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
             searcher.search(q, collector);
             ScoreDoc[] hits = collector.topDocs().scoreDocs;

             // 4. display results
             System.out.println("Found " + hits.length + " hits.");
             for (ScoreDoc hit : hits) {
                 Document d = searcher.doc(hit.doc);
                 employeeDtoList.add(new EmployeeDto(d.get("firstName"), d.get("lastName"), Long.valueOf(d.get("employeeId")),
                         Integer.valueOf(d.get("salary"))));
             }
         } finally {
             // The reader holds index resources and must be released once the stored fields are read.
             reader.close();
         }

         System.out.println(employeeDtoList.size());
         System.out.println(employeeDtoList);
         System.out.println("Time taken:" + (System.currentTimeMillis() - starttime) + " ms");
     }

     /**
      * Adds one employee to the index as a single document.
      *
      * <p>The names are indexed as analyzed text; the id, salary and both dates are indexed
      * as stored long fields, with the dates converted to epoch milliseconds.
      *
      * @param w writer the document is added to
      * @param employeeDto employee to index
      * @throws IOException if the document cannot be written
      * @throws ParseException declared for compatibility; nothing in this method parses dates
      */
     private static void addEmployee(IndexWriter w, EmployeeDto employeeDto) throws IOException, ParseException {
         Document doc = new Document();

         doc.add(new TextField("fullName", employeeDto.getFirstName() + " " + employeeDto.getLastName(), Field.Store.YES));
         doc.add(new TextField("firstName", employeeDto.getFirstName(), Field.Store.YES));
         doc.add(new TextField("lastName", employeeDto.getLastName(), Field.Store.YES));
         doc.add(new LongField("employeeId", employeeDto.getEmployeeId(), Field.Store.YES));
         doc.add(new LongField("salary", employeeDto.getSalary(), Field.Store.YES));
         doc.add(new LongField("startDate", employeeDto.getStartDate().getTime(), Field.Store.YES));
         doc.add(new LongField("terminationDate", employeeDto.getTerminationDate().getTime(), Field.Store.YES));
         w.addDocument(doc);
     }

 }

I run the program as "java EmployeeLucene thom 2014-05-05".
I expect to get only 2 results, but I am getting 3 hits.

Questions:

How can I include the 2nd parameter in the query? The 2nd parameter should be greater than 'startDate' and less than 'terminationDate'.
Can we store the EmployeeDto itself inside the document, to avoid having to build a List of EmployeeDtos once we get the hits?

K.Nicholas · Accepted Answer

First, you're going to get three results because the query is "thom*" and you have three records whose full name contains a term starting with "thom". They are records 2, 4, and 6.

Second, Lucene version 4.0 is really old.

Finally, one way to query for a date between startDate and terminationDate is as follows:

 // 2. query: name prefix AND "employed on the search date"
 BooleanQuery finalQuery = null;
 try {
     // Container for the three clauses; every clause is added with MUST, i.e. they are ANDed.
     finalQuery = new BooleanQuery();

     // Clause 1: any fullName term starting with "thom".
     QueryParser nameParser = new QueryParser(Version.LUCENE_40, "fullName", analyzer);
     Query namePrefixQuery = nameParser.parse("thom" + "*");
     finalQuery.add(namePrefixQuery, Occur.MUST);

     // The date fields are indexed as epoch milliseconds, so compare against milliseconds.
     long searchMillis = DATE_FORMAT.parse("2014-05-05").getTime();

     // Clause 2: startDate <= search date (open lower bound, inclusive upper bound).
     Query startedByThen = NumericRangeQuery.newLongRange("startDate", null, searchMillis, true, true);
     finalQuery.add(startedByThen, Occur.MUST);

     // Clause 3: terminationDate > search date (exclusive lower bound, open upper bound).
     Query endsAfterThen = NumericRangeQuery.newLongRange("terminationDate", searchMillis, null, false, false);
     finalQuery.add(endsAfterThen, Occur.MUST);

 } catch (org.apache.lucene.queryparser.classic.ParseException e) {
     e.printStackTrace();
 }

Can we include EmployeeDto itself inside the document to avoid creation of List of EmployeeDtos once we get the hits.

Not that I'm aware of.

EDIT: Version 7.0.1

     // 0. Specify the analyzer for tokenizing text.
     //    The same analyzer should be used for indexing and searching
     StandardAnalyzer analyzer = new StandardAnalyzer();

     final DateFormat DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd");

     // 1. create the index
     Directory index = new RAMDirectory();
     IndexWriterConfig config = new IndexWriterConfig(analyzer);
     long starttimeOfLoad = System.currentTimeMillis();
     System.out.println("Data Loading started");

     // try-with-resources closes the writer even if one of the sample dates fails to parse.
     try (IndexWriter w = new IndexWriter(index, config)) {
         // Long.valueOf replaces the deprecated Long(long) constructor.
         addEmployee(w, new EmployeeDto("John", "Smith", Long.valueOf(101L), 10000, DATE_FORMAT.parse("2010-05-05"), DATE_FORMAT.parse("2018-05-05")));
         addEmployee(w, new EmployeeDto("Bill", "Thomas", Long.valueOf(102L), 12000, DATE_FORMAT.parse("2011-06-06"), DATE_FORMAT.parse("2015-10-10")));
         addEmployee(w, new EmployeeDto("Franklin", "Robinson", Long.valueOf(102L), 12000, DATE_FORMAT.parse("2011-04-04"), DATE_FORMAT.parse("2015-07-07")));
         addEmployee(w, new EmployeeDto("Thomas", "Boone", Long.valueOf(102L), 12000, DATE_FORMAT.parse("2011-02-02"), DATE_FORMAT.parse("2015-03-10")));
         addEmployee(w, new EmployeeDto("John", "Smith", Long.valueOf(103L), 13000, DATE_FORMAT.parse("2019-05-05"), DATE_FORMAT.parse("2099-12-31")));
         addEmployee(w, new EmployeeDto("Bill", "Thomas", Long.valueOf(102L), 14000, DATE_FORMAT.parse("2011-06-06"), DATE_FORMAT.parse("2099-12-31")));
     }
     System.out.println("Data Loaded. Completed in " + (System.currentTimeMillis() - starttimeOfLoad));

     // 2. query: name prefix AND "employed on the search date"
     BooleanQuery finalQuery;
     try {
        // final query; every clause is added with MUST, which is equivalent to ANDing them
        Builder builder = new BooleanQuery.Builder();

        // thom* query
        Query fullName = new QueryParser("fullName", analyzer).parse("thom" + "*");
        builder.add(fullName, Occur.MUST); // MUST implies that the keyword must occur.

        // startDate <= searchDate: LongPoint range queries are inclusive at both ends.
        long searchDate = DATE_FORMAT.parse("2014-05-05").getTime();
        Query greaterStartDate = LongPoint.newRangeQuery("startDatePoint", Long.MIN_VALUE, searchDate);
        builder.add(greaterStartDate, Occur.MUST);

        // terminationDate > searchDate: the lower bound is bumped by one to make it exclusive,
        // matching the exclusive bound of the NumericRangeQuery used with Lucene 4.
        Query lessTerminationDate = LongPoint.newRangeQuery("terminationDatePoint", Math.addExact(searchDate, 1), Long.MAX_VALUE);
        builder.add(lessTerminationDate, Occur.MUST);
        finalQuery = builder.build();

     } catch (org.apache.lucene.queryparser.classic.ParseException e) {
         // Fail here with the cause attached instead of passing a null query to the searcher.
         throw new IllegalStateException("Could not parse the name query", e);
     }

     // 3. search
     long starttime = System.currentTimeMillis();
     int hitsPerPage = 100;
     List<EmployeeDto> employeeDtoList = new ArrayList<>();
     // try-with-resources releases the reader once all stored fields have been read.
     try (IndexReader reader = DirectoryReader.open(index)) {
         IndexSearcher searcher = new IndexSearcher(reader);
         TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage);
         searcher.search(finalQuery, collector);
         ScoreDoc[] hits = collector.topDocs().scoreDocs;

         // 4. display results
         System.out.println("Found " + hits.length + " hits.");
         for (ScoreDoc hit : hits) {
             // Rebuild a DTO from the stored fields of each matching document.
             Document d = searcher.doc(hit.doc);
             employeeDtoList.add(new EmployeeDto(d.get("firstName"), d.get("lastName"), Long.valueOf(d.get("employeeId")),
                     Integer.valueOf(d.get("salary"))));
         }
     }

     System.out.println(employeeDtoList.size());
     System.out.println(employeeDtoList);
     System.out.println("Time taken:" + (System.currentTimeMillis() - starttime) + " ms");

 }

 /**
  * Adds one employee to the index as a single document.
  *
  * <p>Names are indexed as stored text. The id and salary are stored only. Each date is
  * written twice as epoch milliseconds: once as a StoredField and once as a LongPoint
  * under the field name with a "Point" suffix.
  *
  * @param w writer the document is added to
  * @param employeeDto employee to index
  * @throws IOException if the document cannot be written
  */
 private static void addEmployee(IndexWriter w, EmployeeDto employeeDto) throws IOException {
     final String first = employeeDto.getFirstName();
     final String last = employeeDto.getLastName();

     Document employeeDoc = new Document();

     // Text fields: analyzed for searching and stored for retrieval.
     employeeDoc.add(new TextField("fullName", first + " " + last, Store.YES));
     employeeDoc.add(new TextField("firstName", first, Store.YES));
     employeeDoc.add(new TextField("lastName", last, Store.YES));

     // Stored-only values.
     employeeDoc.add(new StoredField("employeeId", employeeDto.getEmployeeId()));
     employeeDoc.add(new StoredField("salary", employeeDto.getSalary()));

     // Start date: stored value plus point value.
     final long startMillis = employeeDto.getStartDate().getTime();
     employeeDoc.add(new StoredField("startDate", startMillis));
     employeeDoc.add(new LongPoint("startDatePoint", startMillis));

     // Termination date: stored value plus point value.
     final long terminationMillis = employeeDto.getTerminationDate().getTime();
     employeeDoc.add(new StoredField("terminationDate", terminationMillis));
     employeeDoc.add(new LongPoint("terminationDatePoint", terminationMillis));

     w.addDocument(employeeDoc);
 }

EDIT: The date fields are stored as both LongPoint and StoredField types. A LongPoint field can be used with LongPoint.newRangeQuery but its value cannot be retrieved later if you want to know what the date is. A StoredField can be retrieved as a stored value but cannot be used for range queries. This example does not retrieve the date fields, but the version 4 code supported both searching and retrieval through a single LongField, so both are kept here. You could remove the StoredField dates if you don't plan on ever retrieving the values.

Lucene Search with Date parameter

Answers (1)

Related Questions