Reputation: 701
I have a file named thatfeelwhen.pdf.
When I search for it using words like "that" or "feel", I don't get a hit, but I do get one if I type "when" or the entire filename. I'm using a StandardAnalyzer. How can I get the Lucene searcher to match everything? My search queries seem to match on the content within the file but not on the filename.
public partial class _Default : Page
{
Directory finalDirectory = null;
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
The code below is in other methods:
/// <summary>
/// Builds a single Lucene document for one file and hands it to the writer.
/// </summary>
/// <param name="filename">Value for the "fileName" field (stored and analyzed).</param>
/// <param name="pdfBody">Value for the "pdfBody" field (analyzed, not stored).</param>
/// <param name="writer">Index writer that receives the new document.</param>
private static void AddTextToIndex(string filename, string pdfBody, IndexWriter writer)
{
    // The name is stored so search results can read it back; the body is only searchable.
    var nameField = new Field("fileName", filename.ToString(), Field.Store.YES, Field.Index.ANALYZED);
    var bodyField = new Field("pdfBody", pdfBody.ToString(), Field.Store.NO, Field.Index.ANALYZED);

    var entry = new Document();
    entry.Add(nameField);
    entry.Add(bodyField);

    writer.AddDocument(entry);
}
/// <summary>
/// Builds the Lucene index from every file in the forms folder and returns
/// the directory that holds it.
/// </summary>
/// <param name="analyzer">Analyzer handed to the <c>IndexWriter</c> for all analyzed fields.</param>
/// <returns>The file-system directory opened on <c>C:\mywebsite\files\LuceneIndex</c>.</returns>
private static Directory buildIndex(Analyzer analyzer)
{
    string[] syllabusFiles = System.IO.Directory.GetFiles(@"C:\mywebsite\files\forms");
    Directory directory = FSDirectory.Open(new DirectoryInfo(@"C:\mywebsite\files\LuceneIndex"));

    // Third constructor argument is passed as true, as in the original call
    // (the "create" flag in the Lucene.Net 3.0 IndexWriter constructor).
    var writer = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
    try
    {
        foreach (string filePath in syllabusFiles)
        {
            string pdfTextExtracted = pdfText(filePath);

            // Index only the bare file name. The previous code stripped the prefix
            // "C:\website\files\forms", which never matched the folder actually
            // enumerated above, so the full path ended up in the "fileName" field.
            string fileNameOnly = System.IO.Path.GetFileName(filePath);

            AddTextToIndex(fileNameOnly, pdfTextExtracted, writer);
        }

        writer.Optimize();
    }
    finally
    {
        // Dispose the writer even if text extraction or indexing throws.
        writer.Dispose();
    }

    return directory;
}
protected void txtBoxSearchPDF_Click(object sender, EventArgs e)
{
if (txtBoxSearchString.Text == "")
{
lblNoSearchString.Visible = true;
}
else if (txtBoxSearchString.Text == "build_index")
{
this.finalDirectory = buildIndex(this.analyzer);
}
else
{
//searching PDF text
lblNoSearchString.Visible = false;
StringBuilder sb = new StringBuilder();
this.finalDirectory = FSDirectory.Open(new DirectoryInfo(@"C:\mywebsite\files\LuceneIndex"));
IndexReader indexReader = IndexReader.Open(this.finalDirectory, true);
Searcher indexSearch = new IndexSearcher(indexReader);
string searchQuery = txtBoxSearchString.Text;
var fields = new[] { "fileName", "pdfBody" };
var queryParser = new MultiFieldQueryParser(Version.LUCENE_30, fields, this.analyzer);
Query query;
try
{
query = queryParser.Parse(searchQuery.Trim());
}
catch (ParseException)
{
query = queryParser.Parse(QueryParser.Escape(searchQuery.Trim()));
}
TopDocs resultDocs = indexSearch.Search(query, indexReader.MaxDoc);
var hits = resultDocs.ScoreDocs;
foreach (var hit in hits)
{
var documentFromSearcher = indexSearch.Doc(hit.Doc);
string getResult = documentFromSearcher.Get("fileName");
string formattedResult = getResult.Replace(" ", "%20");
sb.AppendLine(@"<a href=https://website.com/search/forms/" + formattedResult+ ">" + getResult+"</a>");
sb.AppendLine("<br>");
}
Upvotes: 0
Views: 252
Reputation: 701
I chose to use Analyzer analyzer = new SingleCharTokenAnalyzer();
and I am getting much better results.
I tried the Simple, Standard, Whitespace, and Keyword analyzers, and none of them really suited my needs without resorting to extra work to customize them.
Upvotes: 1