Christopher Bruce
Christopher Bruce

Reputation: 701

Lucene query not returning hit on standard analyzer

I have a file named thatfeelwhen.pdf. When I search for it using words like "that" or "feel", I don't get a hit, but I do get one if I type "when" or the entire filename. I'm using a standard analyzer. How can I get the Lucene searcher to match on any part of the filename? My search queries seem to match on the content within the file but not on the filename.

public partial class _Default : Page
{
    Directory finalDirectory = null;
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);

Code below in other methods:

/// <summary>
/// Adds one PDF to the index as a single Lucene document with two analyzed fields.
/// </summary>
/// <param name="filename">Value for the "fileName" field; stored, so it can be read back from search hits.</param>
/// <param name="pdfBody">Value for the "pdfBody" field; indexed for searching but not stored.</param>
/// <param name="writer">Open index writer that receives the document.</param>
/// <exception cref="ArgumentNullException">Thrown when any argument is null.</exception>
private static void AddTextToIndex(string filename, string pdfBody, IndexWriter writer)
{
    // Fail with a named argument instead of a bare NullReferenceException.
    if (filename == null)
    {
        throw new ArgumentNullException("filename");
    }
    if (pdfBody == null)
    {
        throw new ArgumentNullException("pdfBody");
    }
    if (writer == null)
    {
        throw new ArgumentNullException("writer");
    }

    Document doc = new Document();
    // Both fields are ANALYZED, so the writer's analyzer decides how they are split into terms.
    doc.Add(new Field("fileName", filename, Field.Store.YES, Field.Index.ANALYZED));
    doc.Add(new Field("pdfBody", pdfBody, Field.Store.NO, Field.Index.ANALYZED));
    writer.AddDocument(doc);
}

    /// <summary>
    /// Rebuilds the on-disk Lucene index from every file in the forms folder.
    /// Each file is passed to <c>pdfText</c> for text extraction and indexed under its bare file name.
    /// </summary>
    /// <param name="analyzer">Analyzer handed to the index writer for tokenizing the indexed fields.</param>
    /// <returns>The index directory that was written.</returns>
    private static Directory buildIndex(Analyzer analyzer)
    {
        string[] syllabusFiles = System.IO.Directory.GetFiles(@"C:\mywebsite\files\forms");
        Directory directory = FSDirectory.Open(new DirectoryInfo(@"C:\mywebsite\files\LuceneIndex"));

        // The third argument (create: true) replaces any existing index instead of appending to it.
        // The using block releases the writer even when extraction or indexing throws.
        using (var writer = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED))
        {
            foreach (string filePath in syllabusFiles)
            {
                string pdfTextExtracted = pdfText(filePath);

                // Path.GetFileName strips the folder regardless of which root the path has.
                // The previous string Replace used a prefix that did not match the folder
                // enumerated above, so the full path ended up in the "fileName" field.
                string fileNameOnly = System.IO.Path.GetFileName(filePath);

                AddTextToIndex(fileNameOnly, pdfTextExtracted, writer);
            }

            writer.Optimize();
        }

        return directory;
    }

    /// <summary>
    /// Click handler for the search button. Depending on the text box content it either
    /// shows the "no search string" label (empty text), rebuilds the index (the literal
    /// text "build_index"), or runs the text as a query against the "fileName" and
    /// "pdfBody" fields and appends one HTML link per hit to a StringBuilder.
    /// </summary>
    /// <remarks>
    /// This listing ends inside the search branch; what is done with the built HTML and
    /// whether the reader/searcher are disposed is not visible here.
    /// </remarks>
    protected void txtBoxSearchPDF_Click(object sender, EventArgs e)
    {
        if (txtBoxSearchString.Text == "")
        {
            lblNoSearchString.Visible = true;               
        }
        else if (txtBoxSearchString.Text == "build_index")
        {
            // Magic search term: rebuild the whole index instead of searching.
            this.finalDirectory = buildIndex(this.analyzer);
        }
        else
        {
            // Search the indexed file names and PDF text.
            lblNoSearchString.Visible = false;
            StringBuilder sb = new StringBuilder();
            this.finalDirectory = FSDirectory.Open(new DirectoryInfo(@"C:\mywebsite\files\LuceneIndex"));
            // Second argument is Lucene.NET's readOnly flag for IndexReader.Open(Directory, bool).
            // NOTE(review): no Dispose of indexReader/indexSearch is visible in this excerpt - confirm.
            IndexReader indexReader = IndexReader.Open(this.finalDirectory, true);
            Searcher indexSearch = new IndexSearcher(indexReader);
            string searchQuery = txtBoxSearchString.Text;
            // A term without an explicit field prefix is searched in both fields,
            // tokenized with the same analyzer instance that buildIndex receives.
            var fields = new[] { "fileName", "pdfBody" };
            var queryParser = new MultiFieldQueryParser(Version.LUCENE_30, fields, this.analyzer);
            Query query;
            try
            {
                query = queryParser.Parse(searchQuery.Trim());
            }
            catch (ParseException)
            {
                // The raw text was not valid query syntax: escape the special characters
                // and parse it again as plain terms.
                query = queryParser.Parse(QueryParser.Escape(searchQuery.Trim()));
            }
            // Asks for up to indexReader.MaxDoc hits, i.e. no smaller cap on the result count.
            // NOTE(review): confirm Search accepts 0 here when the index is empty.
            TopDocs resultDocs = indexSearch.Search(query, indexReader.MaxDoc);                

            var hits = resultDocs.ScoreDocs;
            foreach (var hit in hits)
            {
                var documentFromSearcher = indexSearch.Doc(hit.Doc);
                // "fileName" is the only stored field, so it is the only value retrievable from a hit.
                string getResult = documentFromSearcher.Get("fileName");
                // Only spaces are percent-encoded; every other character is passed through unchanged.
                string formattedResult = getResult.Replace(" ", "%20");
                // NOTE(review): the href value is unquoted and the file name is not HTML-encoded.
                sb.AppendLine(@"<a href=https://website.com/search/forms/" + formattedResult+ ">" + getResult+"</a>");
                sb.AppendLine("<br>");
            }

Upvotes: 0

Views: 252

Answers (1)

Christopher Bruce
Christopher Bruce

Reputation: 701

I chose to use Analyzer analyzer = new SingleCharTokenAnalyzer(); and am getting much better results.

I tried the Simple, Standard, Whitespace, and Keyword analyzers, and none of them really suited my needs without resorting to extra work to customize them.

Upvotes: 1

Related Questions