Reputation: 7338
I have some search indexes on my document with a Dutch analyzer, and it works very well. For example, consider:
http://wetten.cloudant.com/regelingen/_design/RegelingInfo/_search/regeling?q=burgerlijke
When I try to make my searches fuzzy, it all goes wrong:
http://wetten.cloudant.com/regelingen/_design/RegelingInfo/_search/regeling?q=burgerlijke~
This returns 0 results all of a sudden. How can this be?
Design document:
{"_id": "_design/RegelingInfo",
"_rev": "11-20993b8c49d8bcc1cd4fde58e5f40b27",
"views": {
"all": {
"map": "function(doc) { \n if (doc._id.lastIndexOf('BWB', 0) === 0 ){\n emit( null, doc._id )\n }\n}"
}
},
"lists": {},
"shows": {},
"language": "javascript", "filters": {}, "updates": {}, "indexes": {
"regeling": {
"analyzer": {
"name": "dutch",
"stopwords": ["wet", "regeling", "besluit"]
},
"index": "function(doc) {\n var globalString = new Array();\n index(\"displayTitle\", doc.displayTitle, {\"store\": \"yes\"});\n globalString.push(doc.displayTitle);\n /*index(\"officieleTitel\", doc.officieleTitel, {\"store\": \"no\"});*/\n globalString.push(doc.officieleTitel);\n /*index(\"bwbid\", doc._id);*/\n globalString.push(doc._id);\n index(\"regelingSoort\", doc.regelingSoort, {\"store\": \"no\"});\n if (doc.citeertitels) {\n for (var i = 0; i < doc.citeertitels.length; i++) {\n /*index(\"citeertitel\", doc.citeertitels[i].titel, {\"store\": \"no\"});*/\n globalString.push(doc.citeertitels[i].titel);\n }\n }\n if (doc.afkortingen) {\n for (var i = 0; i < doc.afkortingen.length; i++) {\n /*index(\"afkorting\", doc.afkortingen[i], {\"store\": \"no\"});*/\n globalString.push(doc.afkortingen[i]);\n }\n }\n if (doc.nietOfficieleTitels) {\n for (var i = 0; i < doc.nietOfficieleTitels.length; i++) {\n /*index(\"nietOfficieleTitel\", doc.nietOfficieleTitels[i], {\"store\": \"no\"});*/\n globalString.push(doc.nietOfficieleTitels[i]);\n }\n }\n if (doc.xml) {\n /* Remove tags to get inner text*/\n index(\"innerText\", doc.xml.replace(/<[^>]*>/g, \"\"), {\"store\": \"no\"});\n }\n index(\"default\", globalString.join(\" \"), {\"store\": \"no\"});\n}"
}
}}
Formatted indexing function:
function(doc) {
var globalString = new Array();
index("displayTitle", doc.displayTitle, {"store": "yes"});
globalString.push(doc.displayTitle);
/*index("officieleTitel", doc.officieleTitel, {"store": "no"});*/
globalString.push(doc.officieleTitel);
/*index("bwbid", doc._id);*/
globalString.push(doc._id);
index("regelingSoort", doc.regelingSoort, {"store": "no"});
if (doc.citeertitels) {
for (var i = 0; i < doc.citeertitels.length; i++) {
/*index("citeertitel", doc.citeertitels[i].titel, {"store": "no"});*/
globalString.push(doc.citeertitels[i].titel);
}
}
if (doc.afkortingen) {
for (var i = 0; i < doc.afkortingen.length; i++) {
/*index("afkorting", doc.afkortingen[i], {"store": "no"});*/
globalString.push(doc.afkortingen[i]);
}
}
if (doc.nietOfficieleTitels) {
for (var i = 0; i < doc.nietOfficieleTitels.length; i++) {
/*index("nietOfficieleTitel", doc.nietOfficieleTitels[i], {"store": "no"});*/
globalString.push(doc.nietOfficieleTitels[i]);
}
}
if (doc.xml) {
/* Remove tags to get inner text*/
index("innerText", doc.xml.replace(/<[^>]*>/g, ""), {"store": "no"});
}
index("default", globalString.join(" "), {"store": "no"});
}
Upvotes: 0
Views: 620
Reputation: 4631
You can see what the analyzer does;
curl 'http://wetten.cloudant.com/_search_analyze' -d '{"analyzer":"dutch","text":"burgerlijke"}'
which returns;
{"tokens":["burger"]}
This query;
curl 'https://wetten.cloudant.com/regelingen/_design/RegelingInfo/_search/regeling?q=burger~'
returns 575 rows.
That's kinda awkward, though, it should be done for you. We'll look into it.
Upvotes: 2
Reputation: 136
Yep, it's all about analyzing. Here is a useful (yet undocumented) API endpoint to help debug these things. Substitute your own username/credentials, but then it's just:
curl 'https://malortmike.cloudant.com/_search_analyze?analyzer=dutch&text="burgerlijke"'
{"tokens":["burger"]}
curl -u 'malortmike:secret' 'https://malortmike.cloudant.com/_search_analyze?analyzer=standard&text="burgerlijke"'
{"tokens":["burgerlijke"]}
Fun to see the different analyzers at work.
Upvotes: 1
Reputation: 33351
I know exactly nothing about the Dutch language, but I strongly suspect that the problem is stemming.
The DutchAnalyzer, as with most language-specific analyzers, includes a stemmer, in order to match alternate forms of words with the same root word (i.e. stem). However, wildcard, fuzzy, regex, etc. queries are not analyzed. TermQueries are.
So, if burgerlijke is getting stemmed significantly in the index (seems likely enough, not being familiar with the language), it's quite possible that the edit distance between the stemmed version and the unstemmed version is simply too great to see a match. If the stemmed term in the index were, say, "burger", that's an edit distance of 5 from the fuzzy query term "burgerlijke", which is too far to get a result.
In general, stemmers don't play nice with any MultiTermQuery.
Upvotes: 1