Reputation:
I'm building a search engine using Haystack, and one of the features I'm working on is allowing people to filter by version field, described like so:
version = indexes.CharField(model_attr="version")
Versions are short strings and aren't constrained to semantic "versions" that follow the "x.y.z" style and may just be as simple as "1".
Unfortunately, after some experimenting it looks like Haystack ignores filter values shorter than 3 characters. So this:
SearchQuerySet().filter(version="1")
will actually return nothing, while this:
SearchQuerySet().filter(content="foo").filter(version="1")
will return everything that matches the first filter.
After some experimentation, I've found that it's based on string length, not on it being a number field. So all of these behave the same:
SearchQuerySet().filter(version="1")
SearchQuerySet().filter(version="a")
SearchQuerySet().filter(version="1a")
What will work is these (if an item has a version
set to "100"
):
SearchQuerySet().filter(version=100)
SearchQuerySet().filter(version="100")
Now obviously, I don't want every field to have this level of granularity, but is there any way to state that for a particular field, I want filtering to work even on a single character?
Upvotes: 1
Views: 894
Reputation: 1835
Building on top of Aamir Adnan's answer, here is a version just calling the parent and overwriting string field definitions. You could also use this to overwrite based on field name.
from haystack.backends.whoosh_backend import WhooshEngine, WhooshSearchBackend
from haystack.backends.whoosh_backend import TEXT
from whoosh.analysis import StemmingAnalyzer
class CustomSearchBackend(WhooshSearchBackend):
    """Whoosh backend whose plain string fields index one-character terms."""

    def build_schema(self, fields):
        """Build the parent schema, then re-declare single-valued string fields.

        Each single-valued field whose ``field_type`` is ``'string'`` is
        removed from the schema produced by the parent class and added back
        as a TEXT field using ``StemmingAnalyzer(minsize=1)``.

        Args:
            fields: Mapping of field name to Haystack field instance.

        Returns:
            The ``(content_field_name, schema)`` pair from the parent call,
            with the string fields replaced.
        """
        content_field_name, schema = super(CustomSearchBackend, self).build_schema(fields)

        for field_class in fields.values():
            if field_class.field_type != 'string':
                continue
            # The stock mapping checks is_multivalued before field_type and
            # gives such fields a KEYWORD/IDLIST type; replacing them with
            # TEXT would change how they are indexed, so leave them alone.
            if field_class.is_multivalued:
                continue

            schema.remove(field_class.index_fieldname)
            schema.add(
                field_class.index_fieldname,
                TEXT(
                    stored=True,
                    analyzer=StemmingAnalyzer(minsize=1),
                    field_boost=field_class.boost,
                ),
            )

        return content_field_name, schema
class CustomWhooshEngine(WhooshEngine):
    """Haystack engine that uses ``CustomSearchBackend`` as its backend."""

    backend = CustomSearchBackend
Upvotes: 1
Reputation: 39659
I am giving my answer assuming the Whoosh backend,
but the same approach can be applied to other backends by studying their rules.
django-haystack uses StemmingAnalyzer, imported from whoosh.analysis, for text (char) fields in the build_schema method of WhooshSearchBackend. Looking at whoosh.analysis.StemmingAnalyzer, you can see it takes a minsize
parameter which is by default set to 2
so that is why you can not filter on one character. We need to override the build_schema
method in WhooshSearchBackend
and set the minsize
parameter to 1
for StemmingAnalyzer
:
Place this code in search_backends.py:
from haystack.backends.whoosh_backend import WhooshEngine, WhooshSearchBackend, WHOOSH_ID, ID, DJANGO_CT, DJANGO_ID, Schema, IDLIST, TEXT, KEYWORD, NUMERIC, BOOLEAN, DATETIME, NGRAM, NGRAMWORDS
from whoosh.analysis import StemmingAnalyzer
class CustomSearchBackend(WhooshSearchBackend):
    """Whoosh backend whose default TEXT fields index one-character terms."""

    def build_schema(self, fields):
        """Translate Haystack field definitions into a Whoosh schema.

        Multi-valued, date/datetime, integer, float, boolean, ngram and
        edge_ngram fields get dedicated Whoosh field types. Every other field
        becomes a TEXT field analysed with ``StemmingAnalyzer(minsize=1)``.

        Args:
            fields: Mapping of field name to Haystack field instance.

        Returns:
            A ``(content_field_name, schema)`` tuple. ``content_field_name``
            is the index name of the field flagged ``document=True``, or
            ``''`` when no field carries that flag.

        Raises:
            SearchBackendError: If ``fields`` adds nothing beyond the
                hard-coded Haystack keys.
        """
        # Imported locally so the error path below cannot fail with a
        # NameError when the module-level imports omit this exception.
        from haystack.exceptions import SearchBackendError

        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_class in fields.values():
            index_name = field_class.index_fieldname

            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[index_name] = IDLIST(stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[index_name] = KEYWORD(
                        stored=True, commas=True, scorable=True, field_boost=field_class.boost
                    )
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[index_name] = DATETIME(stored=field_class.stored)
            elif field_class.field_type == 'integer':
                schema_fields[index_name] = NUMERIC(
                    stored=field_class.stored, type=int, field_boost=field_class.boost
                )
            elif field_class.field_type == 'float':
                schema_fields[index_name] = NUMERIC(
                    stored=field_class.stored, type=float, field_boost=field_class.boost
                )
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[index_name] = BOOLEAN(stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[index_name] = NGRAM(
                    minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost
                )
            elif field_class.field_type == 'edge_ngram':
                schema_fields[index_name] = NGRAMWORDS(
                    minsize=2, maxsize=15, at='start', stored=field_class.stored,
                    field_boost=field_class.boost
                )
            else:
                # minsize=1 keeps one-character tokens in the index so that
                # filters such as version="1" can match.
                schema_fields[index_name] = TEXT(
                    stored=True, analyzer=StemmingAnalyzer(minsize=1), field_boost=field_class.boost
                )

            if field_class.document is True:
                content_field_name = index_name

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")

        return (content_field_name, Schema(**schema_fields))
class CustomWhooshEngine(WhooshEngine):
    """Haystack engine that uses ``CustomSearchBackend`` as its backend."""

    backend = CustomSearchBackend
Now we need to tell haystack to use our CustomSearchBackend
:
# Route the default Haystack connection through the custom engine class in
# search_backends.py; the Whoosh index directory sits next to this file.
HAYSTACK_CONNECTIONS = {
    'default': {
        'ENGINE': 'search_backends.CustomWhooshEngine',
        'PATH': os.path.join(os.path.dirname(__file__), 'whoosh_index'),
    },
}
After doing this, run the rebuild_index
and update_index
commands, and you should be able to filter on any single character except the letter a,
because the letter a
is also in STOP_WORDS. If you want to allow the single character a
as well, you need to pass your own STOP_WORDS with the letter a removed,
like this in build_schema
:
from whoosh.analysis import STOP_WORDS
# Drop every one-character stop word (such as "a") from the imported stop list.
STOP_WORDS = frozenset(word for word in STOP_WORDS if len(word) > 1)
class CustomSearchBackend(WhooshSearchBackend):
def build_schema(self, fields):
# rest of code
# ------
else:
schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(minsize=1, stoplist=STOP_WORDS), field_boost=field_class.boost)
Note: The build_schema
code may vary based on the haystack version. The above code was tested with whoosh==2.4
and haystack==2.0.0
Upvotes: 4