user61034

Reputation: 278

Custom analyzer for Elasticsearch

I am trying to use the Sudachi plugin with Elasticsearch from Python. I have verified that the plugin has been installed, that it works fine when I apply the JSON settings directly, and that the paths exist. However, I am not able to integrate it into my Python code. I suspect I have the tokenizer and/or analyzer definition wrong, but the documentation is not very clear to me. Any help in overcoming the error below is welcome. Thanks!

    from elasticsearch import Elasticsearch
    from elasticsearch_dsl.connections import connections
    from elasticsearch_dsl import analyzer, tokenizer, Document, Text

    INDEX_NAME = 'my_text_index'

    class DocumentObject(Document):
        body = Text(analyzer='sudachi_analyzer')
        class Index:
            name = INDEX_NAME


    host = 'localhost'
    port = 9200
    connection = connections.create_connection(hosts=[{'host':host,'port':port}])
    es = Elasticsearch(hosts=[{'host':host,'port':port}]) 
    try:
        # close the index if it already exists; ignore the error if it doesn't
        es.indices.close(index=INDEX_NAME)
    except Exception:
        pass

    sudachi_tokenizer = tokenizer(
            "sudachi_tokenizer",
            type="sudachi_tokenizer",
            mode="normal",
            discard_punctuation="true",
            resources_path="/Users/.../sudachi",
            settings_path="/Users/.../sudachi/sudachi.json"
        )
    sudachi_analyzer = analyzer(
        "sudachi_analyzer",
        tokenizer=sudachi_tokenizer,
        filter=[
                "sudachi_part_of_speech",
                "sudachi_ja_stop",
                "sudachi_normalizedform"
            ],
        type="custom",
        char_filter= []
        )

    DocumentObject.init()

This is the error I get:

    elasticsearch.exceptions.RequestError: RequestError(400, 'mapper_parsing_exception', 'analyzer [sudachi_analyzer] not found for field [body]')

For reference, this is the JSON for the Sudachi analyzer, which is working:

   "index": {
      "analysis": {
        "tokenizer": {
          "sudachi_tokenizer": {
            "type": "sudachi_tokenizer",
            "mode": "normal",
            "discard_punctuation": true,
            "resources_path": "/Users/.../sudachi",
            "settings_path": "/Users/.../sudachi/sudachi.json"
          }
        },
        "analyzer": {
          "sudachi_analyzer": {
            "tokenizer": "sudachi_tokenizer",
            "type": "custom",
            "char_filter": [],
            "filter": [
              "sudachi_part_of_speech",
              "sudachi_ja_stop",
              "sudachi_normalizedform"
            ]
          }
        }
      }
    }
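
For completeness, the same settings can also be applied from Python through the low-level client. A minimal sketch (index name and elided paths taken from the code above):

    from elasticsearch import Elasticsearch

    es = Elasticsearch(hosts=[{'host': 'localhost', 'port': 9200}])

    # mirrors the working "index" / "analysis" JSON above
    settings = {
        "index": {
            "analysis": {
                "tokenizer": {
                    "sudachi_tokenizer": {
                        "type": "sudachi_tokenizer",
                        "mode": "normal",
                        "discard_punctuation": True,
                        "resources_path": "/Users/.../sudachi",
                        "settings_path": "/Users/.../sudachi/sudachi.json"
                    }
                },
                "analyzer": {
                    "sudachi_analyzer": {
                        "type": "custom",
                        "tokenizer": "sudachi_tokenizer",
                        "char_filter": [],
                        "filter": [
                            "sudachi_part_of_speech",
                            "sudachi_ja_stop",
                            "sudachi_normalizedform"
                        ]
                    }
                }
            }
        }
    }

    es.indices.create(index='my_text_index', body={"settings": settings})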

Upvotes: 0

Views: 1450

Answers (1)

Val

Reputation: 217274

Do it like this: you need to reference your custom analyzer instance, not its name:

    from elasticsearch import Elasticsearch
    from elasticsearch_dsl.connections import connections
    from elasticsearch_dsl import analyzer, tokenizer, Document, Text

    INDEX_NAME = 'my_text_index'

    sudachi_tokenizer = tokenizer(
        "sudachi_tokenizer",
        type="sudachi_tokenizer",
        mode="normal",
        discard_punctuation="true",
        resources_path="/Users/.../sudachi",
        settings_path="/Users/.../sudachi/sudachi.json"
    )
    sudachi_analyzer = analyzer(
        "sudachi_analyzer",
        tokenizer=sudachi_tokenizer,
        filter=[
            "sudachi_part_of_speech",
            "sudachi_ja_stop",
            "sudachi_normalizedform"
        ],
        type="custom",
        char_filter=[]
    )

    class DocumentObject(Document):
        body = Text(analyzer=sudachi_analyzer)  # <-- the analyzer instance, not a string
        class Index:
            name = INDEX_NAME


    host = 'localhost'
    port = 9200
    connection = connections.create_connection(hosts=[{'host': host, 'port': port}])
    es = Elasticsearch(hosts=[{'host': host, 'port': port}])

    DocumentObject.init()
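
As a quick sanity check (a minimal sketch, assuming init() succeeded; the sample text here is arbitrary Japanese), you can run the registered analyzer against some text through the _analyze API:

    # confirm the analyzer resolves on the index and tokenizes as expected
    result = es.indices.analyze(
        index=INDEX_NAME,
        body={"analyzer": "sudachi_analyzer", "text": "関西国際空港"}
    )
    print([token["token"] for token in result["tokens"]])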

Upvotes: 1
