Skip to content

[BUG] Token Filter Order: word_delimiter_graph and synonym_graph #16263

Closed
@aswad1

Description

@aswad1

Describe the bug

I am migrating from Solr to OpenSearch and need to maintain the same analyzer behavior that I had in Solr, where the word_delimiter_graph filter is applied before the synonym expansion.

In Solr, this order worked without issues, and I used FlattenGraphFilterFactory to handle token graph flattening after synonym processing. I need to maintain this specific order to keep consistent search behavior during the migration. Any guidance or suggestions would be greatly appreciated.

Solr schema:

     </analyzer>
     <analyzer type="query">
        <!-- Query-time analysis chain: punctuation matched by the char filter pattern is replaced with a space, the input is split on whitespace, and the token filters below are applied in the listed order. -->
        <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="[({.,\[\]\“\”/})]" replacement=" " />
        <tokenizer class="solr.WhitespaceTokenizerFactory" />
		<filter class="solr.StemmerOverrideFilterFactory" dictionary="lang/stemdict.txt" ignoreCase="true"/>
        <!-- ASCII folding, lowercasing, word-delimiter splitting and Hunspell stemming are placed before the synonym filter so that clean singular terms are sent to the synonyms -->
     <filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="true" />
        <filter class="solr.LowerCaseFilterFactory" />
        <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" preserveOriginal="0" catenateAll="1" splitOnCaseChange="0"/>
        <filter class="solr.HunspellStemFilterFactory" dictionary="en-US.dic" affix="en-US.aff" ignoreCase="true" />
		<filter class="solr.ManagedSynonymGraphFilterFactory" managed="english" />
        <filter class="solr.PatternReplaceFilterFactory" pattern="(-)" replacement=" " replace="all" />
        <filter class="solr.FlattenGraphFilterFactory" /> <!-- Flattens the token graph produced by the graph filters above. NOTE(review): flattening is normally required only on index analyzers after graph filters; confirm it is intended in this query analyzer. -->
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
     </analyzer>
  </fieldType>

OpenSearch mapping, where I am getting the error for the analyzers "text_en_index" and "text_en_query":

PUT /my-index-10
{
  "settings": {
    "analysis": {
      "char_filter": {
        "custom_pattern_replace": {
          "type": "pattern_replace",
          "pattern": "[({.,\\[\\]“”/})]",
          "replacement": " "
        }
      },
      "filter": {
        "custom_ascii_folding": {
          "type": "asciifolding",
          "preserve_original": true
        },
        "custom_word_delimiter": {
          "type": "word_delimiter_graph",
          "generate_word_parts": true,
          "preserve_original": false,
          "catenate_all": true,
          "split_on_numerics": false,
          "split_on_case_change": false
        },
        "custom_synonym_filter": {
          "type": "synonym",
          "synonyms_path": "analyzers/F198001551",
          "updateable": true
        },
        "custom_hunspell_stemmer": {
          "type": "hunspell",
          "locale": "en_US"
        },
        "custom_pattern_replace_filter": {
          "type": "pattern_replace",
          "pattern": "(-)",
          "replacement": " ",
          "all": true
        },
        "custom_stemmer_override": {
          "type": "keyword_marker",
          "keywords_path": "analyzers/F225794029",
          "ignore_case": true
        },
        "custom_synonym_graph_filter": {
          "type": "synonym_graph",
          "synonyms_path": "analyzers/F3495229"
        }
      },
      "analyzer": {
        "text_en_index": {
          "type": "custom",
          "char_filter": ["custom_pattern_replace"],
          "tokenizer": "whitespace",
          "filter": [
            "custom_ascii_folding",
            "lowercase",
            "custom_word_delimiter",
            "custom_hunspell_stemmer",
            "custom_synonym_graph_filter",
            "custom_pattern_replace_filter",
            "flatten_graph"
          ]
        },
        "text_en_query": {
          "type": "custom",
          "char_filter": ["custom_pattern_replace"],
          "tokenizer": "whitespace",
          "filter": [
            "custom_stemmer_override",
            "custom_ascii_folding",
            "lowercase",
            "custom_word_delimiter",
            "custom_hunspell_stemmer",
            "custom_synonym_graph_filter",
            "custom_pattern_replace_filter",
            "flatten_graph"
          ]
        },
        "text_id_tx_class_id": {
          "tokenizer": "whitespace"
        },
        "text_general_index_analyzer": {
          "tokenizer": "standard",
          "filter": ["lowercase"]
        },
        "text_general_query_analyzer": {
          "tokenizer": "standard",
          "filter": ["lowercase", "custom_synonym_filter"]
        },
        "en": {
          "tokenizer": "standard",
          "filter": ["custom_hunspell_stemmer"]
        },
        "managed_synonym_analyzer": {
          "tokenizer": "standard",
          "filter": ["custom_synonym_graph_filter"]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "id": {
        "type": "keyword",
        "index": true,
        "store": true
      },
      "id_tx": {
        "type": "text",
        "analyzer": "text_id_tx_class_id",
        "index": true,
        "store": true
      },
      "description": {
        "type": "text",
        "analyzer": "text_en_index",
        "search_analyzer": "text_en_query",
        "index": true,
        "store": true
      },
      "TM5": {
        "type": "keyword",
        "index": true,
        "store": true
      },
      "status": {
        "type": "keyword",
        "index": true,
        "store": true
      },
      "version": {
        "type": "keyword",
        "index": true,
        "store": true
      },
      "class": {
        "type": "text",
        "analyzer": "text_general_index_analyzer",
        "search_analyzer": "text_general_query_analyzer",
        "store": true
      },
      "long class": {
        "type": "text",
        "analyzer": "text_general_index_analyzer",
        "search_analyzer": "text_general_query_analyzer",
        "store": true
      },
      "class_id": {
        "type": "text",
        "analyzer": "text_id_tx_class_id",
        "index": true,
        "store": true
      },
      "notes": {
        "type": "text",
        "analyzer": "text_en_index",
        "search_analyzer": "text_en_query",
        "index": true,
        "store": true
      },
      "employee_notes": {
        "type": "text",
        "analyzer": "text_en_index",
        "search_analyzer": "text_en_query",
        "index": true,
        "store": true
      },
      "editor_notes": {
        "type": "text",
        "analyzer": "text_en_index",
        "search_analyzer": "text_en_query",
        "index": true,
        "store": true
      },
      "begin_effective_date": {
        "type": "date",
        "format": "strict_date_optional_time||epoch_millis",
        "index": true,
        "store": true
      },
      "end_effective_date": {
        "type": "date",
        "format": "strict_date_optional_time||epoch_millis",
        "index": true,
        "store": true
      },
      "goods_services": {
        "type": "keyword",
        "index": true,
        "store": true
      },
      "record_state": {
        "type": "keyword",
        "index": true,
        "store": true
      },
      "action_flag": {
        "type": "keyword",
        "index": true,
        "store": true
      },
      "creation_date": {
        "type": "date",
        "format": "strict_date_optional_time||epoch_millis",
        "index": true,
        "store": true
      },
      "created_by": {
        "type": "keyword",
        "index": true,
        "store": true
      },
      "last_updated_date": {
        "type": "date",
        "format": "strict_date_optional_time||epoch_millis",
        "index": true,
        "store": true
      },
      "last_updated_by": {
        "type": "keyword",
        "index": true,
        "store": true
      }
    }
  }
}

In OpenSearch, when I try to apply the word_delimiter_graph filter before the synonym_graph filter (as required by my use case), I receive the following error:

{
  "error": {
    "root_cause": [
      {
        "type": "illegal_argument_exception",
        "reason": "Token filter [custom_word_delimiter] cannot be used to parse synonyms"
      }
    ],
    "type": "illegal_argument_exception",
    "reason": "Token filter [custom_word_delimiter] cannot be used to parse synonyms"
  },
  "status": 400
}

Questions:

  1. Is there a known limitation in OpenSearch that prevents the word_delimiter_graph filter from being applied before the synonym_graph filter?
  2. Is there any recommended workaround or configuration that would allow me to maintain the same filter order while avoiding the token graph parsing error?

Related component

Other

To Reproduce

N/A

Expected behavior

N/A

Additional Details

Plugins
Please list all plugins currently enabled.

Screenshots
If applicable, add screenshots to help explain your problem.

Host/Environment (please complete the following information):

  • OS: [e.g. iOS]
  • Version [e.g. 22]

Additional context
Add any other context about the problem here.

Metadata

Metadata

Labels

Other · bug (Something isn't working) · v2.19.0 (Issues and PRs related to version 2.19.0)

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions