Closed
Description
Describe the bug
I am migrating from Solr to OpenSearch and need to maintain the same analyzer behavior that I had in Solr, where the word_delimiter_graph filter is applied before the synonym expansion.
In Solr, this order worked without issues, and I used FlattenGraphFilterFactory to handle token graph flattening after synonym processing. I need to maintain this specific order to keep consistent search behavior during the migration. Any guidance or suggestions would be greatly appreciated.
Solr schema (excerpt showing the query analyzer of the field type being migrated):
</analyzer>
<analyzer type="query">
<charFilter class="solr.PatternReplaceCharFilterFactory" pattern="[({.,\[\]\“\”/})]" replacement=" " />
<tokenizer class="solr.WhitespaceTokenizerFactory" />
<filter class="solr.StemmerOverrideFilterFactory" dictionary="lang/stemdict.txt" ignoreCase="true"/>
<!--Move ASCII folding, lowerCase, and Hunspell before synonyms so that clean singular terms are sent to the synonyms-->
<filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="true" />
<filter class="solr.LowerCaseFilterFactory" />
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" preserveOriginal="0" catenateAll="1" splitOnCaseChange="0"/>
<filter class="solr.HunspellStemFilterFactory" dictionary="en-US.dic" affix="en-US.aff" ignoreCase="true" />
<filter class="solr.ManagedSynonymGraphFilterFactory" managed="english" />
<filter class="solr.PatternReplaceFilterFactory" pattern="(-)" replacement=" " replace="all" />
<filter class="solr.FlattenGraphFilterFactory" /> <!-- required on index analyzers after graph filters -->
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
</analyzer>
</fieldType>
OpenSearch index settings and mappings; the error occurs for the analyzers "text_en_index" and "text_en_query":
PUT /my-index-10
{
"settings": {
"analysis": {
"char_filter": {
"custom_pattern_replace": {
"type": "pattern_replace",
"pattern": "[({.,\\[\\]“”/})]",
"replacement": " "
}
},
"filter": {
"custom_ascii_folding": {
"type": "asciifolding",
"preserve_original": true
},
"custom_word_delimiter": {
"type": "word_delimiter_graph",
"generate_word_parts": true,
"preserve_original": false,
"catenate_all": true,
"split_on_numerics": false,
"split_on_case_change": false
},
"custom_synonym_filter": {
"type": "synonym",
"synonyms_path": "analyzers/F198001551",
"updateable": true
},
"custom_hunspell_stemmer": {
"type": "hunspell",
"locale": "en_US"
},
"custom_pattern_replace_filter":{
"type": "pattern_replace",
"pattern": "(-)",
"replacement": " ",
"all":true
},
"custom_stemmer_override":{
"type": "keyword_marker",
"keywords_path":"analyzers/F225794029",
"ignore_case": true
},
"custom_synonym_graph_filter":{
"type": "synonym_graph",
"synonyms_path": "analyzers/F3495229"
}
},
"analyzer": {
"text_en_index": {
"type":"custom",
"char_filter": ["custom_pattern_replace"],
"tokenizer": "whitespace",
"filter": [
"custom_ascii_folding",
"lowercase",
"custom_word_delimiter",
"custom_hunspell_stemmer",
"custom_synonym_graph_filter",
"custom_pattern_replace_filter",
"flatten_graph"
]
},
"text_en_query": {
"type":"custom",
"char_filter": ["custom_pattern_replace"],
"tokenizer": "whitespace",
"filter": [
"custom_stemmer_override",
"custom_ascii_folding",
"lowercase",
"custom_word_delimiter",
"custom_hunspell_stemmer",
"custom_synonym_graph_filter",
"custom_pattern_replace_filter",
"flatten_graph"
]
},
"text_id_tx_class_id":{
"tokenizer": "whitespace"
},
"text_general_index_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase"
]
},
"text_general_query_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"custom_synonym_filter"
]
},
"en": {
"tokenizer": "standard",
"filter": [ "custom_hunspell_stemmer" ]
},
"managed_synonym_analyzer":{
"tokenizer": "standard",
"filter": [ "custom_synonym_graph_filter" ]
}
}
}
},
"mappings": {
"properties": {
"id": {
"type": "keyword",
"index": true,
"store": true
},
"id_tx":{
"type": "text",
"analyzer": "text_id_tx_class_id",
"store": true,
"index" :true
},
"description": {
"type": "text",
"analyzer": "text_en_index",
"search_analyzer": "text_en_query",
"store": true,
"index" :true
},
"TM5":{
"type":"keyword",
"store": true,
"index" :true
},
"status":{
"type":"keyword",
"store": true,
"index" :true
},
"version":{
"type":"keyword",
"store": true,
"index" :true
},
"class": {
"type": "text",
"analyzer": "text_general_index_analyzer",
"search_analyzer": "text_general_query_analyzer",
"store": true
},
"long class":{
"type": "text",
"analyzer": "text_general_index_analyzer",
"search_analyzer": "text_general_query_analyzer",
"store": true
},
"class_id":{
"type":"text",
"analyzer":"text_id_tx_class_id",
"index":true,
"store":true
},
"notes":{
"type": "text",
"analyzer": "text_en_index",
"search_analyzer": "text_en_query",
"store": true,
"index" :true
},
"employee_notes":{
"type": "text",
"analyzer": "text_en_index",
"search_analyzer": "text_en_query",
"store": true,
"index" :true
},
"editor_notes":{
"type": "text",
"analyzer": "text_en_index",
"search_analyzer": "text_en_query",
"store": true,
"index" :true
},
"begin_effective_date":{
"type":"date",
"store":true,
"index": true,
"format":"strict_date_optional_time||epoch_millis"
},
"end_effective_date":{
"type":"date",
"store":true,
"index": true,
"format":"strict_date_optional_time||epoch_millis"
},
"goods_services":{
"type":"keyword",
"store":true,
"index":true
},
"record_state":{
"type":"keyword",
"store":true,
"index":true
},
"action_flag":{
"type":"keyword",
"store":true,
"index":true
},
"creation_date":{
"type":"date",
"store":true,
"index": true,
"format":"strict_date_optional_time||epoch_millis"
},
"created_by":{
"type":"keyword",
"store":true,
"index":true
},
"last_updated_date":{
"type":"date",
"store":true,
"index": true,
"format":"strict_date_optional_time||epoch_millis"
},
"last_updated_by":{
"type":"keyword",
"store":true,
"index":true
}
}
}
}
In OpenSearch, when I try to apply the word_delimiter_graph filter before the synonym_graph filter (as required by my use case), I receive the following error:
{
"error": {
"root_cause": [
{
"type": "illegal_argument_exception",
"reason": "Token filter [custom_word_delimiter] cannot be used to parse synonyms"
}
],
"type": "illegal_argument_exception",
"reason": "Token filter [custom_word_delimiter] cannot be used to parse synonyms"
},
"status": 400
}
Questions:
- Is there a known limitation in OpenSearch that prevents the word_delimiter_graph filter from being applied before the synonym_graph filter?
- Is there any recommended workaround or configuration that would allow me to maintain the same filter order while avoiding the token graph parsing error?
Related component
Other
To Reproduce
N/A
Expected behavior
N/A
Additional Details
Plugins
Please list all plugins currently enabled.
Screenshots
If applicable, add screenshots to help explain your problem.
Host/Environment (please complete the following information):
- OS: [e.g. iOS]
- Version [e.g. 22]
Additional context
Add any other context about the problem here.