Skip to content

Commit 9ebe82b

Browse files
hasnain2808 and evankielley
authored and committed
Adding access to noSubMatches and noOverlappingMatches in HyphenationCompoundWordTokenFilter (opensearch-project#13895)
* Adding access to noSubMatches and noOverlappingMatches in HyphenationCompoundWordTokenFilter
  Signed-off-by: Evan Kielley <[email protected]>
* Add Changelog Entry
  Signed-off-by: Mohammad Hasnain Mohsin Rajan <[email protected]>
* test: add hyphenation decompounder tests
  Signed-off-by: Mohammad Hasnain <[email protected]>
* test: refactor tests
  Signed-off-by: Mohammad Hasnain <[email protected]>
* test: reformat test files
  Signed-off-by: Mohammad Hasnain <[email protected]>
* chore: add changelog entry for 2.X
  Signed-off-by: Mohammad Hasnain <[email protected]>
* chore: remove 3.x changelog
  Signed-off-by: Mohammad Hasnain <[email protected]>
* chore: commonify settingsarr
  Signed-off-by: Mohammad Hasnain <[email protected]>
* chore: commonify settingsarr
  Signed-off-by: Mohammad Hasnain <[email protected]>
* chore: linting
  Signed-off-by: Mohammad Hasnain <[email protected]>
---------
Signed-off-by: Evan Kielley <[email protected]>
Signed-off-by: Mohammad Hasnain Mohsin Rajan <[email protected]>
Signed-off-by: Mohammad Hasnain <[email protected]>
Co-authored-by: Evan Kielley <[email protected]>
1 parent aba4caa commit 9ebe82b

File tree

6 files changed

+1313
-16
lines changed

6 files changed

+1313
-16
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
2121
- [Workload Management] QueryGroup resource tracking framework changes ([#13897](https://github.com/opensearch-project/OpenSearch/pull/13897))
2222
- Support filtering on a large list encoded by bitmap ([#14774](https://github.com/opensearch-project/OpenSearch/pull/14774))
2323
- Add slice execution listeners to SearchOperationListener interface ([#15153](https://github.com/opensearch-project/OpenSearch/pull/15153))
24+
- Adding access to noSubMatches and noOverlappingMatches in Hyphenation ([#13895](https://github.com/opensearch-project/OpenSearch/pull/13895))
2425

2526
### Dependencies
2627
- Bump `netty` from 4.1.111.Final to 4.1.112.Final ([#15081](https://github.com/opensearch-project/OpenSearch/pull/15081))

modules/analysis-common/src/main/java/org/opensearch/analysis/common/HyphenationCompoundWordTokenFilterFactory.java

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -54,11 +54,16 @@
5454
*/
5555
public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundWordTokenFilterFactory {
5656

57+
private final boolean noSubMatches;
58+
private final boolean noOverlappingMatches;
5759
private final HyphenationTree hyphenationTree;
5860

5961
HyphenationCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
6062
super(indexSettings, env, name, settings);
6163

64+
noSubMatches = settings.getAsBoolean("no_sub_matches", false);
65+
noOverlappingMatches = settings.getAsBoolean("no_overlapping_matches", false);
66+
6267
String hyphenationPatternsPath = settings.get("hyphenation_patterns_path", null);
6368
if (hyphenationPatternsPath == null) {
6469
throw new IllegalArgumentException("hyphenation_patterns_path is a required setting.");
@@ -85,7 +90,9 @@ public TokenStream create(TokenStream tokenStream) {
8590
minWordSize,
8691
minSubwordSize,
8792
maxSubwordSize,
88-
onlyLongestMatch
93+
onlyLongestMatch,
94+
noSubMatches,
95+
noOverlappingMatches
8996
);
9097
}
9198
}

modules/analysis-common/src/test/java/org/opensearch/analysis/common/CompoundAnalysisTests.java

Lines changed: 52 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -50,8 +50,12 @@
5050
import org.opensearch.test.IndexSettingsModule;
5151
import org.opensearch.test.OpenSearchTestCase;
5252
import org.hamcrest.MatcherAssert;
53+
import org.junit.Before;
5354

5455
import java.io.IOException;
56+
import java.io.InputStream;
57+
import java.nio.file.Files;
58+
import java.nio.file.Path;
5559
import java.util.ArrayList;
5660
import java.util.Arrays;
5761
import java.util.List;
@@ -63,17 +67,27 @@
6367
import static org.hamcrest.Matchers.instanceOf;
6468

6569
public class CompoundAnalysisTests extends OpenSearchTestCase {
70+
71+
Settings[] settingsArr;
72+
73+
@Before
74+
public void initialize() throws IOException {
75+
final Path home = createTempDir();
76+
copyHyphenationPatternsFile(home);
77+
this.settingsArr = new Settings[] { getJsonSettings(home), getYamlSettings(home) };
78+
}
79+
6680
public void testDefaultsCompoundAnalysis() throws Exception {
67-
Settings settings = getJsonSettings();
68-
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
69-
AnalysisModule analysisModule = createAnalysisModule(settings);
70-
TokenFilterFactory filterFactory = analysisModule.getAnalysisRegistry().buildTokenFilterFactories(idxSettings).get("dict_dec");
71-
MatcherAssert.assertThat(filterFactory, instanceOf(DictionaryCompoundWordTokenFilterFactory.class));
81+
for (Settings settings : this.settingsArr) {
82+
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
83+
AnalysisModule analysisModule = createAnalysisModule(settings);
84+
TokenFilterFactory filterFactory = analysisModule.getAnalysisRegistry().buildTokenFilterFactories(idxSettings).get("dict_dec");
85+
MatcherAssert.assertThat(filterFactory, instanceOf(DictionaryCompoundWordTokenFilterFactory.class));
86+
}
7287
}
7388

7489
public void testDictionaryDecompounder() throws Exception {
75-
Settings[] settingsArr = new Settings[] { getJsonSettings(), getYamlSettings() };
76-
for (Settings settings : settingsArr) {
90+
for (Settings settings : this.settingsArr) {
7791
List<String> terms = analyze(settings, "decompoundingAnalyzer", "donaudampfschiff spargelcremesuppe");
7892
MatcherAssert.assertThat(terms.size(), equalTo(8));
7993
MatcherAssert.assertThat(
@@ -83,6 +97,26 @@ public void testDictionaryDecompounder() throws Exception {
8397
}
8498
}
8599

100+
// Hyphenation Decompounder tests mimic the behavior of lucene tests
101+
// lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestHyphenationCompoundWordTokenFilterFactory.java
102+
public void testHyphenationDecompounder() throws Exception {
103+
for (Settings settings : this.settingsArr) {
104+
List<String> terms = analyze(settings, "hyphenationAnalyzer", "min veninde som er lidt af en læsehest");
105+
MatcherAssert.assertThat(terms.size(), equalTo(10));
106+
MatcherAssert.assertThat(terms, hasItems("min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest"));
107+
}
108+
}
109+
110+
// Hyphenation Decompounder tests mimic the behavior of lucene tests
111+
// lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestHyphenationCompoundWordTokenFilterFactory.java
112+
public void testHyphenationDecompounderNoSubMatches() throws Exception {
113+
for (Settings settings : this.settingsArr) {
114+
List<String> terms = analyze(settings, "hyphenationAnalyzerNoSubMatches", "basketballkurv");
115+
MatcherAssert.assertThat(terms.size(), equalTo(3));
116+
MatcherAssert.assertThat(terms, hasItems("basketballkurv", "basketball", "kurv"));
117+
}
118+
}
119+
86120
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
87121
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
88122
AnalysisModule analysisModule = createAnalysisModule(settings);
@@ -111,21 +145,28 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
111145
}));
112146
}
113147

114-
private Settings getJsonSettings() throws IOException {
148+
private void copyHyphenationPatternsFile(Path home) throws IOException {
149+
InputStream hyphenation_patterns_path = getClass().getResourceAsStream("da_UTF8.xml");
150+
Path config = home.resolve("config");
151+
Files.createDirectory(config);
152+
Files.copy(hyphenation_patterns_path, config.resolve("da_UTF8.xml"));
153+
}
154+
155+
private Settings getJsonSettings(Path home) throws IOException {
115156
String json = "/org/opensearch/analysis/common/test1.json";
116157
return Settings.builder()
117158
.loadFromStream(json, getClass().getResourceAsStream(json), false)
118159
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
119-
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
160+
.put(Environment.PATH_HOME_SETTING.getKey(), home.toString())
120161
.build();
121162
}
122163

123-
private Settings getYamlSettings() throws IOException {
164+
private Settings getYamlSettings(Path home) throws IOException {
124165
String yaml = "/org/opensearch/analysis/common/test1.yml";
125166
return Settings.builder()
126167
.loadFromStream(yaml, getClass().getResourceAsStream(yaml), false)
127168
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
128-
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
169+
.put(Environment.PATH_HOME_SETTING.getKey(), home.toString())
129170
.build();
130171
}
131172
}

0 commit comments

Comments
 (0)