Skip to content

Commit 575f774

Browse files
authored
Enhance semantic field to allow to enable/disable chunking. (#1337)
* Implement the query logic for the semantic field. Signed-off-by: Bo Zhang <[email protected]> * Enhance semantic field to allow to enable/disable chunking. Signed-off-by: Bo Zhang <[email protected]> --------- Signed-off-by: Bo Zhang <[email protected]>
1 parent 979a9fc commit 575f774

27 files changed

+550
-313
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
1010
- [Semantic Field] Add semantic mapping transformer. ([#1276](https://github.com/opensearch-project/neural-search/pull/1276))
1111
- [Semantic Field] Add semantic ingest processor. ([#1309](https://github.com/opensearch-project/neural-search/pull/1309))
1212
- [Semantic Field] Implement the query logic for the semantic field. ([#1315](https://github.com/opensearch-project/neural-search/pull/1315))
13+
- [Semantic Field] Enhance semantic field to allow to enable/disable chunking. ([#1337](https://github.com/opensearch-project/neural-search/pull/1337))
1314

1415
### Enhancements
1516
- [Performance Improvement] Add custom bulk scorer for hybrid query (2-3x faster) ([#1289](https://github.com/opensearch-project/neural-search/pull/1289))

src/main/java/org/opensearch/neuralsearch/constants/SemanticFieldConstants.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,4 +35,10 @@ public class SemanticFieldConstants {
3535
* Default suffix for semantic info field name. It will be used to construct the field name of the semantic info.
3636
*/
3737
public static final String DEFAULT_SEMANTIC_INFO_FIELD_NAME_SUFFIX = "_semantic_info";
38+
39+
/**
40+
* Name of the field that controls whether chunking is performed for the semantic field. By default, chunking is
41+
* disabled so that search performance is not degraded.
42+
*/
43+
public static final String CHUNKING = "chunking";
3844
}

src/main/java/org/opensearch/neuralsearch/constants/SemanticInfoFieldConstants.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ public class SemanticInfoFieldConstants {
2323

2424
public static final String CHUNKS_FIELD_NAME = "chunks";
2525
public static final String CHUNKS_TEXT_FIELD_NAME = "text";
26-
public static final String CHUNKS_EMBEDDING_FIELD_NAME = "embedding";
26+
public static final String EMBEDDING_FIELD_NAME = "embedding";
2727

2828
public static final String MODEL_FIELD_NAME = "model";
2929
public static final String MODEL_ID_FIELD_NAME = "id";

src/main/java/org/opensearch/neuralsearch/mapper/SemanticFieldMapper.java

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@
3030
import java.util.Map;
3131
import java.util.Set;
3232

33+
import static org.opensearch.neuralsearch.constants.MappingConstants.PATH_SEPARATOR;
34+
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.CHUNKING;
35+
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.DEFAULT_SEMANTIC_INFO_FIELD_NAME_SUFFIX;
3336
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.MODEL_ID;
3437
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.RAW_FIELD_TYPE;
3538
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.SEARCH_MODEL_ID;
@@ -127,6 +130,14 @@ public static class Builder extends ParametrizedFieldMapper.Builder {
127130
null
128131
);
129132

133+
@Getter
134+
protected final Parameter<Boolean> chunkingEnabled = Parameter.boolParam(
135+
CHUNKING,
136+
false,
137+
m -> ((SemanticFieldMapper) m).semanticParameters.getChunkingEnabled(),
138+
false
139+
);
140+
130141
@Setter
131142
protected ParametrizedFieldMapper.Builder delegateBuilder;
132143

@@ -136,7 +147,7 @@ protected Builder(String name) {
136147

137148
@Override
138149
protected List<Parameter<?>> getParameters() {
139-
return List.of(modelId, searchModelId, rawFieldType, semanticInfoFieldName);
150+
return List.of(modelId, searchModelId, rawFieldType, semanticInfoFieldName, chunkingEnabled);
140151
}
141152

142153
@Override
@@ -157,12 +168,13 @@ public SemanticFieldMapper build(BuilderContext context) {
157168
}
158169

159170
public SemanticParameters getSemanticParameters() {
160-
return new SemanticParameters(
161-
modelId.getValue(),
162-
searchModelId.getValue(),
163-
rawFieldType.getValue(),
164-
semanticInfoFieldName.getValue()
165-
);
171+
return SemanticParameters.builder()
172+
.modelId(modelId.getValue())
173+
.searchModelId(searchModelId.getValue())
174+
.rawFieldType(rawFieldType.getValue())
175+
.semanticInfoFieldName(semanticInfoFieldName.getValue())
176+
.chunkingEnabled(chunkingEnabled.getValue())
177+
.build();
166178
}
167179
}
168180

@@ -249,6 +261,15 @@ public SemanticFieldType(@NonNull final MappedFieldType delegate, @NonNull final
249261
public String typeName() {
250262
return SemanticFieldMapper.CONTENT_TYPE;
251263
}
264+
265+
public String getSemanticInfoFieldPath() {
266+
final String[] paths = name().split("\\.");
267+
final String semanticInfoFieldName = semanticParameters.getSemanticInfoFieldName();
268+
paths[paths.length - 1] = semanticInfoFieldName == null
269+
? paths[paths.length - 1] + DEFAULT_SEMANTIC_INFO_FIELD_NAME_SUFFIX
270+
: semanticInfoFieldName;
271+
return String.join(PATH_SEPARATOR, paths);
272+
}
252273
}
253274

254275
@Override

src/main/java/org/opensearch/neuralsearch/mapper/dto/SemanticParameters.java

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,18 @@
44
*/
55
package org.opensearch.neuralsearch.mapper.dto;
66

7+
import lombok.Builder;
78
import lombok.Getter;
89

910
/**
1011
* A DTO to hold all the semantic parameters.
1112
*/
1213
@Getter
14+
@Builder
1315
public class SemanticParameters {
1416
private final String modelId;
1517
private final String searchModelId;
1618
private final String rawFieldType;
1719
private final String semanticInfoFieldName;
18-
19-
public SemanticParameters(String modelId, String searchModelId, String rawFieldType, String semanticInfoFieldName) {
20-
this.modelId = modelId;
21-
this.searchModelId = searchModelId;
22-
this.rawFieldType = rawFieldType;
23-
this.semanticInfoFieldName = semanticInfoFieldName;
24-
}
20+
private final Boolean chunkingEnabled;
2521
}

src/main/java/org/opensearch/neuralsearch/mappingtransformer/SemanticInfoConfigBuilder.java

Lines changed: 34 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
package org.opensearch.neuralsearch.mappingtransformer;
66

77
import org.opensearch.core.xcontent.NamedXContentRegistry;
8-
import org.opensearch.index.mapper.MapperService;
98
import org.opensearch.index.mapper.ObjectMapper;
109
import org.opensearch.index.mapper.RankFeaturesFieldMapper;
1110
import org.opensearch.index.mapper.TextFieldMapper;
@@ -22,7 +21,7 @@
2221

2322
import static org.opensearch.neuralsearch.constants.MappingConstants.PROPERTIES;
2423
import static org.opensearch.neuralsearch.constants.MappingConstants.TYPE;
25-
import static org.opensearch.neuralsearch.constants.SemanticInfoFieldConstants.CHUNKS_EMBEDDING_FIELD_NAME;
24+
import static org.opensearch.neuralsearch.constants.SemanticInfoFieldConstants.EMBEDDING_FIELD_NAME;
2625
import static org.opensearch.neuralsearch.constants.SemanticInfoFieldConstants.CHUNKS_FIELD_NAME;
2726
import static org.opensearch.neuralsearch.constants.SemanticInfoFieldConstants.CHUNKS_TEXT_FIELD_NAME;
2827
import static org.opensearch.neuralsearch.constants.SemanticInfoFieldConstants.DEFAULT_MODEL_CONFIG;
@@ -45,6 +44,7 @@ public class SemanticInfoConfigBuilder {
4544
private String spaceType;
4645
private String knnMethodName = KNN_VECTOR_METHOD_DEFAULT_NAME;
4746
private Integer embeddingDimension;
47+
private Boolean chunkingEnabled;
4848

4949
public SemanticInfoConfigBuilder(@NonNull final NamedXContentRegistry xContentRegistry) {
5050
this.xContentRegistry = xContentRegistry;
@@ -72,7 +72,7 @@ public SemanticInfoConfigBuilder(@NonNull final NamedXContentRegistry xContentRe
7272
* @return Config of the semantic info fields.
7373
*/
7474
public Map<String, Object> build() {
75-
Map<String, Object> embeddingFieldConfig = switch (embeddingFieldType) {
75+
final Map<String, Object> embeddingFieldConfig = switch (embeddingFieldType) {
7676
case KNNVectorFieldMapper.CONTENT_TYPE -> buildKnnFieldConfig();
7777
case RankFeaturesFieldMapper.CONTENT_TYPE -> buildRankFeaturesFieldConfig();
7878
default -> throw new IllegalArgumentException(
@@ -84,14 +84,17 @@ public Map<String, Object> build() {
8484
);
8585
};
8686

87-
final Map<String, Object> chunksConfig = Map.of(
88-
TYPE,
89-
ObjectMapper.NESTED_CONTENT_TYPE,
90-
PROPERTIES,
91-
Map.of(CHUNKS_TEXT_FIELD_NAME, Map.of(TYPE, TextFieldMapper.CONTENT_TYPE), CHUNKS_EMBEDDING_FIELD_NAME, embeddingFieldConfig)
92-
);
93-
94-
return Map.of(PROPERTIES, Map.of(CHUNKS_FIELD_NAME, chunksConfig, MODEL_FIELD_NAME, DEFAULT_MODEL_CONFIG));
87+
if (chunkingEnabled) {
88+
final Map<String, Object> chunksConfig = Map.of(
89+
TYPE,
90+
ObjectMapper.NESTED_CONTENT_TYPE,
91+
PROPERTIES,
92+
Map.of(CHUNKS_TEXT_FIELD_NAME, Map.of(TYPE, TextFieldMapper.CONTENT_TYPE), EMBEDDING_FIELD_NAME, embeddingFieldConfig)
93+
);
94+
return Map.of(PROPERTIES, Map.of(CHUNKS_FIELD_NAME, chunksConfig, MODEL_FIELD_NAME, DEFAULT_MODEL_CONFIG));
95+
} else {
96+
return Map.of(PROPERTIES, Map.of(EMBEDDING_FIELD_NAME, embeddingFieldConfig, MODEL_FIELD_NAME, DEFAULT_MODEL_CONFIG));
97+
}
9598
}
9699

97100
private Map<String, Object> buildKnnFieldConfig() {
@@ -158,26 +161,22 @@ private void extractInfoForTextEmbeddingModel(@NonNull final MLModel mlModel, @N
158161

159162
this.embeddingDimension = textEmbeddingModelConfig.getEmbeddingDimension();
160163

161-
final Map<String, Object> allConfigMap;
162-
try {
163-
allConfigMap = MapperService.parseMapping(xContentRegistry, textEmbeddingModelConfig.getAllConfig());
164-
} catch (Exception e) {
165-
throw new IllegalArgumentException(
166-
String.format(
167-
Locale.ROOT,
168-
"Failed to parse the all_config of the model %s. Invalid all_config: %s",
169-
modelId,
170-
textEmbeddingModelConfig.getAllConfig()
171-
)
172-
);
164+
final Map<String, Object> additionalConfig = textEmbeddingModelConfig.getAdditionalConfig();
165+
if (additionalConfig != null) {
166+
final Object spaceTypeObject = additionalConfig.get(KNN_VECTOR_METHOD_SPACE_TYPE_FIELD_NAME);
167+
if (spaceTypeObject instanceof String == false) {
168+
throw createInvalidSpaceTypeException(modelId);
169+
}
170+
this.spaceType = (String) spaceTypeObject;
171+
} else {
172+
throw createInvalidSpaceTypeException(modelId);
173173
}
174-
final Object spaceTypeObject = allConfigMap.get(KNN_VECTOR_METHOD_SPACE_TYPE_FIELD_NAME);
175-
if (spaceTypeObject instanceof String == false) {
176-
throw new IllegalArgumentException(
177-
String.format(Locale.ROOT, "space_type is not defined or not a string in the all_config of the model %s.", modelId)
178-
);
179-
}
180-
this.spaceType = (String) spaceTypeObject;
174+
}
175+
176+
private IllegalArgumentException createInvalidSpaceTypeException(@NonNull final String modelId) {
177+
return new IllegalArgumentException(
178+
String.format(Locale.ROOT, "space_type is not defined or not a string in the additional_config of the model %s.", modelId)
179+
);
181180
}
182181

183182
private void extractInfoForSparseModel() {
@@ -213,4 +212,9 @@ private String getUnsupportedRemoteModelError(final String modelType, @lombok.No
213212
String.join(",", SUPPORTED_REMOTE_MODEL_TYPES)
214213
);
215214
}
215+
216+
public SemanticInfoConfigBuilder chunkingEnabled(final Boolean chunkingEnabled) {
217+
this.chunkingEnabled = chunkingEnabled;
218+
return this;
219+
}
216220
}

src/main/java/org/opensearch/neuralsearch/mappingtransformer/SemanticMappingTransformer.java

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.SEMANTIC_INFO_FIELD_NAME;
2828
import static org.opensearch.neuralsearch.util.SemanticMappingUtils.collectSemanticField;
2929
import static org.opensearch.neuralsearch.util.SemanticMappingUtils.extractModelIdToFieldPathMap;
30+
import static org.opensearch.neuralsearch.util.SemanticMappingUtils.isChunkingEnabled;
3031
import static org.opensearch.neuralsearch.util.SemanticMappingUtils.getProperties;
3132
import static org.opensearch.neuralsearch.util.SemanticMappingUtils.validateModelId;
3233
import static org.opensearch.neuralsearch.util.SemanticMappingUtils.validateSemanticInfoFieldName;
@@ -190,8 +191,8 @@ private void modifyMappings(
190191
final List<String> fieldPathList = modelIdToFieldPathMap.get(modelId);
191192
for (String fieldPath : fieldPathList) {
192193
try {
193-
final Map<String, Object> semanticInfoConfig = createSemanticInfoField(mlModel, modelId);
194194
final Map<String, Object> fieldConfig = semanticFieldPathToConfigMap.get(fieldPath);
195+
final Map<String, Object> semanticInfoConfig = createSemanticInfoField(mlModel, modelId, fieldConfig, fieldPath);
195196
setSemanticInfoField(mappings, fieldPath, fieldConfig.get(SEMANTIC_INFO_FIELD_NAME), semanticInfoConfig);
196197
} catch (IllegalArgumentException e) {
197198
throw new IllegalArgumentException(getModifyMappingErrorMessage(fieldPath, e.getMessage()), e);
@@ -207,9 +208,16 @@ private String getModifyMappingErrorMessage(@NonNull final String fieldPath, fin
207208
}
208209

209210
@VisibleForTesting
210-
private Map<String, Object> createSemanticInfoField(final @NonNull MLModel modelConfig, String modelId) {
211-
SemanticInfoConfigBuilder builder = new SemanticInfoConfigBuilder(xContentRegistry);
212-
return builder.mlModel(modelConfig, modelId).build();
211+
private Map<String, Object> createSemanticInfoField(
212+
final @NonNull MLModel modelConfig,
213+
final String modelId,
214+
@NonNull final Map<String, Object> fieldConfig,
215+
String fieldPath
216+
) {
217+
final SemanticInfoConfigBuilder builder = new SemanticInfoConfigBuilder(xContentRegistry);
218+
builder.mlModel(modelConfig, modelId);
219+
builder.chunkingEnabled(isChunkingEnabled(fieldConfig, fieldPath));
220+
return builder.build();
213221
}
214222

215223
@SuppressWarnings("unchecked")

src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java

Lines changed: 2 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@
3333
import static org.opensearch.neuralsearch.processor.chunker.Chunker.DEFAULT_MAX_CHUNK_LIMIT;
3434
import static org.opensearch.neuralsearch.processor.chunker.Chunker.DISABLED_MAX_CHUNK_LIMIT;
3535
import static org.opensearch.neuralsearch.processor.chunker.Chunker.CHUNK_STRING_COUNT_FIELD;
36-
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseInteger;
3736
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerWithDefault;
3837

3938
/**
@@ -273,31 +272,11 @@ private void chunkMapType(
273272
}
274273
}
275274

276-
/**
277-
* Chunk the content, update the runtime max_chunk_limit and return the result
278-
*/
279-
private List<String> chunkString(final String content, final Map<String, Object> runTimeParameters) {
280-
// return an empty list for empty string
281-
if (StringUtils.isEmpty(content)) {
282-
return List.of();
283-
}
284-
List<String> contentResult = chunker.chunk(content, runTimeParameters);
285-
// update chunk_string_count for each string
286-
int chunkStringCount = parseInteger(runTimeParameters, CHUNK_STRING_COUNT_FIELD);
287-
runTimeParameters.put(CHUNK_STRING_COUNT_FIELD, chunkStringCount - 1);
288-
// update runtime max_chunk_limit if not disabled
289-
int runtimeMaxChunkLimit = parseInteger(runTimeParameters, MAX_CHUNK_LIMIT_FIELD);
290-
if (runtimeMaxChunkLimit != DISABLED_MAX_CHUNK_LIMIT) {
291-
runTimeParameters.put(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit - contentResult.size());
292-
}
293-
return contentResult;
294-
}
295-
296275
private List<String> chunkList(final List<String> contentList, final Map<String, Object> runTimeParameters) {
297276
// flatten original output format from List<List<String>> to List<String>
298277
List<String> result = new ArrayList<>();
299278
for (String content : contentList) {
300-
result.addAll(chunkString(content, runTimeParameters));
279+
result.addAll(chunker.chunkString(content, runTimeParameters));
301280
}
302281
return result;
303282
}
@@ -314,7 +293,7 @@ private List<String> chunkLeafType(final Object value, final Map<String, Object>
314293
if (StringUtils.isBlank(String.valueOf(value))) {
315294
return result;
316295
}
317-
result = chunkString(value.toString(), runTimeParameters);
296+
result = chunker.chunkString(value.toString(), runTimeParameters);
318297
} else if (isListOfString(value)) {
319298
result = chunkList((List<String>) value, runTimeParameters);
320299
}

0 commit comments

Comments
 (0)