opensearch-project · msfroh · Jun 3, 2024 · May 7, 2024 · May 7, 2024 · May 24, 2024
@@ -8,6 +8,7 @@
 
 package org.opensearch.index.mapper;
 
+import org.opensearch.Version;
 import org.opensearch.common.annotation.PublicApi;
 import org.opensearch.core.common.io.stream.StreamInput;
 import org.opensearch.core.common.io.stream.StreamOutput;
@@ -18,17 +19,21 @@
 import org.opensearch.script.Script;
 
 import java.io.IOException;
+import java.util.Map;
 import java.util.Objects;
 
 /**
  * DerivedField representation: expects a name, type and script.
  */
 @PublicApi(since = "2.14.0")
 public class DerivedField implements Writeable, ToXContentFragment {
-
     private final String name;
     private final String type;
     private final Script script;
+    private String sourceIndexedField;
+    private Map<String, Object> properties;
+    private Boolean ignoreMalformed;
+    private String format;
 
     public DerivedField(String name, String type, Script script) {
         this.name = name;
@@ -40,20 +45,51 @@ public DerivedField(StreamInput in) throws IOException {
         name = in.readString();
         type = in.readString();
         script = new Script(in);
+        if (in.getVersion().onOrAfter(Version.V_2_15_0)) {
+            if (in.readBoolean()) {
+                properties = in.readMap();
+            }
+            sourceIndexedField = in.readOptionalString();
+            format = in.readOptionalString();
+            ignoreMalformed = in.readOptionalBoolean();
+        }
     }
 
     @Override
     public void writeTo(StreamOutput out) throws IOException {
+        // Write order mirrors the read order used by the StreamInput constructor.
         out.writeString(name);
         out.writeString(type);
         script.writeTo(out);
+        // The optional attributes are only written when the target stream version is 2.15.0 or later.
+        if (out.getVersion().onOrAfter(Version.V_2_15_0)) {
+            // A leading boolean flags whether a properties map follows.
+            if (properties == null) {
+                out.writeBoolean(false);
+            } else {
+                out.writeBoolean(true);
+                out.writeMap(properties);
+            }
+            out.writeOptionalString(sourceIndexedField);
+            out.writeOptionalString(format);
+            out.writeOptionalBoolean(ignoreMalformed);
+        }
     }
 
     @Override
     public XContentBuilder toXContent(XContentBuilder builder, ToXContent.Params params) throws IOException {
+        // Emits an object keyed by the derived field's name; "type" and "script" are always written.
         builder.startObject(name);
         builder.field("type", type);
         builder.field("script", script);
+        // Each optional attribute below is written only when it is non-null.
+        if (properties != null) {
+            builder.field("properties", properties);
+        }
+        if (sourceIndexedField != null) {
+            builder.field("source_indexed_field", sourceIndexedField);
+        }
+        if (format != null) {
+            builder.field("format", format);
+        }
+        if (ignoreMalformed != null) {
+            builder.field("ignore_malformed", ignoreMalformed);
+        }
         builder.endObject();
         return builder;
     }
@@ -70,9 +106,41 @@ public Script getScript() {
         return script;
     }
 
+    /**
+     * Returns the properties map attached to this derived field.
+     *
+     * @return the properties map; may be {@code null}
+     */
+    public Map<String, Object> getProperties() {
+        return this.properties;
+    }
+
+    /**
+     * Returns the name of the source indexed field associated with this derived field.
+     *
+     * @return the source indexed field name; may be {@code null}
+     */
+    public String getSourceIndexedField() {
+        return this.sourceIndexedField;
+    }
+
+    /**
+     * Returns the format configured for this derived field.
+     *
+     * @return the format string; may be {@code null}
+     */
+    public String getFormat() {
+        return this.format;
+    }
+
+    /**
+     * Returns whether malformed values should be ignored.
+     *
+     * @return {@code true} only when the flag was explicitly set to true; an unset (null) flag reads as {@code false}
+     */
+    public boolean getIgnoreMalformed() {
+        return ignoreMalformed != null && ignoreMalformed;
+    }
+
+    /**
+     * Sets the properties map of this derived field. The map reference is stored as-is (no copy is made).
+     *
+     * @param properties the properties map to store
+     */
+    public void setProperties(Map<String, Object> properties) {
+        this.properties = properties;
+    }
+
+    /**
+     * Sets the source indexed field associated with this derived field.
+     *
+     * @param sourceIndexedField the source indexed field name to store
+     */
+    public void setSourceIndexedField(String sourceIndexedField) {
+        this.sourceIndexedField = sourceIndexedField;
+    }
+
+    /**
+     * Sets the format of this derived field.
+     *
+     * @param format the format string to store
+     */
+    public void setFormat(String format) {
+        this.format = format;
+    }
+
+    /**
+     * Sets the ignore-malformed flag. The parameter is a primitive, so passing a {@code null}
+     * {@code Boolean} from a caller would fail on unboxing before this method runs.
+     *
+     * @param ignoreMalformed the flag value to store
+     */
+    public void setIgnoreMalformed(boolean ignoreMalformed) {
+        this.ignoreMalformed = ignoreMalformed;
+    }
+
     @Override
     public int hashCode() {
+        // Hashes the same seven fields that equals(Object) compares.
-        return Objects.hash(name, type, script);
+        return Objects.hash(name, type, script, sourceIndexedField, properties, ignoreMalformed, format);
     }
 
     @Override
@@ -84,7 +152,12 @@ public boolean equals(Object obj) {
             return false;
         }
         DerivedField other = (DerivedField) obj;
-        return Objects.equals(name, other.name) && Objects.equals(type, other.type) && Objects.equals(script, other.script);
+        return Objects.equals(name, other.name)
+            && Objects.equals(type, other.type)
+            && Objects.equals(script, other.script)
+            && Objects.equals(sourceIndexedField, other.sourceIndexedField)
+            && Objects.equals(properties, other.properties)
+            && Objects.equals(ignoreMalformed, other.ignoreMalformed)
+            && Objects.equals(format, other.format);
     }
-
 }
@@ -0,0 +1,181 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.index.mapper;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.ReaderUtil;
+import org.opensearch.common.Randomness;
+import org.opensearch.common.xcontent.XContentFactory;
+import org.opensearch.common.xcontent.json.JsonXContent;
+import org.opensearch.core.common.bytes.BytesReference;
+import org.opensearch.core.xcontent.XContentBuilder;
+import org.opensearch.search.lookup.SourceLookup;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+import java.util.Set;
+import java.util.TreeSet;
+
+/**
+ * This class performs type inference by analyzing the _source documents. It uses a random sample of documents to infer the field type, similar to dynamic mapping type guessing logic.
+ * Unlike guessing based on the first document, where field could be missing, this method generates a random sample to make a more accurate inference.
+ * This approach is especially useful for handling missing fields, which is common in nested fields within derived fields of object types.
+ *
+ * <p>The sample size should be chosen carefully to ensure a high probability of selecting at least one document where the field is present.
+ * However, it's essential to strike a balance because a large sample size can lead to performance issues since each sample document's _source field is loaded and examined until the field is found.
+ *
+ * <p>Determining the sample size ({@code S}) is akin to deciding how many balls to draw from a bin, ensuring a high probability ({@code >=P}) of drawing at least one green ball (documents with the field) from a mixture of {@code R } red balls (documents without the field) and {@code G } green balls:
+ * <pre>{@code
+ * 1 - C(R, S) / C(R + G, S) >= P
+ * }</pre>
+ * Here, {@code C()} represents the binomial coefficient.
+ * For a high confidence level, we aim for {@code P >= 0.95 }. For example, with {@code 10^7 } documents where the field is present in {@code 2% } of them, the sample size {@code S } should be around 149 to achieve a probability of {@code 0.95}.
+ */
+public class FieldTypeInference {
+    private static final int DEFAULT_SAMPLE_SIZE = 150;
+    private static final int MAX_SAMPLE_SIZE_ALLOWED = 1000;
+
+    private final IndexReader indexReader;
+    private final String indexName;
+    private final MapperService mapperService;
+    // TODO expose using a index setting
+    private int sampleSize;
+
+    /**
+     * Creates an inference helper that samples documents from the given reader.
+     * The sample size starts at {@code DEFAULT_SAMPLE_SIZE}.
+     *
+     * @param indexName     index name used when building the synthetic document that is parsed for inference
+     * @param mapperService provides the document mapper used to parse the synthetic document
+     * @param indexReader   reader whose documents are sampled
+     */
+    public FieldTypeInference(String indexName, MapperService mapperService, IndexReader indexReader) {
+        this.indexName = indexName;
+        this.mapperService = mapperService;
+        this.indexReader = indexReader;
+        this.sampleSize = DEFAULT_SAMPLE_SIZE;
+    }
+
+    /**
+     * Sets the number of documents to sample.
+     *
+     * @param sampleSize requested sample size; values up to and including {@code MAX_SAMPLE_SIZE_ALLOWED} are accepted
+     * @throws IllegalArgumentException if {@code sampleSize} exceeds {@code MAX_SAMPLE_SIZE_ALLOWED}
+     */
+    public void setSampleSize(int sampleSize) {
+        if (sampleSize > MAX_SAMPLE_SIZE_ALLOWED) {
+            // The maximum itself is accepted, so the message says "less than or equal to".
+            throw new IllegalArgumentException("sample_size should be less than or equal to " + MAX_SAMPLE_SIZE_ALLOWED);
+        }
+        this.sampleSize = sampleSize;
+    }
+
+    /**
+     * @return the currently configured sample size
+     */
+    public int getSampleSize() {
+        return sampleSize;
+    }
+
+    /**
+     * Infers a mapper by walking a random sample of documents and using the first sampled
+     * document for which the fetcher yields a value that produces a mapper.
+     *
+     * @param valueFetcher fetches the candidate field's values from a document's source
+     * @return the inferred mapper, or {@code null} if no sampled document produced one
+     * @throws IOException if parsing the synthetic document fails
+     */
+    public Mapper infer(ValueFetcher valueFetcher) throws IOException {
+        RandomSourceValuesGenerator valuesGenerator = new RandomSourceValuesGenerator(sampleSize, indexReader, valueFetcher);
+        Mapper inferredMapper = null;
+        while (inferredMapper == null && valuesGenerator.hasNext()) {
+            List<Object> values = valuesGenerator.next();
+            if (values == null || values.isEmpty()) {
+                continue;
+            }
+            // always use first value in case of multi value field to infer type
+            inferredMapper = inferTypeFromObject(values.get(0));
+        }
+        return inferredMapper;
+    }
+
+    /**
+     * Wraps the value in a one-field JSON document ({@code {"field": o}}), parses it with the
+     * document mapper and returns the mapper that the dynamic mapping update created for "field".
+     *
+     * @return the mapper for the value, or {@code null} when the value is null or parsing produced no dynamic mapping update
+     */
+    private Mapper inferTypeFromObject(Object o) throws IOException {
+        if (o == null) {
+            return null;
+        }
+        DocumentMapper mapper = mapperService.documentMapper();
+        XContentBuilder builder = XContentFactory.jsonBuilder().startObject().field("field", o).endObject();
+        BytesReference bytesReference = BytesReference.bytes(builder);
+        SourceToParse sourceToParse = new SourceToParse(indexName, "_id", bytesReference, JsonXContent.jsonXContent.mediaType());
+        ParsedDocument parsedDocument = mapper.parse(sourceToParse);
+        Mapping mapping = parsedDocument.dynamicMappingsUpdate();
+        // dynamicMappingsUpdate() is null when parsing introduced no new mappings; treat that as "nothing inferred"
+        // instead of failing with a NullPointerException.
+        if (mapping == null) {
+            return null;
+        }
+        return mapping.root.getMapper("field");
+    }
+
+    /**
+     * Iterates over the fetched values of a sorted random sample of documents.
+     * Doc ids are visited in ascending order, so leaves are only ever advanced forward.
+     */
+    private static class RandomSourceValuesGenerator implements Iterator<List<Object>> {
+        private static final int MAX_ATTEMPTS_TO_GENERATE_RANDOM_SAMPLES = 10000;
+
+        private final ValueFetcher valueFetcher;
+        private final IndexReader indexReader;
+        private final SourceLookup sourceLookup;
+        // sorted, distinct global doc ids to visit
+        private final int[] docs;
+        // index into docs of the next document to return
+        private int iter;
+        // index of the leaf the value fetcher is currently positioned on; -1 before the first leaf is set
+        private int leaf;
+
+        public RandomSourceValuesGenerator(int sampleSize, IndexReader indexReader, ValueFetcher valueFetcher) {
+            this.valueFetcher = valueFetcher;
+            this.indexReader = indexReader;
+            // Global doc ids range over [0, maxDoc); numDocs only counts live documents, so using it as the
+            // bound would never sample the tail of the id space once documents have been deleted.
+            int maxDoc = indexReader.maxDoc();
+            sampleSize = Math.min(sampleSize, maxDoc);
+            this.docs = getSortedRandomNum(sampleSize, maxDoc, Math.max(sampleSize, MAX_ATTEMPTS_TO_GENERATE_RANDOM_SAMPLES));
+            this.iter = 0;
+            this.leaf = -1;
+            this.sourceLookup = new SourceLookup();
+            if (hasNext()) {
+                setNextLeaf();
+            }
+        }
+
+        @Override
+        public boolean hasNext() {
+            return iter < docs.length && leaf < indexReader.leaves().size();
+        }
+
+        /**
+         * Ensure hasNext() is called before calling next()
+         */
+        @Override
+        public List<Object> next() {
+            int docID = docs[iter] - indexReader.leaves().get(leaf).docBase;
+            // Segment-local doc ids span [0, maxDoc) of the leaf. Comparing against numDocs (live docs only)
+            // would wrongly leave the current leaf whenever it contains deleted documents.
+            if (docID >= indexReader.leaves().get(leaf).reader().maxDoc()) {
+                setNextLeaf();
+            }
+            // deleted docs are getting used to infer type, which should be okay?
+            sourceLookup.setSegmentAndDocument(indexReader.leaves().get(leaf), docs[iter] - indexReader.leaves().get(leaf).docBase);
+            try {
+                iter++;
+                return valueFetcher.fetchValues(sourceLookup);
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        /**
+         * Positions the value fetcher on the leaf that contains the next sampled doc id.
+         */
+        private void setNextLeaf() {
+            // Every sampled id is below indexReader.maxDoc(), so subIndex always resolves to an existing leaf.
+            leaf = ReaderUtil.subIndex(docs[iter], indexReader.leaves());
+            valueFetcher.setNextReader(indexReader.leaves().get(leaf));
+        }
+
+        /**
+         * Returns up to {@code sampleSize} distinct numbers from {@code [0, upperBound)} in ascending order.
+         * When the range is small relative to the sample (at most 10x), the whole range is shuffled and
+         * truncated; otherwise numbers are drawn at random until enough distinct ones are collected or
+         * {@code attempts} draws have been made, so fewer than {@code sampleSize} numbers may be returned.
+         */
+        private static int[] getSortedRandomNum(int sampleSize, int upperBound, int attempts) {
+            Set<Integer> generatedNumbers = new TreeSet<>();
+            Random random = Randomness.get();
+            int itr = 0;
+            if (upperBound <= 10 * sampleSize) {
+                List<Integer> numberList = new ArrayList<>(upperBound);
+                for (int i = 0; i < upperBound; i++) {
+                    numberList.add(i);
+                }
+                Collections.shuffle(numberList, random);
+                generatedNumbers.addAll(numberList.subList(0, sampleSize));
+            } else {
+                while (generatedNumbers.size() < sampleSize && itr++ < attempts) {
+                    int randomNumber = random.nextInt(upperBound);
+                    generatedNumbers.add(randomNumber);
+                }
+            }
+            return generatedNumbers.stream().mapToInt(Integer::intValue).toArray();
+        }
+    }
+}
@@ -1004,6 +1004,37 @@ public SearchSourceBuilder derivedField(String name, String type, Script script)
         return this;
     }
 
+    /**
+     * Adds a derived field with the given name with provided type, script and other parameters
+     * @param name name of the derived field
+     * @param type type of the derived field
+     * @param script script associated with derived field
+     * @param properties map of field name and type of field for nested fields within object derived field
+     * @param sourceIndexedField source text field which is indexed to filter documents for better performance
+     * @param format date format
+     * @param ignoreMalformed ignores malformed fields instead of failing search request; when {@code null} the
+     *                        flag is left unset on the derived field
+     * @return this builder
+     */
+    public SearchSourceBuilder derivedField(
+        String name,
+        String type,
+        Script script,
+        Map<String, Object> properties,
+        String sourceIndexedField,
+        String format,
+        Boolean ignoreMalformed
+    ) {
+        if (derivedFields == null) {
+            derivedFields = new ArrayList<>();
+        }
+        DerivedField derivedField = new DerivedField(name, type, script);
+        derivedField.setProperties(properties);
+        derivedField.setSourceIndexedField(sourceIndexedField);
+        derivedField.setFormat(format);
+        // setIgnoreMalformed takes a primitive boolean, so passing a null Boolean would throw a
+        // NullPointerException on auto-unboxing; leave the flag unset in that case.
+        if (ignoreMalformed != null) {
+            derivedField.setIgnoreMalformed(ignoreMalformed);
+        }
+        derivedFields.add(derivedField);
+        return this;
+    }
+
     /**
      * Sets the boost a specific index or alias will receive when the query is executed
      * against it.