[8.x] Store arrays offsets for keyword fields natively with synthetic source

Backporting elastic#113757 to the 8.x branch.

The keyword doc values field gets an extra sorted doc values field that encodes the order in which array values were specified at index time, including duplicate values. This is stored as an offset-to-ordinal array that is zigzag vint encoded into a sorted doc values field.

For example, given the following string array for a keyword field: ["c", "b", "a", "c"].
Sorted set doc values: ["a", "b", "c"] with ordinals 0, 1, and 2. The offset array will be: [2, 1, 0, 2].

Null values are also supported. For example, ["c", "b", null, "c"] results in sorted set doc values ["b", "c"] with ordinals 0 and 1. The offset array will be: [1, 0, -1, 1].

Empty arrays are also supported by encoding a zigzag vint array of zero elements.
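
To make the encoding concrete, here is a minimal standalone sketch of how such an offset-to-ordinal array can be derived. The class and method names are hypothetical; the actual implementation lives in FieldArrayContext in the diff below.

import java.util.ArrayList;
import java.util.List;
import java.util.TreeMap;

class OffsetArraySketch {

    // For each array position, compute the ordinal of its value in the sorted
    // unique value set, or -1 for null.
    static int[] offsetToOrd(String[] values) {
        // TreeMap keeps values sorted, matching SortedSetDocValues ordinal order.
        TreeMap<String, List<Integer>> valueToOffsets = new TreeMap<>();
        List<Integer> nullOffsets = new ArrayList<>();
        for (int offset = 0; offset < values.length; offset++) {
            if (values[offset] == null) {
                nullOffsets.add(offset);
            } else {
                valueToOffsets.computeIfAbsent(values[offset], v -> new ArrayList<>()).add(offset);
            }
        }
        int[] offsetToOrd = new int[values.length];
        int ord = 0;
        for (List<Integer> offsets : valueToOffsets.values()) {
            for (int offset : offsets) {
                offsetToOrd[offset] = ord;
            }
            ord++;
        }
        for (int offset : nullOffsets) {
            offsetToOrd[offset] = -1;
        }
        return offsetToOrd;
    }

    public static void main(String[] args) {
        // ["c", "b", "a", "c"] -> [2, 1, 0, 2]
        System.out.println(java.util.Arrays.toString(offsetToOrd(new String[] { "c", "b", "a", "c" })));
        // ["c", "b", null, "c"] -> [1, 0, -1, 1]
        System.out.println(java.util.Arrays.toString(offsetToOrd(new String[] { "c", "b", null, "c" })));
    }
}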

Limitations:

- Currently, only doc-values-based array support exists for the keyword field mapper.
- Multi-level leaf arrays are flattened. For example: [[b], [c]] -> [b, c].
- Arrays are always synthesized as one type. In the case of a keyword field, [1, 2] gets synthesized as ["1", "2"].

These limitations can be addressed, but some require more complexity and/or additional storage.

With this PR, keyword field arrays are no longer stored in ignored source; instead, array offsets are tracked in an adjacent sorted doc values field. This only applies if index.mapping.synthetic_source_keep is set to arrays (the default for logsdb).
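
On the read side, synthetic source can rebuild the original leaf array from the sorted doc values and the decoded offset array. A minimal sketch, using a hypothetical helper rather than the actual synthesizer:

class SyntheticArraySketch {

    // Rebuild the original leaf array from the sorted unique values and the
    // decoded offset-to-ordinal array; ordinal -1 maps back to a null element.
    static String[] synthesizeArray(String[] sortedValues, int[] offsetToOrd) {
        String[] original = new String[offsetToOrd.length];
        for (int offset = 0; offset < offsetToOrd.length; offset++) {
            int ord = offsetToOrd[offset];
            original[offset] = ord == -1 ? null : sortedValues[ord];
        }
        return original;
    }

    public static void main(String[] args) {
        // ["a", "b", "c"] with offsets [2, 1, 0, 2] -> ["c", "b", "a", "c"]
        String[] result = synthesizeArray(new String[] { "a", "b", "c" }, new int[] { 2, 1, 0, 2 });
        System.out.println(java.util.Arrays.toString(result));
    }
}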
martijnvg committed Feb 20, 2025
1 parent 1420de7 commit 43ddfc9
Showing 19 changed files with 1,095 additions and 33 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/113757.yaml
@@ -0,0 +1,5 @@
pr: 113757
summary: Store arrays offsets for keyword fields natively with synthetic source instead of falling back to ignored source.
area: Mapping
type: enhancement
issues: []
4 changes: 4 additions & 0 deletions rest-api-spec/build.gradle
@@ -276,4 +276,8 @@ tasks.named("yamlRestTestV7CompatTransform").configure({ task ->
task.skipTest("search.vectors/130_knn_query_nested_search/nested kNN search inner_hits size > 1", "waiting for #118774 backport")
task.skipTest("search.vectors/110_knn_query_with_filter/PRE_FILTER: pre-filter across multiple aliases", "waiting for #118774 backport")
task.skipTest("search.vectors/160_knn_query_missing_params/kNN search in a dis_max query - missing num_candidates", "waiting for #118774 backport")
task.skipTest("logsdb/10_settings/routing path allowed in logs mode with routing on sort fields", "Unknown feature routing.logsb_route_on_sort_fields")
task.skipTest("indices.create/21_synthetic_source_stored/index param - field ordering", "Synthetic source keep arrays now stores leaf arrays natively")
task.skipTest("indices.create/21_synthetic_source_stored/field param - keep nested array", "Synthetic source keep arrays now stores leaf arrays natively")
task.skipTest("indices.create/21_synthetic_source_stored/field param - keep root array", "Synthetic source keep arrays now stores leaf arrays natively")
})
@@ -982,7 +982,7 @@ subobjects auto:
- match: { hits.hits.0._source.foo: 10 }
- match: { hits.hits.0._source.foo\.bar: 100 }
- match: { hits.hits.0._source.regular.span.id: "1" }
- match: { hits.hits.0._source.regular.trace.id: [ "a", "b" ] }
- match: { hits.hits.0._source.regular.trace.id: ["a", "b" ] }
- match: { hits.hits.1._source.id: 2 }
- match: { hits.hits.1._source.foo: 20 }
- match: { hits.hits.1._source.foo\.bar: 200 }
@@ -1091,7 +1091,7 @@ index param - field ordering:
index: test

- length: { hits.hits.0._source: 4 }
- match: { hits.hits.0._source: { "a": "2", "b": [ { "bb": 100, "aa": 200 }, { "aa": 300, "bb": 400 } ], "c": [30, 20, 10], "d": [ { "bb": 10, "aa": 20 }, { "aa": 30, "bb": 40 } ] } }
- match: { hits.hits.0._source: { "a": "2", "b": [ { "bb": 100, "aa": 200 }, { "aa": 300, "bb": 400 } ], "c": ["30", "20", "10"], "d": [ { "bb": 10, "aa": 20 }, { "aa": 30, "bb": 40 } ] } }


---
@@ -127,6 +127,7 @@ private static IndexVersion def(int id, Version luceneVersion) {
public static final IndexVersion INFERENCE_METADATA_FIELDS_BACKPORT = def(8_524_0_00, Version.LUCENE_9_12_1);
public static final IndexVersion LOGSB_OPTIONAL_SORTING_ON_HOST_NAME_BACKPORT = def(8_525_0_00, Version.LUCENE_9_12_1);
public static final IndexVersion USE_SYNTHETIC_SOURCE_FOR_RECOVERY_BY_DEFAULT_BACKPORT = def(8_526_0_00, Version.LUCENE_9_12_1);
public static final IndexVersion SYNTHETIC_SOURCE_STORE_ARRAYS_NATIVELY_KEYWORD = def(8_527_0_00, Version.LUCENE_9_12_1);
/*
* STOP! READ THIS FIRST! No, really,
* ____ _____ ___ ____ _ ____ _____ _ ____ _____ _ _ ___ ____ _____ ___ ____ ____ _____ _
@@ -154,6 +154,7 @@ private void internalParseDocument(MetadataFieldMapper[] metadataFieldsMappers,

executeIndexTimeScripts(context);

context.processArrayOffsets(context);
for (MetadataFieldMapper metadataMapper : metadataFieldsMappers) {
metadataMapper.postParse(context);
}
@@ -519,6 +520,7 @@ private static void throwOnCopyToOnObject(Mapper mapper, List<String> copyToFiel

private static void parseObject(final DocumentParserContext context, String currentFieldName) throws IOException {
assert currentFieldName != null;
context.setImmediateXContentParent(context.parser().currentToken());
Mapper objectMapper = context.getMapper(currentFieldName);
if (objectMapper != null) {
doParseObject(context, currentFieldName, objectMapper);
@@ -611,6 +613,12 @@ private static void throwOnCreateDynamicNestedViaCopyTo(Mapper dynamicObjectMapp
}

private static void parseArray(DocumentParserContext context, String lastFieldName) throws IOException {
// Record the previous immediate parent, so that it can be restored after the array has been parsed.
// This is for recording array offsets with synthetic source. Offsets can only be accounted for
// accurately if the immediate parent is an array.
var prev = context.getImmediateXContentParent();
context.setImmediateXContentParent(context.parser().currentToken());

Mapper mapper = getLeafMapper(context, lastFieldName);
if (mapper != null) {
// There is a concrete mapper for this field already. Need to check if the mapper
@@ -624,6 +632,8 @@
} else {
parseArrayDynamic(context, lastFieldName);
}
// Reset previous immediate parent
context.setImmediateXContentParent(prev);
}

private static void parseArrayDynamic(DocumentParserContext context, String currentFieldName) throws IOException {
@@ -688,11 +698,12 @@ private static void parseNonDynamicArray(
final String lastFieldName,
String arrayFieldName
) throws IOException {
boolean supportStoringArrayOffsets = mapper != null && mapper.supportStoringArrayOffsets();
String fullPath = context.path().pathAsText(arrayFieldName);

// Check if we need to record the array source. This only applies to synthetic source.
boolean canRemoveSingleLeafElement = false;
if (context.canAddIgnoredField()) {
if (context.canAddIgnoredField() && supportStoringArrayOffsets == false) {
Mapper.SourceKeepMode mode = Mapper.SourceKeepMode.NONE;
boolean objectWithFallbackSyntheticSource = false;
if (mapper instanceof ObjectMapper objectMapper) {
Expand Down Expand Up @@ -736,6 +747,7 @@ private static void parseNonDynamicArray(

XContentParser parser = context.parser();
XContentParser.Token token;
XContentParser.Token previousToken = parser.currentToken();
int elements = 0;
while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
if (token == XContentParser.Token.START_OBJECT) {
Expand All @@ -754,6 +766,14 @@ private static void parseNonDynamicArray(
elements++;
parseValue(context, lastFieldName);
}
previousToken = token;
}
if (mapper != null
&& context.canAddIgnoredField()
&& mapper.supportStoringArrayOffsets()
&& previousToken == XContentParser.Token.START_ARRAY
&& context.isImmediateParentAnArray()) {
context.getOffSetContext().maybeRecordEmptyArray(mapper.getOffsetFieldName());
}
if (elements <= 1 && canRemoveSingleLeafElement) {
context.removeLastIgnoredField(fullPath);
@@ -91,6 +91,31 @@ public LuceneDocument doc() {
protected void addDoc(LuceneDocument doc) {
in.addDoc(doc);
}

@Override
public void processArrayOffsets(DocumentParserContext context) throws IOException {
in.processArrayOffsets(context);
}

@Override
public FieldArrayContext getOffSetContext() {
return in.getOffSetContext();
}

@Override
public void setImmediateXContentParent(XContentParser.Token token) {
in.setImmediateXContentParent(token);
}

@Override
public XContentParser.Token getImmediateXContentParent() {
return in.getImmediateXContentParent();
}

@Override
public boolean isImmediateParentAnArray() {
return in.isImmediateParentAnArray();
}
}

/**
@@ -141,6 +166,8 @@ private enum Scope {
private final SeqNoFieldMapper.SequenceIDFields seqID;
private final Set<String> fieldsAppliedFromTemplates;

private FieldArrayContext fieldArrayContext;

/**
* Fields that are copied from values of other fields via copy_to.
* This per-document state is needed since it is possible
@@ -460,6 +487,33 @@ public boolean isCopyToDestinationField(String name) {
return copyToFields.contains(name);
}

public void processArrayOffsets(DocumentParserContext context) throws IOException {
if (fieldArrayContext != null) {
fieldArrayContext.addToLuceneDocument(context);
}
}

public FieldArrayContext getOffSetContext() {
if (fieldArrayContext == null) {
fieldArrayContext = new FieldArrayContext();
}
return fieldArrayContext;
}

private XContentParser.Token lastSetToken;

public void setImmediateXContentParent(XContentParser.Token token) {
this.lastSetToken = token;
}

public XContentParser.Token getImmediateXContentParent() {
return lastSetToken;
}

public boolean isImmediateParentAnArray() {
return lastSetToken == XContentParser.Token.START_ARRAY;
}

/**
* Add a new mapper dynamically created while parsing.
*
@@ -0,0 +1,93 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.index.mapper;

import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.util.BitUtil;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.io.stream.StreamInput;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

public class FieldArrayContext {

private final Map<String, Offsets> offsetsPerField = new HashMap<>();

void recordOffset(String field, String value) {
Offsets arrayOffsets = offsetsPerField.computeIfAbsent(field, k -> new Offsets());
int nextOffset = arrayOffsets.currentOffset++;
var offsets = arrayOffsets.valueToOffsets.computeIfAbsent(value, s -> new ArrayList<>(2));
offsets.add(nextOffset);
}

void recordNull(String field) {
Offsets arrayOffsets = offsetsPerField.computeIfAbsent(field, k -> new Offsets());
int nextOffset = arrayOffsets.currentOffset++;
arrayOffsets.nullValueOffsets.add(nextOffset);
}

void maybeRecordEmptyArray(String field) {
offsetsPerField.computeIfAbsent(field, k -> new Offsets());
}

void addToLuceneDocument(DocumentParserContext context) throws IOException {
for (var entry : offsetsPerField.entrySet()) {
var fieldName = entry.getKey();
var offset = entry.getValue();

int currentOrd = 0;
// This array preserves the original ordering of elements in leaf arrays, including duplicates.
int[] offsetToOrd = new int[offset.currentOffset];
for (var offsetEntry : offset.valueToOffsets.entrySet()) {
for (var offsetAndLevel : offsetEntry.getValue()) {
offsetToOrd[offsetAndLevel] = currentOrd;
}
currentOrd++;
}
for (var nullOffset : offset.nullValueOffsets) {
offsetToOrd[nullOffset] = -1;
}

try (var streamOutput = new BytesStreamOutput()) {
// Could just use vint for array length, but this allows for decoding my_field: null as -1
streamOutput.writeVInt(BitUtil.zigZagEncode(offsetToOrd.length));
for (int ord : offsetToOrd) {
streamOutput.writeVInt(BitUtil.zigZagEncode(ord));
}
context.doc().add(new SortedDocValuesField(fieldName, streamOutput.bytes().toBytesRef()));
}
}
}

static int[] parseOffsetArray(StreamInput in) throws IOException {
int[] offsetToOrd = new int[BitUtil.zigZagDecode(in.readVInt())];
for (int i = 0; i < offsetToOrd.length; i++) {
offsetToOrd[i] = BitUtil.zigZagDecode(in.readVInt());
}
return offsetToOrd;
}

private static class Offsets {

int currentOffset;
// A TreeMap is used so that values are kept in sorted order, matching the order in which
// values are stored in SortedSetDocValues. This way the ordinals assigned while building
// the offset-to-ordinal array line up with the doc values ordinals.
final Map<String, List<Integer>> valueToOffsets = new TreeMap<>();
final List<Integer> nullValueOffsets = new ArrayList<>(2);

}

}
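
As a sanity check on the encoding, the following round-trip sketch writes an offset array the same way addToLuceneDocument does and reads it back with parseOffsetArray. It assumes the code lives in the org.elasticsearch.index.mapper package (parseOffsetArray is package-private); the class and method names are hypothetical.

package org.elasticsearch.index.mapper;

import org.apache.lucene.util.BitUtil;
import org.elasticsearch.common.io.stream.BytesStreamOutput;

import java.io.IOException;
import java.util.Arrays;

class OffsetArrayRoundTrip {

    // Encode an offset-to-ordinal array as a zigzag vint stream (length first,
    // then one entry per array position), then decode it with parseOffsetArray.
    static int[] roundTrip(int[] offsetToOrd) throws IOException {
        try (var out = new BytesStreamOutput()) {
            out.writeVInt(BitUtil.zigZagEncode(offsetToOrd.length));
            for (int ord : offsetToOrd) {
                out.writeVInt(BitUtil.zigZagEncode(ord));
            }
            return FieldArrayContext.parseOffsetArray(out.bytes().streamInput());
        }
    }

    public static void main(String[] args) throws IOException {
        // [1, 0, -1, 1] encodes ["c", "b", null, "c"]; zigzag keeps -1 compact.
        System.out.println(Arrays.toString(roundTrip(new int[] { 1, 0, -1, 1 })));
    }
}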
@@ -201,15 +201,15 @@ public void parse(DocumentParserContext context) throws IOException {
}
}

private void doParseMultiFields(DocumentParserContext context) throws IOException {
protected void doParseMultiFields(DocumentParserContext context) throws IOException {
context.path().add(leafName());
for (FieldMapper mapper : builderParams.multiFields.mappers) {
mapper.parse(context);
}
context.path().remove();
}

private static void throwIndexingWithScriptParam() {
protected static void throwIndexingWithScriptParam() {
throw new IllegalArgumentException("Cannot index data directly into a field with a [script] parameter");
}
