-
Notifications
You must be signed in to change notification settings - Fork 25.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Store array offsets for keyword fields natively with synthetic source (
#113757) The keyword doc values field gets an extra sorted doc values field, that encodes the order of how array values were specified at index time. This also captures duplicate values. This is stored in an offset to ordinal array that gets zigzag vint encoded into a sorted doc values field. For example, in case of the following string array for a keyword field: ["c", "b", "a", "c"]. Sorted set doc values: ["a", "b", "c"] with ordinals: 0, 1 and 2. The offset array will be: [2, 1, 0, 2] Null values are also supported. For example ["c", "b", null, "c"] results into sorted set doc values: ["b", "c"] with ordinals: 0 and 1. The offset array will be: [1, 0, -1, 1] Empty arrays are also supported by encoding a zigzag vint array of zero elements. Limitations: currently only doc values based array support for keyword field mapper. multi level leaf arrays are flattened. For example: [[b], [c]] -> [b, c] arrays are always synthesized as one type. In case of keyword field, [1, 2] gets synthesized as ["1", "2"]. These limitations can be addressed, but some require more complexity and or additional storage. With this PR, keyword field array will no longer be stored in ignored source, but array offsets are kept track of in an adjacent sorted doc value field. This only applies if index.mapping.synthetic_source_keep is set to arrays (default for logsdb).
- Loading branch information
Showing
19 changed files
with
1,106 additions
and
33 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
pr: 113757 | ||
summary: Store array offsets for keyword fields natively with synthetic source instead of falling back to ignored source.
area: Mapping | ||
type: enhancement | ||
issues: [] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
93 changes: 93 additions & 0 deletions
93
server/src/main/java/org/elasticsearch/index/mapper/FieldArrayContext.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
/* | ||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
* or more contributor license agreements. Licensed under the "Elastic License | ||
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side | ||
* Public License v 1"; you may not use this file except in compliance with, at | ||
* your election, the "Elastic License 2.0", the "GNU Affero General Public | ||
* License v3.0 only", or the "Server Side Public License, v 1". | ||
*/ | ||
|
||
package org.elasticsearch.index.mapper; | ||
|
||
import org.apache.lucene.document.SortedDocValuesField; | ||
import org.apache.lucene.util.BitUtil; | ||
import org.elasticsearch.common.io.stream.BytesStreamOutput; | ||
import org.elasticsearch.common.io.stream.StreamInput; | ||
|
||
import java.io.IOException; | ||
import java.util.ArrayList; | ||
import java.util.HashMap; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.TreeMap; | ||
|
||
public class FieldArrayContext { | ||
|
||
private final Map<String, Offsets> offsetsPerField = new HashMap<>(); | ||
|
||
void recordOffset(String field, String value) { | ||
Offsets arrayOffsets = offsetsPerField.computeIfAbsent(field, k -> new Offsets()); | ||
int nextOffset = arrayOffsets.currentOffset++; | ||
var offsets = arrayOffsets.valueToOffsets.computeIfAbsent(value, s -> new ArrayList<>(2)); | ||
offsets.add(nextOffset); | ||
} | ||
|
||
void recordNull(String field) { | ||
Offsets arrayOffsets = offsetsPerField.computeIfAbsent(field, k -> new Offsets()); | ||
int nextOffset = arrayOffsets.currentOffset++; | ||
arrayOffsets.nullValueOffsets.add(nextOffset); | ||
} | ||
|
||
void maybeRecordEmptyArray(String field) { | ||
offsetsPerField.computeIfAbsent(field, k -> new Offsets()); | ||
} | ||
|
||
void addToLuceneDocument(DocumentParserContext context) throws IOException { | ||
for (var entry : offsetsPerField.entrySet()) { | ||
var fieldName = entry.getKey(); | ||
var offset = entry.getValue(); | ||
|
||
int currentOrd = 0; | ||
// This array allows to retain the original ordering of elements in leaf arrays and retain duplicates. | ||
int[] offsetToOrd = new int[offset.currentOffset]; | ||
for (var offsetEntry : offset.valueToOffsets.entrySet()) { | ||
for (var offsetAndLevel : offsetEntry.getValue()) { | ||
offsetToOrd[offsetAndLevel] = currentOrd; | ||
} | ||
currentOrd++; | ||
} | ||
for (var nullOffset : offset.nullValueOffsets) { | ||
offsetToOrd[nullOffset] = -1; | ||
} | ||
|
||
try (var streamOutput = new BytesStreamOutput()) { | ||
// Could just use vint for array length, but this allows for decoding my_field: null as -1 | ||
streamOutput.writeVInt(BitUtil.zigZagEncode(offsetToOrd.length)); | ||
for (int ord : offsetToOrd) { | ||
streamOutput.writeVInt(BitUtil.zigZagEncode(ord)); | ||
} | ||
context.doc().add(new SortedDocValuesField(fieldName, streamOutput.bytes().toBytesRef())); | ||
} | ||
} | ||
} | ||
|
||
static int[] parseOffsetArray(StreamInput in) throws IOException { | ||
int[] offsetToOrd = new int[BitUtil.zigZagDecode(in.readVInt())]; | ||
for (int i = 0; i < offsetToOrd.length; i++) { | ||
offsetToOrd[i] = BitUtil.zigZagDecode(in.readVInt()); | ||
} | ||
return offsetToOrd; | ||
} | ||
|
||
private static class Offsets { | ||
|
||
int currentOffset; | ||
// Need to use TreeMap here, so that we maintain the order in which each value (with offset) stored inserted, | ||
// (which is in the same order the document gets parsed) so we store offsets in right order. This is the same | ||
// order in what the values get stored in SortedSetDocValues. | ||
final Map<String, List<Integer>> valueToOffsets = new TreeMap<>(); | ||
final List<Integer> nullValueOffsets = new ArrayList<>(2); | ||
|
||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.