From ee085be46c1261f8be94ba0fe677d3ff7fe27302 Mon Sep 17 00:00:00 2001 From: Priyen Patel Date: Fri, 9 Jun 2023 11:48:47 -0700 Subject: [PATCH 01/21] realtime ingestion pre-aggregation for distinct count hll and big decimal --- .../core/common/ObjectSerDeUtilsTest.java | 38 ++ .../DistinctCountHLLValueAggregator.java | 37 +- .../SumPrecisionValueAggregator.java | 33 ++ .../local/aggregator/SumValueAggregator.java | 6 + .../aggregator/ValueAggregatorFactory.java | 9 +- .../mutable/MutableSegmentImpl.java | 43 ++- .../FixedByteSVMutableForwardIndex.java | 40 ++- .../index/forward/ForwardIndexType.java | 20 +- .../v2/builder/BaseSingleTreeBuilder.java | 4 +- .../segment/local/utils/TableConfigUtils.java | 81 ++++- ...leSegmentImplIngestionAggregationTest.java | 330 +++++++++++++++++- .../FixedByteSVMutableForwardIndexTest.java | 70 ++++ .../local/utils/TableConfigUtilsTest.java | 76 +++- .../index/mutable/MutableForwardIndex.java | 12 +- .../org/apache/pinot/spi/data/FieldSpec.java | 2 +- .../pinot/spi/utils/BigDecimalUtils.java | 41 +++ .../pinot/spi/utils/BigDecimalUtilsTest.java | 72 +++- 17 files changed, 843 insertions(+), 71 deletions(-) diff --git a/pinot-core/src/test/java/org/apache/pinot/core/common/ObjectSerDeUtilsTest.java b/pinot-core/src/test/java/org/apache/pinot/core/common/ObjectSerDeUtilsTest.java index ccd9ae3f6fd8..2b5126027932 100644 --- a/pinot-core/src/test/java/org/apache/pinot/core/common/ObjectSerDeUtilsTest.java +++ b/pinot-core/src/test/java/org/apache/pinot/core/common/ObjectSerDeUtilsTest.java @@ -27,6 +27,8 @@ import it.unimi.dsi.fastutil.ints.IntOpenHashSet; import it.unimi.dsi.fastutil.ints.IntSet; import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap; +import java.math.BigDecimal; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -44,6 +46,8 @@ import org.apache.pinot.segment.local.customobject.QuantileDigest; import org.apache.pinot.segment.local.customobject.StringLongPair; 
import org.apache.pinot.segment.local.customobject.ValueLongPair; +import org.apache.pinot.spi.utils.BigDecimalUtils; +import org.testng.Assert; import org.testng.annotations.Test; import static org.testng.Assert.assertEquals; @@ -374,4 +378,38 @@ public void testDouble2LongMap() { assertEquals(actual, expected, ERROR_MESSAGE); } } + + @Test + public void testBigDecimalWithMaximumPrecisionSizeInBytes() { + Assert.assertEquals(BigDecimalUtils.byteSizeForFixedPrecision(18), 10); + Assert.assertEquals(BigDecimalUtils.byteSizeForFixedPrecision(32), 16); + Assert.assertEquals(BigDecimalUtils.byteSizeForFixedPrecision(38), 18); + } + + @Test + public void testBigDecimalSerializationWithSize() { + ArrayList bigDecimals = new ArrayList<>(); + bigDecimals.add(new BigDecimal("1000.123456")); + bigDecimals.add(new BigDecimal("1237663")); + bigDecimals.add(new BigDecimal("0.114141622")); + + for (BigDecimal bigDecimal : bigDecimals) { + int bytesNeeded = BigDecimalUtils.byteSize(bigDecimal); + + // Serialize big decimal equal to the target size + byte[] bytes = BigDecimalUtils.serializeWithSize(bigDecimal, bytesNeeded); + BigDecimal bigDecimalDeserialized = BigDecimalUtils.deserialize(bytes); + Assert.assertEquals(bigDecimalDeserialized, bigDecimal); + + // Serialize big decimal smaller than the target size + bytes = BigDecimalUtils.serializeWithSize(bigDecimal, bytesNeeded + 2); + bigDecimalDeserialized = BigDecimalUtils.deserialize(bytes); + Assert.assertEquals(bigDecimalDeserialized, bigDecimal); + + // Raise exception when trying to serialize a big decimal larger than target size + Assert.assertThrows(IllegalArgumentException.class, () -> { + BigDecimalUtils.serializeWithSize(bigDecimal, bytesNeeded - 4); + }); + } + } } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java index 
2cf26a3ac14f..c0412f20e19c 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java @@ -20,6 +20,11 @@ import com.clearspring.analytics.stream.cardinality.CardinalityMergeException; import com.clearspring.analytics.stream.cardinality.HyperLogLog; +import com.google.common.base.Preconditions; +import java.io.IOException; +import java.util.List; +import org.apache.commons.lang3.StringUtils; +import org.apache.pinot.common.request.context.ExpressionContext; import org.apache.pinot.segment.local.utils.CustomSerDeUtils; import org.apache.pinot.segment.spi.AggregationFunctionType; import org.apache.pinot.spi.data.FieldSpec.DataType; @@ -28,11 +33,31 @@ public class DistinctCountHLLValueAggregator implements ValueAggregator { public static final DataType AGGREGATED_VALUE_TYPE = DataType.BYTES; - private static final int DEFAULT_LOG2M_BYTE_SIZE = 180; - + private int _log2m = CommonConstants.Helix.DEFAULT_HYPERLOGLOG_LOG2M; + private int _log2MByteSize = 180; // Byte size won't change once we get the initial aggregated value private int _maxByteSize; + public DistinctCountHLLValueAggregator() { + } + + public DistinctCountHLLValueAggregator(List arguments) { + // length 1 means we use the default _log2m of 8 + if (arguments.size() <= 1) { + return; + } + + String log2mLiteral = arguments.get(1).getLiteral().getStringValue(); + Preconditions.checkState(StringUtils.isNumeric(log2mLiteral), "log2m argument must be a numeric literal"); + + _log2m = Integer.parseInt(log2mLiteral); + try { + _log2MByteSize = (new HyperLogLog(_log2m)).getBytes().length; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + @Override public AggregationFunctionType getAggregationType() { return AggregationFunctionType.DISTINCTCOUNTHLL; @@ -51,10 +76,9 @@ public HyperLogLog 
getInitialAggregatedValue(Object rawValue) { initialValue = deserializeAggregatedValue(bytes); _maxByteSize = Math.max(_maxByteSize, bytes.length); } else { - // TODO: Handle configurable log2m for StarTreeBuilder - initialValue = new HyperLogLog(CommonConstants.Helix.DEFAULT_HYPERLOGLOG_LOG2M); + initialValue = new HyperLogLog(_log2m); initialValue.offer(rawValue); - _maxByteSize = Math.max(_maxByteSize, DEFAULT_LOG2M_BYTE_SIZE); + _maxByteSize = Math.max(_maxByteSize, _log2MByteSize); } return initialValue; } @@ -100,6 +124,9 @@ public byte[] serializeAggregatedValue(HyperLogLog value) { @Override public HyperLogLog deserializeAggregatedValue(byte[] bytes) { + if (bytes == null || bytes.length == 0) { + return new HyperLogLog(_log2m); + } return CustomSerDeUtils.HYPER_LOG_LOG_SER_DE.deserialize(bytes); } } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumPrecisionValueAggregator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumPrecisionValueAggregator.java index 2aab0deaad93..021e931872f6 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumPrecisionValueAggregator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumPrecisionValueAggregator.java @@ -18,7 +18,11 @@ */ package org.apache.pinot.segment.local.aggregator; +import com.google.common.base.Preconditions; import java.math.BigDecimal; +import java.util.List; +import org.apache.commons.lang3.StringUtils; +import org.apache.pinot.common.request.context.ExpressionContext; import org.apache.pinot.segment.spi.AggregationFunctionType; import org.apache.pinot.spi.data.FieldSpec.DataType; import org.apache.pinot.spi.utils.BigDecimalUtils; @@ -28,6 +32,29 @@ public class SumPrecisionValueAggregator implements ValueAggregator arguments) { + // length 1 means we don't have any caps on maximum precision nor do we have a fixed size then + if (arguments.size() <= 1) 
{ + return; + } + + String precision = arguments.get(1).getLiteral().getStringValue(); + Preconditions.checkState(StringUtils.isNumeric(precision), "precision must be a numeric literal"); + + _fixedSize = BigDecimalUtils.byteSizeForFixedPrecision(Integer.parseInt(precision)); + } @Override public AggregationFunctionType getAggregationType() { @@ -78,11 +105,17 @@ public BigDecimal cloneAggregatedValue(BigDecimal value) { @Override public int getMaxAggregatedValueByteSize() { + if (_fixedSize > 0) { + return _fixedSize; + } return _maxByteSize; } @Override public byte[] serializeAggregatedValue(BigDecimal value) { + if (_fixedSize > 0) { + return BigDecimalUtils.serializeWithSize(value, _fixedSize); + } return BigDecimalUtils.serialize(value); } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumValueAggregator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumValueAggregator.java index 429b464c5489..23863b4f9523 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumValueAggregator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumValueAggregator.java @@ -37,11 +37,17 @@ public DataType getAggregatedValueType() { @Override public Double getInitialAggregatedValue(Number rawValue) { + if (rawValue == null) { + return 0.0; + } return rawValue.doubleValue(); } @Override public Double applyRawValue(Double value, Number rawValue) { + if (rawValue == null) { + return value; + } return value + rawValue.doubleValue(); } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/ValueAggregatorFactory.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/ValueAggregatorFactory.java index b4f90c4952de..b348b1ff4c87 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/ValueAggregatorFactory.java +++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/ValueAggregatorFactory.java @@ -18,7 +18,9 @@ */ package org.apache.pinot.segment.local.aggregator; +import java.util.List; import org.apache.datasketches.tuple.aninteger.IntegerSummary; +import org.apache.pinot.common.request.context.ExpressionContext; import org.apache.pinot.segment.spi.AggregationFunctionType; import org.apache.pinot.spi.data.FieldSpec.DataType; @@ -37,7 +39,8 @@ private ValueAggregatorFactory() { * @param aggregationType Aggregation type * @return Value aggregator */ - public static ValueAggregator getValueAggregator(AggregationFunctionType aggregationType) { + public static ValueAggregator getValueAggregator(AggregationFunctionType aggregationType, + List arguments) { switch (aggregationType) { case COUNT: return new CountValueAggregator(); @@ -48,7 +51,7 @@ public static ValueAggregator getValueAggregator(AggregationFunctionType aggrega case SUM: return new SumValueAggregator(); case SUMPRECISION: - return new SumPrecisionValueAggregator(); + return new SumPrecisionValueAggregator(arguments); case AVG: return new AvgValueAggregator(); case MINMAXRANGE: @@ -57,7 +60,7 @@ public static ValueAggregator getValueAggregator(AggregationFunctionType aggrega return new DistinctCountBitmapValueAggregator(); case DISTINCTCOUNTHLL: case DISTINCTCOUNTRAWHLL: - return new DistinctCountHLLValueAggregator(); + return new DistinctCountHLLValueAggregator(arguments); case PERCENTILEEST: case PERCENTILERAWEST: return new PercentileEstValueAggregator(); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java index 2c6885ace5b8..2317fcec5596 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java +++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java @@ -257,6 +257,14 @@ public boolean isMutableSegment() { // Initialize for each column for (FieldSpec fieldSpec : _physicalFieldSpecs) { + String fieldSpecName = fieldSpec.getName(); + if (metricsAggregators.containsKey(fieldSpecName)) { + int maxLength = metricsAggregators.get(fieldSpecName).getRight().getMaxAggregatedValueByteSize(); + if (maxLength > 0) { + fieldSpec.setMaxLength(maxLength); + } + } + String column = fieldSpec.getName(); FieldIndexConfigs indexConfigs = Optional.ofNullable(config.getIndexConfigByCol().get(column)) .orElse(FieldIndexConfigs.EMPTY); @@ -290,6 +298,7 @@ public boolean isMutableSegment() { // Only support generating raw index on single-value columns that do not have inverted index while // consuming. After consumption completes and the segment is built, all single-value columns can have raw index + // Dictionary-encoded column MutableDictionary dictionary; if (isDictionary) { @@ -565,6 +574,7 @@ private void updateDictionary(GenericRow row) { for (Map.Entry entry : _indexContainerMap.entrySet()) { IndexContainer indexContainer = entry.getValue(); MutableDictionary dictionary = indexContainer._dictionary; + if (dictionary == null) { continue; } @@ -620,6 +630,10 @@ private void addNewRow(int docId, GenericRow row) { case DOUBLE: forwardIndex.add(((Number) value).doubleValue(), -1, docId); break; + case BIG_DECIMAL: + case BYTES: + forwardIndex.add(indexContainer._valueAggregator.serializeAggregatedValue(value), -1, docId); + break; default: throw new UnsupportedOperationException( "Unsupported data type: " + dataType + " for aggregation: " + column); @@ -796,6 +810,11 @@ private void aggregateMetrics(GenericRow row, int docId) { valueAggregator.getAggregatedValueType(), valueAggregator.getAggregationType(), dataType)); } break; + case BYTES: + Object currentValue = 
valueAggregator.deserializeAggregatedValue(forwardIndex.getBytes(docId)); + Object newVal = valueAggregator.applyRawValue(currentValue, value); + forwardIndex.setBytes(docId, valueAggregator.serializeAggregatedValue(newVal)); + break; default: throw new UnsupportedOperationException( String.format("Aggregation type %s of %s not supported for %s", valueAggregator.getAggregatedValueType(), @@ -1198,8 +1217,8 @@ private static Map> fromAggregateMetrics(R Map> columnNameToAggregator = new HashMap<>(); for (String metricName : segmentConfig.getSchema().getMetricNames()) { - columnNameToAggregator.put(metricName, - Pair.of(metricName, ValueAggregatorFactory.getValueAggregator(AggregationFunctionType.SUM))); + columnNameToAggregator.put(metricName, Pair.of(metricName, + ValueAggregatorFactory.getValueAggregator(AggregationFunctionType.SUM, Collections.EMPTY_LIST))); } return columnNameToAggregator; } @@ -1216,8 +1235,20 @@ private static Map> fromAggregationConfig( "aggregation function must be a function: %s", config); FunctionContext functionContext = expressionContext.getFunction(); TableConfigUtils.validateIngestionAggregation(functionContext.getFunctionName()); - Preconditions.checkState(functionContext.getArguments().size() == 1, - "aggregation function can only have one argument: %s", config); + + if ("distinctcounthll".equalsIgnoreCase(functionContext.getFunctionName())) { + Preconditions.checkState( + functionContext.getArguments().size() >= 1 && functionContext.getArguments().size() <= 2, + "distinctcounthll function can have max two arguments: %s", config); + } else if ("sumprecision".equalsIgnoreCase(functionContext.getFunctionName())) { + Preconditions.checkState( + functionContext.getArguments().size() >= 1 && functionContext.getArguments().size() <= 3, + "sumprecision function can have max three arguments: %s", config); + } else { + Preconditions.checkState(functionContext.getArguments().size() == 1, + "aggregation function can only have one argument: %s", 
config); + } + ExpressionContext argument = functionContext.getArguments().get(0); Preconditions.checkState(argument.getType() == ExpressionContext.Type.IDENTIFIER, "aggregator function argument must be a identifier: %s", config); @@ -1225,8 +1256,8 @@ private static Map> fromAggregationConfig( AggregationFunctionType functionType = AggregationFunctionType.getAggregationFunctionType(functionContext.getFunctionName()); - columnNameToAggregator.put(config.getColumnName(), - Pair.of(argument.getIdentifier(), ValueAggregatorFactory.getValueAggregator(functionType))); + columnNameToAggregator.put(config.getColumnName(), Pair.of(argument.getIdentifier(), + ValueAggregatorFactory.getValueAggregator(functionType, functionContext.getArguments()))); } return columnNameToAggregator; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/forward/FixedByteSVMutableForwardIndex.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/forward/FixedByteSVMutableForwardIndex.java index 13fce48a21f6..75153a460c36 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/forward/FixedByteSVMutableForwardIndex.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/forward/FixedByteSVMutableForwardIndex.java @@ -63,15 +63,20 @@ public class FixedByteSVMutableForwardIndex implements MutableForwardIndex { /** * @param storedType Data type of the values + * @param fixedLength Fixed length of values if known: only used for BYTES field and Hyperloglog values for now. * @param numRowsPerChunk Number of rows to pack in one chunk before a new chunk is created. * @param memoryManager Memory manager to be used for allocating memory. * @param allocationContext Allocation allocationContext. 
*/ - public FixedByteSVMutableForwardIndex(boolean dictionaryEncoded, DataType storedType, int numRowsPerChunk, - PinotDataBufferMemoryManager memoryManager, String allocationContext) { + public FixedByteSVMutableForwardIndex(boolean dictionaryEncoded, DataType storedType, int fixedLength, + int numRowsPerChunk, PinotDataBufferMemoryManager memoryManager, String allocationContext) { _dictionaryEncoded = dictionaryEncoded; _storedType = storedType; - _valueSizeInBytes = storedType.size(); + if (storedType == DataType.BYTES || storedType == DataType.BIG_DECIMAL) { + _valueSizeInBytes = fixedLength; + } else { + _valueSizeInBytes = storedType.size(); + } _numRowsPerChunk = numRowsPerChunk; _chunkSizeInBytes = numRowsPerChunk * _valueSizeInBytes; _memoryManager = memoryManager; @@ -79,6 +84,11 @@ public FixedByteSVMutableForwardIndex(boolean dictionaryEncoded, DataType stored addBuffer(); } + public FixedByteSVMutableForwardIndex(boolean dictionaryEncoded, DataType valueType, int numRowsPerChunk, + PinotDataBufferMemoryManager memoryManager, String allocationContext) { + this(dictionaryEncoded, valueType, -1, numRowsPerChunk, memoryManager, allocationContext); + } + @Override public boolean isDictionaryEncoded() { return _dictionaryEncoded; @@ -195,6 +205,22 @@ public void setDouble(int docId, double value) { getWriterForRow(docId).setDouble(docId, value); } + @Override + public byte[] getBytes(int docId) { + int bufferId = getBufferId(docId); + return _readers.get(bufferId).getBytes(docId); + } + + @Override + public void setBytes(int docId, byte[] value) { + if (value.length != _valueSizeInBytes) { + throw new IllegalArgumentException("Expected value size to be " + _valueSizeInBytes + " but was " + value.length); + } + + addBufferIfNeeded(docId); + getWriterForRow(docId).setBytes(docId, value); + } + private WriterWithOffset getWriterForRow(int row) { return _writers.get(getBufferId(row)); } @@ -267,6 +293,10 @@ public void setFloat(int row, float value) { public 
void setDouble(int row, double value) { _writer.setDouble(row - _startRowId, 0, value); } + + public void setBytes(int row, byte[] value) { + _writer.setBytes(row - _startRowId, 0, value); + } } /** @@ -307,6 +337,10 @@ public BigDecimal getBigDecimal(int row) { return BigDecimalUtils.deserialize(_reader.getBytes(row - _startRowId, 0)); } + public byte[] getBytes(int row) { + return _reader.getBytes(row - _startRowId, 0); + } + public FixedByteSingleValueMultiColReader getReader() { return _reader; } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java index e45cca536e91..51c2afad2011 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java @@ -58,6 +58,9 @@ import org.apache.pinot.spi.data.FieldSpec; import org.apache.pinot.spi.data.Schema; +import static org.apache.pinot.spi.data.FieldSpec.DataType.BIG_DECIMAL; +import static org.apache.pinot.spi.data.FieldSpec.DataType.BYTES; + public class ForwardIndexType extends AbstractIndexType @@ -269,13 +272,17 @@ public MutableIndex createMutableIndex(MutableIndexContext context, ForwardIndex String column = context.getFieldSpec().getName(); String segmentName = context.getSegmentName(); FieldSpec.DataType storedType = context.getFieldSpec().getDataType().getStoredType(); + int maxLength = context.getFieldSpec().getMaxLength(); boolean isSingleValue = context.getFieldSpec().isSingleValueField(); if (!context.hasDictionary()) { if (isSingleValue) { String allocationContext = IndexUtil.buildAllocationContext(context.getSegmentName(), context.getFieldSpec().getName(), V1Constants.Indexes.RAW_SV_FORWARD_INDEX_FILE_EXTENSION); - if (storedType.isFixedWidth()) { - return 
new FixedByteSVMutableForwardIndex(false, storedType, context.getCapacity(), + // We consider BYTES with a specific maxLength as being also fixed width. + // For BIG_DECIMAL, maxLength is maximum size of a serialized big decimal, + // it is determined using the value aggregator if present and does not need to be specified on the schema. + if (isFieldFixed(context.getFieldSpec())) { + return new FixedByteSVMutableForwardIndex(false, storedType, maxLength, context.getCapacity(), context.getMemoryManager(), allocationContext); } else { // RealtimeSegmentStatsHistory does not have the stats for no-dictionary columns from previous consuming @@ -316,4 +323,13 @@ public MutableIndex createMutableIndex(MutableIndexContext context, ForwardIndex } } } + + private boolean isFieldFixed(FieldSpec fieldSpec) { + FieldSpec.DataType storedType = fieldSpec.getDataType().getStoredType(); + int maxLength = fieldSpec.getMaxLength(); + + return (storedType.isFixedWidth() || ( + (storedType.getStoredType() == BYTES || storedType.getStoredType() == BIG_DECIMAL) && maxLength > 0) + ); + } } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/startree/v2/builder/BaseSingleTreeBuilder.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/startree/v2/builder/BaseSingleTreeBuilder.java index 1c1b03832e79..da74a873f7ca 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/startree/v2/builder/BaseSingleTreeBuilder.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/startree/v2/builder/BaseSingleTreeBuilder.java @@ -22,6 +22,7 @@ import java.io.File; import java.io.IOException; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -142,7 +143,8 @@ static class Record { for (AggregationFunctionColumnPair functionColumnPair : functionColumnPairs) { _metrics[index] = functionColumnPair.toColumnName(); 
_functionColumnPairs[index] = functionColumnPair; - _valueAggregators[index] = ValueAggregatorFactory.getValueAggregator(functionColumnPair.getFunctionType()); + _valueAggregators[index] = + ValueAggregatorFactory.getValueAggregator(functionColumnPair.getFunctionType(), Collections.EMPTY_LIST); // Ignore the column for COUNT aggregation function if (_valueAggregators[index].getAggregationType() != AggregationFunctionType.COUNT) { diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java index a64170c60f55..18cfcada4c80 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java @@ -18,10 +18,12 @@ */ package org.apache.pinot.segment.local.utils; +import com.clearspring.analytics.stream.cardinality.HyperLogLog; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.ImmutableSet; +import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -42,6 +44,8 @@ import org.apache.pinot.common.request.context.RequestContextUtils; import org.apache.pinot.common.tier.TierFactory; import org.apache.pinot.common.utils.config.TagNameUtils; +import org.apache.pinot.segment.local.aggregator.ValueAggregator; +import org.apache.pinot.segment.local.aggregator.ValueAggregatorFactory; import org.apache.pinot.segment.local.function.FunctionEvaluator; import org.apache.pinot.segment.local.function.FunctionEvaluatorFactory; import org.apache.pinot.segment.local.segment.creator.impl.inv.BitSlicedRangeIndexCreator; @@ -106,7 +110,8 @@ private TableConfigUtils() { // hardcode the value here to avoid pulling the entire 
pinot-kinesis module as dependency. private static final String KINESIS_STREAM_TYPE = "kinesis"; private static final EnumSet SUPPORTED_INGESTION_AGGREGATIONS = - EnumSet.of(SUM, MIN, MAX, COUNT); + EnumSet.of(SUM, MIN, MAX, COUNT, DISTINCTCOUNTHLL, SUMPRECISION); + private static final Set UPSERT_DEDUP_ALLOWED_ROUTING_STRATEGIES = ImmutableSet.of(RoutingConfig.STRICT_REPLICA_GROUP_INSTANCE_SELECTOR_TYPE, RoutingConfig.MULTI_STAGE_REPLICA_GROUP_SELECTOR_TYPE); @@ -351,6 +356,7 @@ public static void validateIngestionConfig(TableConfig tableConfig, @Nullable Sc Set aggregationColumns = new HashSet<>(); for (AggregationConfig aggregationConfig : aggregationConfigs) { String columnName = aggregationConfig.getColumnName(); + FieldSpec fieldSpec = null; String aggregationFunction = aggregationConfig.getAggregationFunction(); if (columnName == null || aggregationFunction == null) { throw new IllegalStateException( @@ -358,7 +364,7 @@ public static void validateIngestionConfig(TableConfig tableConfig, @Nullable Sc } if (schema != null) { - FieldSpec fieldSpec = schema.getFieldSpecFor(columnName); + fieldSpec = schema.getFieldSpecFor(columnName); Preconditions.checkState(fieldSpec != null, "The destination column '" + columnName + "' of the aggregation function must be present in the schema"); Preconditions.checkState(fieldSpec.getFieldType() == FieldSpec.FieldType.METRIC, @@ -380,12 +386,75 @@ public static void validateIngestionConfig(TableConfig tableConfig, @Nullable Sc FunctionContext functionContext = expressionContext.getFunction(); validateIngestionAggregation(functionContext.getFunctionName()); - Preconditions.checkState(functionContext.getArguments().size() == 1, - "aggregation function can only have one argument: %s", aggregationConfig); + + List arguments = functionContext.getArguments(); + + if (("distinctcounthll".equals(functionContext.getFunctionName())) + || ("sumprecision".equals(functionContext.getFunctionName()))) { + String destinationColumn = 
aggregationConfig.getColumnName(); + FieldSpec destinationFieldSpec = schema.getFieldSpecFor(destinationColumn); + + if (destinationFieldSpec == null) { + throw new RuntimeException("couldn't find field config for " + destinationColumn + + ". Unable to validate aggregation config for " + functionContext.getFunctionName()); + } + int maxLength = destinationFieldSpec.getMaxLength(); + + if ("distinctcounthll".equals(functionContext.getFunctionName())) { + Preconditions.checkState( + functionContext.getArguments().size() >= 1 && functionContext.getArguments().size() <= 2, + "distinctcounthll function can have max two arguments: %s", aggregationConfig); + + int log2m = CommonConstants.Helix.DEFAULT_HYPERLOGLOG_LOG2M; + if (functionContext.getArguments().size() == 2) { + Preconditions.checkState( + StringUtils.isNumeric(functionContext.getArguments().get(1).getLiteral().getStringValue() + ), + "distinctcounthll function second argument must be a number"); + + log2m = Integer.parseInt(functionContext.getArguments().get(1).getLiteral().getStringValue()); + } + + int expectedBytesForHLL; + try { + expectedBytesForHLL = (new HyperLogLog(log2m)).getBytes().length; + } catch (IOException e) { + throw new RuntimeException(e); + } + + Preconditions.checkState(maxLength == expectedBytesForHLL, + "destination field for distinctcounthll must have maxLength property set to " + + expectedBytesForHLL + + ", the size of a HLL object with log2m of " + log2m); + } else if ("sumprecision".equals(functionContext.getFunctionName())) { + Preconditions.checkState(functionContext.getArguments().size() >= 2 + && functionContext.getArguments().size() <= 3, + "sumprecision must specify precision (required), scale (optional)", + aggregationConfig); + } + } else { + Preconditions.checkState(functionContext.getArguments().size() == 1, + "aggregation function can only have one argument: %s", aggregationConfig); + } ExpressionContext argument = functionContext.getArguments().get(0); 
Preconditions.checkState(argument.getType() == ExpressionContext.Type.IDENTIFIER, - "aggregator function argument must be a identifier: %s", aggregationConfig); + "aggregator function argument must be an identifier: %s", aggregationConfig); + + AggregationFunctionType functionType = + AggregationFunctionType.getAggregationFunctionType(functionContext.getFunctionName()); + ValueAggregator valueAggregator = ValueAggregatorFactory.getValueAggregator(functionType, arguments); + + if (schema != null && fieldSpec != null) { + if (functionType.equals(SUMPRECISION)) { + Preconditions.checkState(fieldSpec.getDataType() == DataType.BIG_DECIMAL, + "SUMPRECISION consuming aggregation requires the schema data type to be BIG_DECIMAL"); + } else { + Preconditions.checkState(valueAggregator.getAggregatedValueType() == fieldSpec.getDataType(), + "aggregator function datatype (%s) must be the same as the schema datatype (%s) for %s", + valueAggregator.getAggregatedValueType(), fieldSpec.getDataType(), fieldSpec.getName()); + } + } aggregationSourceColumns.add(argument.getIdentifier()); } @@ -464,7 +533,7 @@ public static void validateIngestionConfig(TableConfig tableConfig, @Nullable Sc */ public static void validateIngestionAggregation(String name) { for (AggregationFunctionType functionType : SUPPORTED_INGESTION_AGGREGATIONS) { - if (functionType.getName().equals(name)) { + if (functionType.getName().toLowerCase().equals(name)) { return; } } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImplIngestionAggregationTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImplIngestionAggregationTest.java index e0aea45373c7..607174f7d88a 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImplIngestionAggregationTest.java +++ 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImplIngestionAggregationTest.java @@ -18,6 +18,9 @@ */ package org.apache.pinot.segment.local.indexsegment.mutable; +import com.clearspring.analytics.stream.cardinality.CardinalityMergeException; +import com.clearspring.analytics.stream.cardinality.HyperLogLog; +import java.math.BigDecimal; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -27,11 +30,15 @@ import java.util.Map; import java.util.Random; import java.util.Set; +import org.apache.pinot.common.request.Literal; +import org.apache.pinot.common.request.context.ExpressionContext; +import org.apache.pinot.segment.local.aggregator.DistinctCountHLLValueAggregator; import org.apache.pinot.spi.config.table.ingestion.AggregationConfig; import org.apache.pinot.spi.data.FieldSpec; import org.apache.pinot.spi.data.Schema; import org.apache.pinot.spi.data.readers.GenericRow; import org.apache.pinot.spi.stream.StreamMessageMetadata; +import org.apache.pinot.spi.utils.BigDecimalUtils; import org.testng.Assert; import org.testng.annotations.Test; @@ -81,10 +88,10 @@ public void testSameSrcDifferentAggregations() for (List metrics : addRows(1, mutableSegmentImpl)) { expectedMin.put(metrics.get(0).getKey(), Math.min(expectedMin.getOrDefault(metrics.get(0).getKey(), Double.POSITIVE_INFINITY), - metrics.get(0).getValue())); + (Integer) metrics.get(0).getValue())); expectedMax.put(metrics.get(0).getKey(), Math.max(expectedMax.getOrDefault(metrics.get(0).getKey(), Double.NEGATIVE_INFINITY), - metrics.get(0).getValue())); + (Integer) metrics.get(0).getValue())); } GenericRow reuse = new GenericRow(); @@ -115,9 +122,9 @@ public void testSameAggregationDifferentSrc() Map expectedSum2 = new HashMap<>(); for (List metrics : addRows(2, mutableSegmentImpl)) { expectedSum1.put(metrics.get(0).getKey(), - expectedSum1.getOrDefault(metrics.get(0).getKey(), 0) + metrics.get(0).getValue()); + 
expectedSum1.getOrDefault(metrics.get(0).getKey(), 0) + (Integer) (metrics.get(0).getValue())); expectedSum2.put(metrics.get(1).getKey(), - expectedSum2.getOrDefault(metrics.get(1).getKey(), 0L) + metrics.get(1).getValue().longValue()); + expectedSum2.getOrDefault(metrics.get(1).getKey(), 0L) + ((Integer) metrics.get(1).getValue()).longValue()); } GenericRow reuse = new GenericRow(); @@ -131,6 +138,123 @@ public void testSameAggregationDifferentSrc() mutableSegmentImpl.destroy(); } + + @Test + public void testValuesAreNull() + throws Exception { + String m1 = "sum1"; + + Schema schema = + getSchemaBuilder().addMetric(m1, FieldSpec.DataType.INT).build(); + MutableSegmentImpl mutableSegmentImpl = + MutableSegmentImplTestUtils.createMutableSegmentImpl(schema, new HashSet<>(Arrays.asList(m1)), + VAR_LENGTH_SET, INVERTED_INDEX_SET, + Arrays.asList(new AggregationConfig(m1, "SUM(metric)"))); + + + Set keys = new HashSet<>(); + + long seed = 2; + Random random = new Random(seed); + StreamMessageMetadata defaultMetadata = new StreamMessageMetadata(System.currentTimeMillis(), null); + + for (int i = 0; i < NUM_ROWS; i++) { + // Generate random int to prevent overflow + GenericRow row = getRow(random, 1); + row.putValue(METRIC, null); + mutableSegmentImpl.index(row, defaultMetadata); + + String key = buildKey(row); + keys.add(key); + } + + int numDocsIndexed = mutableSegmentImpl.getNumDocsIndexed(); + Assert.assertEquals(numDocsIndexed, keys.size()); + + // Assert that aggregation happened. 
+ Assert.assertTrue(numDocsIndexed < NUM_ROWS); + + GenericRow reuse = new GenericRow(); + for (int docId = 0; docId < keys.size(); docId++) { + GenericRow row = mutableSegmentImpl.getRecord(docId, reuse); + String key = buildKey(row); + Assert.assertEquals(row.getValue(m1), 0, key); + } + + mutableSegmentImpl.destroy(); + } + + @Test + public void testDISTINCTCOUNTHLL() throws Exception { + String m1 = "metric_DISTINCTCOUNTHLL"; + Schema schema = getSchemaBuilder().addMetric(m1, FieldSpec.DataType.BYTES).build(); + // Size of hll object for when log2m is 12 + schema.getFieldSpecFor(m1).setMaxLength(2740); + + MutableSegmentImpl mutableSegmentImpl = + MutableSegmentImplTestUtils.createMutableSegmentImpl(schema, new HashSet<>(Arrays.asList(m1)), + VAR_LENGTH_SET, INVERTED_INDEX_SET, + Arrays.asList(new AggregationConfig(m1, "DISTINCTCOUNTHLL(metric, 12)"))); + + Map expected = new HashMap<>(); + List metrics = addRowsDistinctCountHLL(998, mutableSegmentImpl); + for (Metric metric : metrics) { + expected.put(metric.getKey(), (HLLTestData) metric.getValue()); + } + + List arguments = + List.of( + ExpressionContext.forIdentifier("distinctcounthll"), + ExpressionContext.forLiteralContext(Literal.stringValue("12")) + ); + DistinctCountHLLValueAggregator valueAggregator = new DistinctCountHLLValueAggregator(arguments); + + Set integers = new HashSet<>(); + + /* + Assert that the distinct count is within an error margin. We assert on the cardinality of the HLL in the docID + and the + HLL we made, but also on the cardinality of the HLL in the docID and the actual cardinality from the set of + integers. 
+ */ + GenericRow reuse = new GenericRow(); + for (int docId = 0; docId < expected.size(); docId++) { + GenericRow row = mutableSegmentImpl.getRecord(docId, reuse); + String key = buildKey(row); + + integers.addAll(expected.get(key)._integers); + + HyperLogLog expectedHLL = expected.get(key)._hll; + HyperLogLog actualHLL = valueAggregator.deserializeAggregatedValue((byte[]) row.getValue(m1)); + + Assert.assertEquals(actualHLL.cardinality(), expectedHLL.cardinality(), (int) (expectedHLL.cardinality() * 0.04), + "The HLL cardinality from the index is within a tolerable error margin (4%) of the cardinality of the " + + "expected HLL."); + Assert.assertEquals(actualHLL.cardinality(), expected.get(key)._integers.size(), + expected.get(key)._integers.size() * 0.04, + "The HLL cardinality from the index is within a tolerable error margin (4%) of the actual cardinality of " + + "the integers."); + } + + /* + Assert that the aggregated HyperLogLog is also within the error margin + */ + HyperLogLog togetherHLL = new HyperLogLog(12); + expected.forEach((key, value) -> { + try { + togetherHLL.addAll(value._hll); + } catch (CardinalityMergeException e) { + e.printStackTrace(); + throw new RuntimeException(e); + } + }); + + Assert.assertEquals(togetherHLL.cardinality(), integers.size(), (int) (integers.size() * 0.04), + "The aggregated HLL cardinality is within a tolerable error margin (4%) of the actual cardinality of the " + + "integers."); + mutableSegmentImpl.destroy(); + } + @Test public void testCOUNT() throws Exception { @@ -166,22 +290,40 @@ private String buildKey(GenericRow row) { TIME_COLUMN1) + KEY_SEPARATOR + row.getValue(TIME_COLUMN2); } - private GenericRow getRow(Random random) { + private GenericRow getRow(Random random, Integer multiplicationFactor) { GenericRow row = new GenericRow(); - row.putValue(DIMENSION_1, random.nextInt(10)); + row.putValue(DIMENSION_1, random.nextInt(2 * multiplicationFactor)); row.putValue(DIMENSION_2, 
STRING_VALUES.get(random.nextInt(STRING_VALUES.size()))); - row.putValue(TIME_COLUMN1, random.nextInt(10)); - row.putValue(TIME_COLUMN2, random.nextInt(5)); + row.putValue(TIME_COLUMN1, random.nextInt(2 * multiplicationFactor)); + row.putValue(TIME_COLUMN2, random.nextInt(2 * multiplicationFactor)); return row; } + private class HLLTestData { + private HyperLogLog _hll; + private Set _integers; + + public HLLTestData(HyperLogLog hll, Set integers) { + _hll = hll; + _integers = integers; + } + + public HyperLogLog getHll() { + return _hll; + } + + public Set getIntegers() { + return _integers; + } + } + private class Metric { private final String _key; - private final Integer _value; + private final Object _value; - Metric(String key, Integer value) { + Metric(String key, Object value) { _key = key; _value = value; } @@ -190,11 +332,100 @@ public String getKey() { return _key; } - public Integer getValue() { + public Object getValue() { return _value; } } + private List addRowsDistinctCountHLL(long seed, MutableSegmentImpl mutableSegmentImpl) + throws Exception { + List metrics = new ArrayList<>(); + + Random random = new Random(seed); + StreamMessageMetadata defaultMetadata = new StreamMessageMetadata(System.currentTimeMillis(), null); + + HashMap hllMap = new HashMap<>(); + HashMap> distinctMap = new HashMap<>(); + + Integer rows = 500000; + + for (int i = 0; i < (rows); i++) { + GenericRow row = getRow(random, 1); + String key = buildKey(row); + + int metricValue = random.nextInt(5000000); + row.putValue(METRIC, metricValue); + + if (hllMap.containsKey(key)) { + hllMap.get(key).offer(row.getValue(METRIC)); + distinctMap.get(key).add(metricValue); + } else { + HyperLogLog hll = new HyperLogLog(12); + hll.offer(row.getValue(METRIC)); + hllMap.put(key, hll); + distinctMap.put(key, new HashSet<>(metricValue)); + } + + mutableSegmentImpl.index(row, defaultMetadata); + } + + distinctMap.forEach( + (key, value) -> metrics.add(new Metric(key, new 
HLLTestData(hllMap.get(key), distinctMap.get(key))))); + + int numDocsIndexed = mutableSegmentImpl.getNumDocsIndexed(); + Assert.assertEquals(numDocsIndexed, hllMap.keySet().size()); + + // Assert that aggregation happened. + Assert.assertTrue(numDocsIndexed < NUM_ROWS); + + return metrics; + } + + private List addRowsSUMPRECISION(long seed, MutableSegmentImpl mutableSegmentImpl) + throws Exception { + List metrics = new ArrayList<>(); + + Random random = new Random(seed); + StreamMessageMetadata defaultMetadata = new StreamMessageMetadata(System.currentTimeMillis(), null); + + HashMap bdMap = new HashMap<>(); + HashMap> bdIndividualMap = new HashMap<>(); + + Integer rows = 50000; + + for (int i = 0; i < (rows); i++) { + GenericRow row = getRow(random, 1); + String key = buildKey(row); + + BigDecimal metricValue = generateRandomBigDecimal(random, 5, 6); + row.putValue(METRIC, metricValue.toString()); + + if (bdMap.containsKey(key)) { + bdMap.put(key, bdMap.get(key).add(metricValue)); + bdIndividualMap.get(key).add(metricValue); + } else { + bdMap.put(key, metricValue); + ArrayList bdList = new ArrayList<>(); + bdList.add(metricValue); + bdIndividualMap.put(key, bdList); + } + + mutableSegmentImpl.index(row, defaultMetadata); + } + + for (String key : bdMap.keySet()) { + metrics.add(new Metric(key, bdMap.get(key))); + } + + int numDocsIndexed = mutableSegmentImpl.getNumDocsIndexed(); + Assert.assertEquals(numDocsIndexed, bdMap.keySet().size()); + + // Assert that aggregation happened. 
+ Assert.assertTrue(numDocsIndexed < NUM_ROWS); + + return metrics; + } + private List> addRows(long seed, MutableSegmentImpl mutableSegmentImpl) throws Exception { List> metrics = new ArrayList<>(); @@ -205,8 +436,8 @@ private List> addRows(long seed, MutableSegmentImpl mutableSegmentI StreamMessageMetadata defaultMetadata = new StreamMessageMetadata(System.currentTimeMillis(), new GenericRow()); for (int i = 0; i < NUM_ROWS; i++) { - GenericRow row = getRow(random); - // This needs to be relatively low since it will tend to overflow with the Int-to-Double conversion. + // Generate random int to prevent overflow + GenericRow row = getRow(random, 1); Integer metricValue = random.nextInt(10000); Integer metric2Value = random.nextInt(); row.putValue(METRIC, metricValue); @@ -227,4 +458,77 @@ private List> addRows(long seed, MutableSegmentImpl mutableSegmentI return metrics; } + + @Test + public void testSUMPRECISION() throws Exception { + String m1 = "metric_SUMPRECISION"; + Schema schema = getSchemaBuilder().addMetric(m1, FieldSpec.DataType.BIG_DECIMAL).build(); + + MutableSegmentImpl mutableSegmentImpl = + MutableSegmentImplTestUtils.createMutableSegmentImpl(schema, new HashSet<>(Arrays.asList(m1)), + VAR_LENGTH_SET, INVERTED_INDEX_SET, + // Setting precision to 38 in the arguments for SUMPRECISION + Arrays.asList(new AggregationConfig(m1, "SUMPRECISION(metric, 38)"))); + + Map expected = new HashMap<>(); + List metrics = addRowsSUMPRECISION(998, mutableSegmentImpl); + for (Metric metric : metrics) { + expected.put(metric.getKey(), (BigDecimal) metric.getValue()); + } + + /* + Assert that the aggregated values are correct + */ + GenericRow reuse = new GenericRow(); + for (int docId = 0; docId < expected.size(); docId++) { + GenericRow row = mutableSegmentImpl.getRecord(docId, reuse); + String key = buildKey(row); + + BigDecimal expectedBigDecimal = expected.get(key); + BigDecimal actualBigDecimal = (BigDecimal) row.getValue(m1); + + 
Assert.assertEquals(actualBigDecimal, expectedBigDecimal, + "The aggregated SUM does not match the expected SUM"); + } + mutableSegmentImpl.destroy(); + } + + @Test + public void testBigDecimalTooBig() { + String m1 = "metric_SUMPRECISION"; + Schema schema = getSchemaBuilder().addMetric(m1, FieldSpec.DataType.BIG_DECIMAL).build(); + + int seed = 1; + Random random = new Random(seed); + StreamMessageMetadata defaultMetadata = new StreamMessageMetadata(System.currentTimeMillis(), null); + + MutableSegmentImpl mutableSegmentImpl = + MutableSegmentImplTestUtils.createMutableSegmentImpl(schema, new HashSet<>(Arrays.asList(m1)), + VAR_LENGTH_SET, INVERTED_INDEX_SET, + Arrays.asList(new AggregationConfig(m1, "SUMPRECISION(metric, 3)"))); + + // Make a big decimal larger than 3 precision and try to index it + BigDecimal large = BigDecimalUtils.generateMaximumNumberWithPrecision(5); + GenericRow row = getRow(random, 1); + + row.putValue("metric", large); + Assert.assertThrows(IllegalArgumentException.class, () -> { + mutableSegmentImpl.index(row, defaultMetadata); + }); + } + + private BigDecimal generateRandomBigDecimal(Random random, int maxPrecision, int scale) { + int precision = 1 + random.nextInt(maxPrecision); + + String s = ""; + for (int i = 0; i < precision; i++) { + s = s + (1 + random.nextInt(9)); + } + + if ((1 + random.nextInt(2)) == 1) { + return (new BigDecimal(s).setScale(scale)).negate(); + } else { + return new BigDecimal(s).setScale(scale); + } + } } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/forward/mutable/FixedByteSVMutableForwardIndexTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/forward/mutable/FixedByteSVMutableForwardIndexTest.java index fbdcefcdb7c1..aa12c792ace6 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/forward/mutable/FixedByteSVMutableForwardIndexTest.java +++ 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/forward/mutable/FixedByteSVMutableForwardIndexTest.java @@ -18,6 +18,7 @@ */ package org.apache.pinot.segment.local.segment.index.forward.mutable; +import com.clearspring.analytics.stream.cardinality.HyperLogLog; import java.io.IOException; import java.util.Arrays; import java.util.Random; @@ -113,6 +114,75 @@ private void testDictId(final Random random, final int rows, final int div) readerWriter.close(); } + @Test + public void testBytes() throws IOException { + int rows = 10; + Random r = new Random(); + final long seed = r.nextLong(); + r = new Random(seed); + for (int div = 1; div <= rows / 2; div++) { + testBytes(r, rows, div); + } + } + + private void testBytes(final Random random, final int rows, final int div) throws IOException { + int hllLog2M12Size = 2740; + int log2m = 12; + + FixedByteSVMutableForwardIndex readerWriter; + readerWriter = + new FixedByteSVMutableForwardIndex(false, DataType.BYTES, hllLog2M12Size, rows / div, _memoryManager, "Long"); + byte[][] data = new byte[rows][]; + + for (int i = 0; i < rows; i++) { + HyperLogLog hll = new HyperLogLog(log2m); + hll.offer(random.nextLong()); + data[i] = hll.getBytes(); + Assert.assertEquals(data[i].length, hllLog2M12Size); + readerWriter.setBytes(i, data[i]); + Assert.assertEquals(readerWriter.getBytes(i).length, data[i].length); + Assert.assertEquals(readerWriter.getBytes(i), data[i]); + } + for (int i = 0; i < rows; i++) { + Assert.assertEquals(readerWriter.getBytes(i), data[i]); + } + + // Test mutability by overwriting randomly selected rows. 
+ for (int i = 0; i < rows; i++) { + if (random.nextFloat() >= 0.5) { + HyperLogLog hll = new HyperLogLog(log2m); + hll.offer(random.nextLong()); + data[i] = hll.getBytes(); + readerWriter.setBytes(i, data[i]); + } + } + for (int i = 0; i < rows; i++) { + Assert.assertEquals(readerWriter.getBytes(i), data[i]); + } + + // Write to a large enough row index to ensure multiple chunks are correctly allocated. + int start = rows * 4; + for (int i = 0; i < rows; i++) { + HyperLogLog hll = new HyperLogLog(log2m); + hll.offer(random.nextLong()); + data[i] = hll.getBytes(); + readerWriter.setBytes(start + i, data[i]); + } + + for (int i = 0; i < rows; i++) { + Assert.assertEquals(readerWriter.getBytes(start + i), data[i]); + } + + // Ensure that rows not written default to an empty byte array. + byte[] emptyBytes = new byte[hllLog2M12Size]; + start = rows * 2; + for (int i = 0; i < 2 * rows; i++) { + byte[] bytes = readerWriter.getBytes(start + i); + Assert.assertEquals(bytes, emptyBytes); + } + readerWriter.close(); + } + @Test public void testLong() throws IOException { diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java index 9d58b42161bb..a31e4a0cc1ca 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java @@ -472,41 +472,91 @@ public void ingestionAggregationConfigsTest() { // expected } - ingestionConfig.setAggregationConfigs( - Collections.singletonList(new AggregationConfig("m1", "DISTINCTCOUNTHLL(s1)"))); + ingestionConfig.setAggregationConfigs(Collections.singletonList(new AggregationConfig("m1", "SUM(s1 - s2)"))); try { TableConfigUtils.validateIngestionConfig(tableConfig, schema); - Assert.fail("Should fail due to not supported aggregation function"); + 
Assert.fail("Should fail due to inner value not being a column"); } catch (IllegalStateException e) { // expected } - ingestionConfig.setAggregationConfigs(Collections.singletonList(new AggregationConfig("m1", "s1 + s2"))); + ingestionConfig.setAggregationConfigs(Collections.singletonList(new AggregationConfig("m1", "SUM(m1)"))); + TableConfigUtils.validateIngestionConfig(tableConfig, schema); + + ingestionConfig.setAggregationConfigs(Collections.singletonList(new AggregationConfig("m1", "SUM(s1)"))); + TableConfigUtils.validateIngestionConfig(tableConfig, schema); + + schema.addField(new MetricFieldSpec("m2", FieldSpec.DataType.DOUBLE)); try { TableConfigUtils.validateIngestionConfig(tableConfig, schema); - Assert.fail("Should fail due to multiple arguments"); + Assert.fail("Should fail due to one metric column not being aggregated"); } catch (IllegalStateException e) { // expected } - ingestionConfig.setAggregationConfigs(Collections.singletonList(new AggregationConfig("m1", "SUM(s1 - s2)"))); + schema = new Schema.SchemaBuilder().setSchemaName(TABLE_NAME).addMetric("d1", FieldSpec.DataType.BYTES).build(); + schema.getFieldSpecFor("d1").setMaxLength(180); + + // distinctcounthllmv is not supported, we expect this to not validate + List aggregationConfigs = Arrays.asList(new AggregationConfig("d1", "DISTINCTCOUNTHLLMV(s1)")); + ingestionConfig.setAggregationConfigs(aggregationConfigs); + tableConfig = + new TableConfigBuilder(TableType.REALTIME).setTableName("myTable_REALTIME").setTimeColumnName("timeColumn") + .setIngestionConfig(ingestionConfig).build(); + try { TableConfigUtils.validateIngestionConfig(tableConfig, schema); - Assert.fail("Should fail due to inner value not being a column"); + Assert.fail("Should fail due to not supported aggregation function"); } catch (IllegalStateException e) { // expected } - ingestionConfig.setAggregationConfigs(Collections.singletonList(new AggregationConfig("m1", "SUM(m1)"))); - 
TableConfigUtils.validateIngestionConfig(tableConfig, schema); +// test the distinctcounthll validation for various log2m sizes + HashMap log2mToExpectedSize = new HashMap<>(); + log2mToExpectedSize.put(8, 180); + log2mToExpectedSize.put(12, 2740); - ingestionConfig.setAggregationConfigs(Collections.singletonList(new AggregationConfig("m1", "SUM(s1)"))); - TableConfigUtils.validateIngestionConfig(tableConfig, schema); + for (Map.Entry entry : log2mToExpectedSize.entrySet()) { + // set the max length to the HLL expected bytes size + schema.getFieldSpecFor("d1").setMaxLength(entry.getValue()); + + aggregationConfigs = Arrays.asList(new AggregationConfig("d1", "DISTINCTCOUNTHLL(s1, " + entry.getKey() + ")")); + ingestionConfig.setAggregationConfigs(aggregationConfigs); + tableConfig = + new TableConfigBuilder(TableType.REALTIME).setTableName("myTable_REALTIME").setTimeColumnName("timeColumn") + .setIngestionConfig(ingestionConfig).build(); + + try { + TableConfigUtils.validateIngestionConfig(tableConfig, schema); + } catch (IllegalStateException e) { + Assert.fail( + "The HLL object size based on the log2m doesn't match the destination BYTES field's maxLength property"); + } + } + + // distinctcounthll, expect not specified log2m argument to default to 8 + schema.getFieldSpecFor("d1").setMaxLength(180); + aggregationConfigs = Arrays.asList(new AggregationConfig("d1", "DISTINCTCOUNTHLL(s1)")); + ingestionConfig.setAggregationConfigs(aggregationConfigs); + tableConfig = + new TableConfigBuilder(TableType.REALTIME).setTableName("myTable_REALTIME").setTimeColumnName("timeColumn") + .setIngestionConfig(ingestionConfig).build(); - schema.addField(new MetricFieldSpec("m2", FieldSpec.DataType.DOUBLE)); try { TableConfigUtils.validateIngestionConfig(tableConfig, schema); - Assert.fail("Should fail due to one metric column not being aggregated"); + } catch (IllegalStateException e) { + Assert.fail("Log2m defaulted to 8 but BYTES schema maxLength is not the expected size of 
180"); + } + + aggregationConfigs = Arrays.asList(new AggregationConfig("d1", "s1 + s2")); + ingestionConfig.setAggregationConfigs(aggregationConfigs); + tableConfig = + new TableConfigBuilder(TableType.REALTIME).setTableName("myTable_REALTIME").setTimeColumnName("timeColumn") + .setIngestionConfig(ingestionConfig).build(); + + try { + TableConfigUtils.validateIngestionConfig(tableConfig, schema); + Assert.fail("Should fail due to multiple arguments"); } catch (IllegalStateException e) { // expected } diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/mutable/MutableForwardIndex.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/mutable/MutableForwardIndex.java index 5b63c989d438..b1fd1f14ae58 100644 --- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/mutable/MutableForwardIndex.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/mutable/MutableForwardIndex.java @@ -50,7 +50,17 @@ default void add(@Nonnull Object value, int dictId, int docId) { setDouble(docId, (double) value); break; case BIG_DECIMAL: - setBigDecimal(docId, (BigDecimal) value); + // If the Big Decimal is already serialized as byte[], use it directly. + // This is only possible when the Big Decimal is generated from a realtime pre-aggregation + // where SumPrecisionValueAggregator uses BigDecimalUtils.serializeWithSize() to serialize the value + // instead of the normal BigDecimalUtils.serialize(). + // setBigDecimal() underlying calls BigDecimalUtils.serialize() which is not the intention + // when the Big Decimal is already serialized. 
+ if (value instanceof byte[]) { + setBytes(docId, (byte[]) value); + } else { + setBigDecimal(docId, (BigDecimal) value); + } break; case STRING: setString(docId, (String) value); diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/data/FieldSpec.java b/pinot-spi/src/main/java/org/apache/pinot/spi/data/FieldSpec.java index 3c931204a925..0f16359255d8 100644 --- a/pinot-spi/src/main/java/org/apache/pinot/spi/data/FieldSpec.java +++ b/pinot-spi/src/main/java/org/apache/pinot/spi/data/FieldSpec.java @@ -100,7 +100,7 @@ public abstract class FieldSpec implements Comparable, Serializable { protected DataType _dataType; protected boolean _isSingleValueField = true; - // NOTE: for STRING column, this is the max number of characters; for BYTES column, this is the max number of bytes + // NOTE: for STRING column, this is the max number of characters; for BYTES column, this is the fixed number of bytes private int _maxLength = DEFAULT_MAX_LENGTH; protected Object _defaultNullValue; diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/BigDecimalUtils.java b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/BigDecimalUtils.java index 6827a7c860a2..c479acee0af5 100644 --- a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/BigDecimalUtils.java +++ b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/BigDecimalUtils.java @@ -35,6 +35,15 @@ public static int byteSize(BigDecimal value) { return (unscaledValue.bitLength() >>> 3) + 3; } + /** + * This gets the expected byte size of a big decimal with a specific precision. + * It is equal to (ceil(log2(10^precision) - 1) + */ + public static int byteSizeForFixedPrecision(int precision) { + BigDecimal bd = generateMaximumNumberWithPrecision(precision); + return byteSize(bd); + } + /** * Serializes a big decimal to a byte array. 
*/ @@ -49,6 +58,34 @@ public static byte[] serialize(BigDecimal value) { return valueBytes; } + public static byte[] serializeWithSize(BigDecimal value, int fixedSize) { + int scale = value.scale(); + BigInteger unscaledValue = value.unscaledValue(); + byte[] unscaledValueBytes = unscaledValue.toByteArray(); + + int unscaledBytesStartingIndex = fixedSize - unscaledValueBytes.length; + if (unscaledValueBytes.length > (fixedSize - 2)) { + throw new IllegalArgumentException("Big decimal of size " + (unscaledValueBytes.length + 2) + + " is too big to serialize into a fixed size of " + fixedSize + " bytes"); + } + + byte[] valueBytes = new byte[fixedSize]; + valueBytes[0] = (byte) (scale >> 8); + valueBytes[1] = (byte) scale; + + byte paddingByte = 0; + if (value.signum() < 0) { + paddingByte = -1; + } + + for (int i = 2; i < unscaledBytesStartingIndex; i++) { + valueBytes[i] = paddingByte; + } + + System.arraycopy(unscaledValueBytes, 0, valueBytes, unscaledBytesStartingIndex, unscaledValueBytes.length); + return valueBytes; + } + /** * Deserializes a big decimal from a byte array. 
*/ @@ -75,4 +112,8 @@ public static BigDecimal deserialize(ByteBuffer byteBuffer) { byteBuffer.get(bytes); return deserialize(bytes); } + + public static BigDecimal generateMaximumNumberWithPrecision(int precision) { + return (new BigDecimal("10")).pow(precision).subtract(new BigDecimal("1")); + } } diff --git a/pinot-spi/src/test/java/org/apache/pinot/spi/utils/BigDecimalUtilsTest.java b/pinot-spi/src/test/java/org/apache/pinot/spi/utils/BigDecimalUtilsTest.java index fe9387a14f90..47a83cdb1c7e 100644 --- a/pinot-spi/src/test/java/org/apache/pinot/spi/utils/BigDecimalUtilsTest.java +++ b/pinot-spi/src/test/java/org/apache/pinot/spi/utils/BigDecimalUtilsTest.java @@ -29,24 +29,62 @@ public class BigDecimalUtilsTest { @Test public void testBigDecimal() { - BigDecimal value = new BigDecimal("123456789.0123456789"); - byte[] serializedValue = BigDecimalUtils.serialize(value); - assertEquals(BigDecimalUtils.byteSize(value), serializedValue.length); - BigDecimal deserializedValue = BigDecimalUtils.deserialize(serializedValue); - assertEquals(deserializedValue, value); + BigDecimal[] testCases = { + new BigDecimal("0.123456789"), + new BigDecimal("-0.123456789"), + new BigDecimal("123456789"), + new BigDecimal("-123456789"), + new BigDecimal("123456789.0123456789"), + new BigDecimal("-123456789.0123456789"), + // Set the scale to a negative value + new BigDecimal("123456789.0123456789").setScale(-1, RoundingMode.HALF_UP), + new BigDecimal("-123456789.0123456789").setScale(-1, RoundingMode.HALF_UP), + // Set the scale to a negative value in byte + new BigDecimal("123456789.0123456789").setScale(128, RoundingMode.HALF_UP), + new BigDecimal("-123456789.0123456789").setScale(128, RoundingMode.HALF_UP) + }; + for (BigDecimal value : testCases) { + byte[] serializedValue = BigDecimalUtils.serialize(value); + assertEquals(BigDecimalUtils.byteSize(value), serializedValue.length); + BigDecimal deserializedValue = BigDecimalUtils.deserialize(serializedValue); + 
assertEquals(deserializedValue, value); + } + } - // Set the scale to a negative value - value = value.setScale(-1, RoundingMode.HALF_UP); - serializedValue = BigDecimalUtils.serialize(value); - assertEquals(BigDecimalUtils.byteSize(value), serializedValue.length); - deserializedValue = BigDecimalUtils.deserialize(serializedValue); - assertEquals(deserializedValue, value); + @Test + public void testBigDecimalSerializeWithSize() { + BigDecimal[] testCases = { + new BigDecimal("0.123456789"), + new BigDecimal("-0.123456789"), + new BigDecimal("123456789"), + new BigDecimal("-123456789"), + new BigDecimal("123456789.0123456789"), + new BigDecimal("-123456789.0123456789"), + new BigDecimal("123456789.0123456789").setScale(-1, RoundingMode.HALF_UP), + new BigDecimal("-123456789.0123456789").setScale(-1, RoundingMode.HALF_UP), + new BigDecimal("123456789.0123456789").setScale(128, RoundingMode.HALF_UP), + new BigDecimal("-123456789.0123456789").setScale(128, RoundingMode.HALF_UP) + }; + // One case of serialization with and without padding + int[] sizes = {0, 4}; + for (BigDecimal value : testCases) { + int actualSize = BigDecimalUtils.byteSize(value); + for (int size : sizes) { + byte[] serializedValue = BigDecimalUtils.serializeWithSize(value, actualSize + size); + assertEquals(actualSize + size, serializedValue.length); + BigDecimal deserializedValue = BigDecimalUtils.deserialize(serializedValue); + assertEquals(deserializedValue, value); + } + } + } - // Set the scale to a negative value in byte - value = value.setScale(128, RoundingMode.HALF_UP); - serializedValue = BigDecimalUtils.serialize(value); - assertEquals(BigDecimalUtils.byteSize(value), serializedValue.length); - deserializedValue = BigDecimalUtils.deserialize(serializedValue); - assertEquals(deserializedValue, value); + @Test + public void testGenerateMaximumNumberWithPrecision() { + int[] testCases = { 1, 3, 10, 38, 128 }; + for (int precision : testCases) { + BigDecimal bd = 
BigDecimalUtils.generateMaximumNumberWithPrecision(precision); + assertEquals(bd.precision(), precision); + assertEquals(bd.add(new BigDecimal("1")).precision(), precision + 1); + } } } From e85cf3ab56238890c248f2544a58bf51b8fb9901 Mon Sep 17 00:00:00 2001 From: Priyen Patel Date: Thu, 22 Jun 2023 10:40:57 -0400 Subject: [PATCH 02/21] fixed when max length is not the default --- .../local/segment/index/forward/ForwardIndexType.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java index 51c2afad2011..83885a7664c0 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java @@ -278,9 +278,6 @@ public MutableIndex createMutableIndex(MutableIndexContext context, ForwardIndex if (isSingleValue) { String allocationContext = IndexUtil.buildAllocationContext(context.getSegmentName(), context.getFieldSpec().getName(), V1Constants.Indexes.RAW_SV_FORWARD_INDEX_FILE_EXTENSION); - // We consider BYTES with a specific maxLength as being also fixed width. - // For BIG_DECIMAL, maxLength is maximum size of a serialized big decimal, - // it is determined using the value aggregator if present and does not need to be specified on the schema. if (isFieldFixed(context.getFieldSpec())) { return new FixedByteSVMutableForwardIndex(false, storedType, maxLength, context.getCapacity(), context.getMemoryManager(), allocationContext); @@ -324,12 +321,16 @@ public MutableIndex createMutableIndex(MutableIndexContext context, ForwardIndex } } + // We consider BYTES with a specific maxLength as being also fixed width. 
+ // For BIG_DECIMAL, maxLength is maximum size of a serialized big decimal, + // it is determined using the value aggregator if present and does not need to be specified on the schema. private boolean isFieldFixed(FieldSpec fieldSpec) { FieldSpec.DataType storedType = fieldSpec.getDataType().getStoredType(); int maxLength = fieldSpec.getMaxLength(); return (storedType.isFixedWidth() || ( - (storedType.getStoredType() == BYTES || storedType.getStoredType() == BIG_DECIMAL) && maxLength > 0) + (storedType.getStoredType() == BYTES || storedType.getStoredType() == BIG_DECIMAL) && maxLength > 0 && + maxLength != FieldSpec.DEFAULT_MAX_LENGTH) ); } } From 638b048a8a60485444c9eb3ad922c42341075bd1 Mon Sep 17 00:00:00 2001 From: Priyen Patel Date: Thu, 22 Jun 2023 10:49:01 -0400 Subject: [PATCH 03/21] lint --- .../segment/local/segment/index/forward/ForwardIndexType.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java index 83885a7664c0..3b04faa2d8f4 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java @@ -329,8 +329,8 @@ private boolean isFieldFixed(FieldSpec fieldSpec) { int maxLength = fieldSpec.getMaxLength(); return (storedType.isFixedWidth() || ( - (storedType.getStoredType() == BYTES || storedType.getStoredType() == BIG_DECIMAL) && maxLength > 0 && - maxLength != FieldSpec.DEFAULT_MAX_LENGTH) + (storedType.getStoredType() == BYTES || storedType.getStoredType() == BIG_DECIMAL) && maxLength > 0 + && maxLength != FieldSpec.DEFAULT_MAX_LENGTH) ); } } From 5d8383ac90ed8467893363d3b367b0eb8e2d3e5b Mon Sep 17 00:00:00 2001 From: Priyen Patel Date: Tue, 4 
Jul 2023 20:16:16 -0400 Subject: [PATCH 04/21] addr review comments, move some tests --- .../core/common/ObjectSerDeUtilsTest.java | 38 ------------------- .../DistinctCountHLLValueAggregator.java | 14 +++---- .../pinot/spi/utils/BigDecimalUtilsTest.java | 36 ++++++++++++++++++ 3 files changed, 43 insertions(+), 45 deletions(-) diff --git a/pinot-core/src/test/java/org/apache/pinot/core/common/ObjectSerDeUtilsTest.java b/pinot-core/src/test/java/org/apache/pinot/core/common/ObjectSerDeUtilsTest.java index 2b5126027932..ccd9ae3f6fd8 100644 --- a/pinot-core/src/test/java/org/apache/pinot/core/common/ObjectSerDeUtilsTest.java +++ b/pinot-core/src/test/java/org/apache/pinot/core/common/ObjectSerDeUtilsTest.java @@ -27,8 +27,6 @@ import it.unimi.dsi.fastutil.ints.IntOpenHashSet; import it.unimi.dsi.fastutil.ints.IntSet; import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap; -import java.math.BigDecimal; -import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -46,8 +44,6 @@ import org.apache.pinot.segment.local.customobject.QuantileDigest; import org.apache.pinot.segment.local.customobject.StringLongPair; import org.apache.pinot.segment.local.customobject.ValueLongPair; -import org.apache.pinot.spi.utils.BigDecimalUtils; -import org.testng.Assert; import org.testng.annotations.Test; import static org.testng.Assert.assertEquals; @@ -378,38 +374,4 @@ public void testDouble2LongMap() { assertEquals(actual, expected, ERROR_MESSAGE); } } - - @Test - public void testBigDecimalWithMaximumPrecisionSizeInBytes() { - Assert.assertEquals(BigDecimalUtils.byteSizeForFixedPrecision(18), 10); - Assert.assertEquals(BigDecimalUtils.byteSizeForFixedPrecision(32), 16); - Assert.assertEquals(BigDecimalUtils.byteSizeForFixedPrecision(38), 18); - } - - @Test - public void testBigDecimalSerializationWithSize() { - ArrayList bigDecimals = new ArrayList<>(); - bigDecimals.add(new BigDecimal("1000.123456")); - bigDecimals.add(new 
BigDecimal("1237663")); - bigDecimals.add(new BigDecimal("0.114141622")); - - for (BigDecimal bigDecimal : bigDecimals) { - int bytesNeeded = BigDecimalUtils.byteSize(bigDecimal); - - // Serialize big decimal equal to the target size - byte[] bytes = BigDecimalUtils.serializeWithSize(bigDecimal, bytesNeeded); - BigDecimal bigDecimalDeserialized = BigDecimalUtils.deserialize(bytes); - Assert.assertEquals(bigDecimalDeserialized, bigDecimal); - - // Serialize big decimal smaller than the target size - bytes = BigDecimalUtils.serializeWithSize(bigDecimal, bytesNeeded + 2); - bigDecimalDeserialized = BigDecimalUtils.deserialize(bytes); - Assert.assertEquals(bigDecimalDeserialized, bigDecimal); - - // Raise exception when trying to serialize a big decimal larger than target size - Assert.assertThrows(IllegalArgumentException.class, () -> { - BigDecimalUtils.serializeWithSize(bigDecimal, bytesNeeded - 4); - }); - } - } } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java index c0412f20e19c..4850cc384f73 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java @@ -34,7 +34,7 @@ public class DistinctCountHLLValueAggregator implements ValueAggregator { public static final DataType AGGREGATED_VALUE_TYPE = DataType.BYTES; private int _log2m = CommonConstants.Helix.DEFAULT_HYPERLOGLOG_LOG2M; - private int _log2MByteSize = 180; + private int _log2mByteSize = 180; // Byte size won't change once we get the initial aggregated value private int _maxByteSize; @@ -47,12 +47,12 @@ public DistinctCountHLLValueAggregator(List arguments) { return; } - String log2mLiteral = 
arguments.get(1).getLiteral().getStringValue(); - Preconditions.checkState(StringUtils.isNumeric(log2mLiteral), "log2m argument must be a numeric literal"); - - _log2m = Integer.parseInt(log2mLiteral); try { - _log2MByteSize = (new HyperLogLog(_log2m)).getBytes().length; + String log2mLit = arguments.get(1).getLiteral().getStringValue(); + Preconditions.checkState(StringUtils.isNumeric(log2mLit), "log2m argument must be a numeric literal"); + + _log2m = Integer.parseInt(log2mLit); + _log2mByteSize = (new HyperLogLog(_log2m)).getBytes().length; } catch (IOException e) { throw new RuntimeException(e); } @@ -78,7 +78,7 @@ public HyperLogLog getInitialAggregatedValue(Object rawValue) { } else { initialValue = new HyperLogLog(_log2m); initialValue.offer(rawValue); - _maxByteSize = Math.max(_maxByteSize, _log2MByteSize); + _maxByteSize = Math.max(_maxByteSize, _log2mByteSize); } return initialValue; } diff --git a/pinot-spi/src/test/java/org/apache/pinot/spi/utils/BigDecimalUtilsTest.java b/pinot-spi/src/test/java/org/apache/pinot/spi/utils/BigDecimalUtilsTest.java index 47a83cdb1c7e..f06afeda3682 100644 --- a/pinot-spi/src/test/java/org/apache/pinot/spi/utils/BigDecimalUtilsTest.java +++ b/pinot-spi/src/test/java/org/apache/pinot/spi/utils/BigDecimalUtilsTest.java @@ -20,6 +20,8 @@ import java.math.BigDecimal; import java.math.RoundingMode; +import java.util.ArrayList; +import org.testng.Assert; import org.testng.annotations.Test; import static org.testng.Assert.assertEquals; @@ -87,4 +89,38 @@ public void testGenerateMaximumNumberWithPrecision() { assertEquals(bd.add(new BigDecimal("1")).precision(), precision + 1); } } + + @Test + public void testBigDecimalWithMaximumPrecisionSizeInBytes() { + Assert.assertEquals(BigDecimalUtils.byteSizeForFixedPrecision(18), 10); + Assert.assertEquals(BigDecimalUtils.byteSizeForFixedPrecision(32), 16); + Assert.assertEquals(BigDecimalUtils.byteSizeForFixedPrecision(38), 18); + } + + @Test + public void 
testBigDecimalSerializationWithSize() { + ArrayList bigDecimals = new ArrayList<>(); + bigDecimals.add(new BigDecimal("1000.123456")); + bigDecimals.add(new BigDecimal("1237663")); + bigDecimals.add(new BigDecimal("0.114141622")); + + for (BigDecimal bigDecimal : bigDecimals) { + int bytesNeeded = BigDecimalUtils.byteSize(bigDecimal); + + // Serialize big decimal equal to the target size + byte[] bytes = BigDecimalUtils.serializeWithSize(bigDecimal, bytesNeeded); + BigDecimal bigDecimalDeserialized = BigDecimalUtils.deserialize(bytes); + Assert.assertEquals(bigDecimalDeserialized, bigDecimal); + + // Serialize big decimal smaller than the target size + bytes = BigDecimalUtils.serializeWithSize(bigDecimal, bytesNeeded + 2); + bigDecimalDeserialized = BigDecimalUtils.deserialize(bytes); + Assert.assertEquals(bigDecimalDeserialized, bigDecimal); + + // Raise exception when trying to serialize a big decimal larger than target size + Assert.assertThrows(IllegalArgumentException.class, () -> { + BigDecimalUtils.serializeWithSize(bigDecimal, bytesNeeded - 4); + }); + } + } } From 74c52793c219be9079c86d15b1316055ef7994b1 Mon Sep 17 00:00:00 2001 From: Priyen Patel Date: Wed, 26 Jul 2023 16:20:40 -0400 Subject: [PATCH 05/21] public DistinctCountHLLValueAggregator() not used in prod --- .../core/startree/v2/DistinctCountHLLStarTreeV2Test.java | 3 ++- .../v2/PreAggregatedDistinctCountHLLStarTreeV2Test.java | 5 ++++- .../local/aggregator/DistinctCountHLLValueAggregator.java | 3 --- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/DistinctCountHLLStarTreeV2Test.java b/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/DistinctCountHLLStarTreeV2Test.java index d903f505b413..32ad90434556 100644 --- a/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/DistinctCountHLLStarTreeV2Test.java +++ b/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/DistinctCountHLLStarTreeV2Test.java @@ -19,6 
+19,7 @@ package org.apache.pinot.core.startree.v2; import com.clearspring.analytics.stream.cardinality.HyperLogLog; +import java.util.ArrayList; import java.util.Random; import org.apache.pinot.segment.local.aggregator.DistinctCountHLLValueAggregator; import org.apache.pinot.segment.local.aggregator.ValueAggregator; @@ -31,7 +32,7 @@ public class DistinctCountHLLStarTreeV2Test extends BaseStarTreeV2Test getValueAggregator() { - return new DistinctCountHLLValueAggregator(); + return new DistinctCountHLLValueAggregator(new ArrayList<>()); } @Override diff --git a/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/PreAggregatedDistinctCountHLLStarTreeV2Test.java b/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/PreAggregatedDistinctCountHLLStarTreeV2Test.java index 354497b9cb75..398b5a816920 100644 --- a/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/PreAggregatedDistinctCountHLLStarTreeV2Test.java +++ b/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/PreAggregatedDistinctCountHLLStarTreeV2Test.java @@ -19,7 +19,10 @@ package org.apache.pinot.core.startree.v2; import com.clearspring.analytics.stream.cardinality.HyperLogLog; +import java.util.ArrayList; +import java.util.List; import java.util.Random; +import org.apache.pinot.common.request.context.ExpressionContext; import org.apache.pinot.core.common.ObjectSerDeUtils; import org.apache.pinot.segment.local.aggregator.DistinctCountHLLValueAggregator; import org.apache.pinot.segment.local.aggregator.ValueAggregator; @@ -34,7 +37,7 @@ public class PreAggregatedDistinctCountHLLStarTreeV2Test extends BaseStarTreeV2T @Override ValueAggregator getValueAggregator() { - return new DistinctCountHLLValueAggregator(); + return new DistinctCountHLLValueAggregator(new ArrayList<>()); } @Override diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java index 4850cc384f73..23bf329aa573 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java @@ -38,9 +38,6 @@ public class DistinctCountHLLValueAggregator implements ValueAggregator arguments) { // length 1 means we use the default _log2m of 8 if (arguments.size() <= 1) { From 2b422353b7ab20450953e957cb355455a4f54c0b Mon Sep 17 00:00:00 2001 From: Priyen Patel Date: Wed, 26 Jul 2023 17:36:16 -0400 Subject: [PATCH 06/21] Address review comments, cleanup, don't rely on schema --- .../DistinctCountHLLValueAggregator.java | 14 +++--- .../SumPrecisionValueAggregator.java | 15 +++---- .../local/aggregator/SumValueAggregator.java | 6 --- .../mutable/MutableSegmentImpl.java | 43 +++++++------------ .../FixedByteSVMutableForwardIndex.java | 8 ++-- .../index/forward/ForwardIndexType.java | 18 +++----- .../segment/local/utils/TableConfigUtils.java | 8 ++-- .../mutable/provider/MutableIndexContext.java | 16 ++++++- 8 files changed, 56 insertions(+), 72 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java index 23bf329aa573..3248b0ff088b 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java @@ -29,6 +29,7 @@ import org.apache.pinot.segment.spi.AggregationFunctionType; import org.apache.pinot.spi.data.FieldSpec.DataType; import org.apache.pinot.spi.utils.CommonConstants; +import 
org.apache.pinot.spi.utils.HyperLogLogUtils; public class DistinctCountHLLValueAggregator implements ValueAggregator { @@ -45,12 +46,10 @@ public DistinctCountHLLValueAggregator(List arguments) { } try { - String log2mLit = arguments.get(1).getLiteral().getStringValue(); - Preconditions.checkState(StringUtils.isNumeric(log2mLit), "log2m argument must be a numeric literal"); - - _log2m = Integer.parseInt(log2mLit); - _log2mByteSize = (new HyperLogLog(_log2m)).getBytes().length; - } catch (IOException e) { + _log2m = arguments.get(1).getLiteral().getIntValue(); + _log2mByteSize = HyperLogLogUtils.byteSize(_log2m); + _maxByteSize = _log2mByteSize; + } catch (Exception e) { throw new RuntimeException(e); } } @@ -121,9 +120,6 @@ public byte[] serializeAggregatedValue(HyperLogLog value) { @Override public HyperLogLog deserializeAggregatedValue(byte[] bytes) { - if (bytes == null || bytes.length == 0) { - return new HyperLogLog(_log2m); - } return CustomSerDeUtils.HYPER_LOG_LOG_SER_DE.deserialize(bytes); } } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumPrecisionValueAggregator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumPrecisionValueAggregator.java index 021e931872f6..d974f096f8ad 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumPrecisionValueAggregator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumPrecisionValueAggregator.java @@ -38,11 +38,11 @@ public SumPrecisionValueAggregator() { } /* - Aggregate with a optimal maximum precision in mind. Scale is always only 1 32-bit - int and the storing of the scale value does not affect the size of the big decimal. - Given this, we won't care about scale in terms of the aggregations. - During query time, the optional scale parameter can be provided, but during aggregation, - we don't limit it. + Aggregate with a optimal maximum precision in mind. 
Scale is always only 1 32-bit + int and the storing of the scale value does not affect the size of the big decimal. + Given this, we won't care about scale in terms of the aggregations. + During query time, the optional scale parameter can be provided, but during aggregation, + we don't limit it. */ public SumPrecisionValueAggregator(List arguments) { // length 1 means we don't have any caps on maximum precision nor do we have a fixed size then @@ -50,10 +50,7 @@ public SumPrecisionValueAggregator(List arguments) { return; } - String precision = arguments.get(1).getLiteral().getStringValue(); - Preconditions.checkState(StringUtils.isNumeric(precision), "precision must be a numeric literal"); - - _fixedSize = BigDecimalUtils.byteSizeForFixedPrecision(Integer.parseInt(precision)); + _fixedSize = BigDecimalUtils.byteSizeForFixedPrecision(arguments.get(1).getLiteral().getIntValue()); } @Override diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumValueAggregator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumValueAggregator.java index 23863b4f9523..429b464c5489 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumValueAggregator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumValueAggregator.java @@ -37,17 +37,11 @@ public DataType getAggregatedValueType() { @Override public Double getInitialAggregatedValue(Number rawValue) { - if (rawValue == null) { - return 0.0; - } return rawValue.doubleValue(); } @Override public Double applyRawValue(Double value, Number rawValue) { - if (rawValue == null) { - return value; - } return value + rawValue.doubleValue(); } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java index 
2317fcec5596..9b0e2b7b76ab 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java @@ -111,6 +111,7 @@ import org.slf4j.LoggerFactory; import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.pinot.spi.data.FieldSpec.DataType.BIG_DECIMAL; import static org.apache.pinot.spi.data.FieldSpec.DataType.BYTES; import static org.apache.pinot.spi.data.FieldSpec.DataType.STRING; @@ -257,15 +258,20 @@ public boolean isMutableSegment() { // Initialize for each column for (FieldSpec fieldSpec : _physicalFieldSpecs) { - String fieldSpecName = fieldSpec.getName(); - if (metricsAggregators.containsKey(fieldSpecName)) { - int maxLength = metricsAggregators.get(fieldSpecName).getRight().getMaxAggregatedValueByteSize(); - if (maxLength > 0) { - fieldSpec.setMaxLength(maxLength); - } + String column = fieldSpec.getName(); + + int fixedByteSize = -1; + boolean consumingAggregatedMetric = false; + if (metricsAggregators.containsKey(column)) { + fixedByteSize = metricsAggregators.get(column).getRight().getMaxAggregatedValueByteSize(); + consumingAggregatedMetric = true; } + // We consider fields whose values have a fixed size to be fixed width fields. 
+ FieldSpec.DataType storedType = fieldSpec.getDataType().getStoredType(); + boolean isFieldFixed = (storedType.isFixedWidth() || ( + (storedType.getStoredType() == BYTES || storedType.getStoredType() == BIG_DECIMAL) && fixedByteSize > 0 && consumingAggregatedMetric) + ); - String column = fieldSpec.getName(); FieldIndexConfigs indexConfigs = Optional.ofNullable(config.getIndexConfigByCol().get(column)) .orElse(FieldIndexConfigs.EMPTY); boolean isDictionary = !isNoDictionaryColumn(indexConfigs, fieldSpec, column); @@ -276,6 +282,7 @@ public boolean isMutableSegment() { .withEstimatedColSize(_statsHistory.getEstimatedAvgColSize(column)) .withAvgNumMultiValues(_statsHistory.getEstimatedAvgColSize(column)) .withConsumerDir(config.getConsumerDir() != null ? new File(config.getConsumerDir()) : null) + .withFixedLengthBytes(isFieldFixed ? fixedByteSize : -1) .build(); // Partition info @@ -298,7 +305,6 @@ public boolean isMutableSegment() { // Only support generating raw index on single-value columns that do not have inverted index while // consuming. After consumption completes and the segment is built, all single-value columns can have raw index - // Dictionary-encoded column MutableDictionary dictionary; if (isDictionary) { @@ -578,7 +584,6 @@ private void updateDictionary(GenericRow row) { if (dictionary == null) { continue; } - Object value = row.getValue(entry.getKey()); if (value == null) { recordIndexingError("DICTIONARY"); @@ -613,10 +618,7 @@ private void addNewRow(int docId, GenericRow row) { DataType dataType = fieldSpec.getDataType(); value = indexContainer._valueAggregator.getInitialAggregatedValue(value); - // aggregator value has to be numeric, but can be a different type of Number from the one expected on the column - // therefore we need to do some value changes here. 
- // TODO: Precision may change from one value to other, so we may need to study if this is actually what we want - // to do + // BIG_DECIMAL is actually stored as byte[] and hence can be supported here. switch (dataType.getStoredType()) { case INT: forwardIndex.add(((Number) value).intValue(), -1, docId); @@ -1218,7 +1220,7 @@ private static Map> fromAggregateMetrics(R Map> columnNameToAggregator = new HashMap<>(); for (String metricName : segmentConfig.getSchema().getMetricNames()) { columnNameToAggregator.put(metricName, Pair.of(metricName, - ValueAggregatorFactory.getValueAggregator(AggregationFunctionType.SUM, Collections.EMPTY_LIST))); + ValueAggregatorFactory.getValueAggregator(AggregationFunctionType.SUM, Collections.emptyList()))); } return columnNameToAggregator; } @@ -1236,19 +1238,6 @@ private static Map> fromAggregationConfig( FunctionContext functionContext = expressionContext.getFunction(); TableConfigUtils.validateIngestionAggregation(functionContext.getFunctionName()); - if ("distinctcounthll".equalsIgnoreCase(functionContext.getFunctionName())) { - Preconditions.checkState( - functionContext.getArguments().size() >= 1 && functionContext.getArguments().size() <= 2, - "distinctcounthll function can have max two arguments: %s", config); - } else if ("sumprecision".equalsIgnoreCase(functionContext.getFunctionName())) { - Preconditions.checkState( - functionContext.getArguments().size() >= 1 && functionContext.getArguments().size() <= 3, - "sumprecision function can have max three arguments: %s", config); - } else { - Preconditions.checkState(functionContext.getArguments().size() == 1, - "aggregation function can only have one argument: %s", config); - } - ExpressionContext argument = functionContext.getArguments().get(0); Preconditions.checkState(argument.getType() == ExpressionContext.Type.IDENTIFIER, "aggregator function argument must be a identifier: %s", config); diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/forward/FixedByteSVMutableForwardIndex.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/forward/FixedByteSVMutableForwardIndex.java index 75153a460c36..8e0982e0497b 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/forward/FixedByteSVMutableForwardIndex.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/forward/FixedByteSVMutableForwardIndex.java @@ -18,6 +18,7 @@ */ package org.apache.pinot.segment.local.realtime.impl.forward; +import com.google.common.base.Preconditions; import java.io.Closeable; import java.io.IOException; import java.math.BigDecimal; @@ -63,7 +64,7 @@ public class FixedByteSVMutableForwardIndex implements MutableForwardIndex { /** * @param storedType Data type of the values - * @param fixedLength Fixed length of values if known: only used for BYTES field and Hyperloglog values for now. + * @param fixedLength Fixed length of values if known: only used for BYTES field (HyperLogLog and BigDecimal storage) * @param numRowsPerChunk Number of rows to pack in one chunk before a new chunk is created. * @param memoryManager Memory manager to be used for allocating memory. * @param allocationContext Allocation allocationContext. 
@@ -73,6 +74,7 @@ public FixedByteSVMutableForwardIndex(boolean dictionaryEncoded, DataType stored _dictionaryEncoded = dictionaryEncoded; _storedType = storedType; if (storedType == DataType.BYTES || storedType == DataType.BIG_DECIMAL) { + Preconditions.checkState(fixedLength > 0, "Fixed length must be positive for BYTES and BIG_DECIMAL"); _valueSizeInBytes = fixedLength; } else { _valueSizeInBytes = storedType.size(); @@ -213,9 +215,7 @@ public byte[] getBytes(int docId) { @Override public void setBytes(int docId, byte[] value) { - if (value.length != _valueSizeInBytes) { - throw new IllegalArgumentException("Expected value size to be " + _valueSizeInBytes + " but was " + value.length); - } + Preconditions.checkArgument(value.length == _valueSizeInBytes, "Expected value size to be: %s but got: %s ", _valueSizeInBytes, value.length); addBufferIfNeeded(docId); getWriterForRow(docId).setBytes(docId, value); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java index 3b04faa2d8f4..ae202826ea8b 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java @@ -272,14 +272,14 @@ public MutableIndex createMutableIndex(MutableIndexContext context, ForwardIndex String column = context.getFieldSpec().getName(); String segmentName = context.getSegmentName(); FieldSpec.DataType storedType = context.getFieldSpec().getDataType().getStoredType(); - int maxLength = context.getFieldSpec().getMaxLength(); + int fixedLengthBytes = context.getFixedLengthBytes(); boolean isSingleValue = context.getFieldSpec().isSingleValueField(); if (!context.hasDictionary()) { if (isSingleValue) { String allocationContext = 
IndexUtil.buildAllocationContext(context.getSegmentName(), context.getFieldSpec().getName(), V1Constants.Indexes.RAW_SV_FORWARD_INDEX_FILE_EXTENSION); - if (isFieldFixed(context.getFieldSpec())) { - return new FixedByteSVMutableForwardIndex(false, storedType, maxLength, context.getCapacity(), + if (isFieldFixed(context.getFieldSpec(), fixedLengthBytes)) { + return new FixedByteSVMutableForwardIndex(false, storedType, fixedLengthBytes, context.getCapacity(), context.getMemoryManager(), allocationContext); } else { // RealtimeSegmentStatsHistory does not have the stats for no-dictionary columns from previous consuming @@ -321,16 +321,12 @@ public MutableIndex createMutableIndex(MutableIndexContext context, ForwardIndex } } - // We consider BYTES with a specific maxLength as being also fixed width. - // For BIG_DECIMAL, maxLength is maximum size of a serialized big decimal, - // it is determined using the value aggregator if present and does not need to be specified on the schema. - private boolean isFieldFixed(FieldSpec fieldSpec) { + // We consider fields whose values have a fixed size to be fixed width fields. 
+ private boolean isFieldFixed(FieldSpec fieldSpec, int fixedLengthBytes) { FieldSpec.DataType storedType = fieldSpec.getDataType().getStoredType(); - int maxLength = fieldSpec.getMaxLength(); - return (storedType.isFixedWidth() || ( - (storedType.getStoredType() == BYTES || storedType.getStoredType() == BIG_DECIMAL) && maxLength > 0 - && maxLength != FieldSpec.DEFAULT_MAX_LENGTH) + (storedType.getStoredType() == BYTES || storedType.getStoredType() == BIG_DECIMAL) && fixedLengthBytes > 0 + && fixedLengthBytes != FieldSpec.DEFAULT_MAX_LENGTH) ); } } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java index 18cfcada4c80..723f17a76197 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java @@ -389,8 +389,8 @@ public static void validateIngestionConfig(TableConfig tableConfig, @Nullable Sc List arguments = functionContext.getArguments(); - if (("distinctcounthll".equals(functionContext.getFunctionName())) - || ("sumprecision".equals(functionContext.getFunctionName()))) { + if ((DISTINCTCOUNTHLL.name().toLowerCase().equals(functionContext.getFunctionName())) + || (SUMPRECISION.name().toLowerCase().equals(functionContext.getFunctionName()))) { String destinationColumn = aggregationConfig.getColumnName(); FieldSpec destinationFieldSpec = schema.getFieldSpecFor(destinationColumn); @@ -400,7 +400,7 @@ public static void validateIngestionConfig(TableConfig tableConfig, @Nullable Sc } int maxLength = destinationFieldSpec.getMaxLength(); - if ("distinctcounthll".equals(functionContext.getFunctionName())) { + if (DISTINCTCOUNTHLL.name().toLowerCase().equals(functionContext.getFunctionName())) { Preconditions.checkState( functionContext.getArguments().size() >= 1 && 
functionContext.getArguments().size() <= 2, "distinctcounthll function can have max two arguments: %s", aggregationConfig); @@ -426,7 +426,7 @@ public static void validateIngestionConfig(TableConfig tableConfig, @Nullable Sc "destination field for distinctcounthll must have maxLength property set to " + expectedBytesForHLL + ", the size of a HLL object with log2m of " + log2m); - } else if ("sumprecision".equals(functionContext.getFunctionName())) { + } else if (SUMPRECISION.name().toLowerCase().equals(functionContext.getFunctionName())) { Preconditions.checkState(functionContext.getArguments().size() >= 2 && functionContext.getArguments().size() <= 3, "sumprecision must specify precision (required), scale (optional)", diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/mutable/provider/MutableIndexContext.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/mutable/provider/MutableIndexContext.java index f6ffdc128e51..ab152c0fd285 100644 --- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/mutable/provider/MutableIndexContext.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/mutable/provider/MutableIndexContext.java @@ -28,6 +28,7 @@ public class MutableIndexContext { private final int _capacity; private final FieldSpec _fieldSpec; + private final int _fixedLengthBytes; private final boolean _hasDictionary; private final boolean _offHeap; private final int _estimatedColSize; @@ -37,10 +38,11 @@ public class MutableIndexContext { private final PinotDataBufferMemoryManager _memoryManager; private final File _consumerDir; - public MutableIndexContext(FieldSpec fieldSpec, boolean hasDictionary, String segmentName, + public MutableIndexContext(FieldSpec fieldSpec, int fixedLengthBytes, boolean hasDictionary, String segmentName, PinotDataBufferMemoryManager memoryManager, int capacity, boolean offHeap, int estimatedColSize, int estimatedCardinality, int avgNumMultiValues, 
File consumerDir) { _fieldSpec = fieldSpec; + _fixedLengthBytes = fixedLengthBytes; _hasDictionary = hasDictionary; _segmentName = segmentName; _memoryManager = memoryManager; @@ -64,6 +66,10 @@ public FieldSpec getFieldSpec() { return _fieldSpec; } + public int getFixedLengthBytes() { + return _fixedLengthBytes; + } + public boolean hasDictionary() { return _hasDictionary; } @@ -99,6 +105,7 @@ public static Builder builder() { public static class Builder { private FieldSpec _fieldSpec; + private int _fixedLengthBytes; private String _segmentName; private boolean _hasDictionary = true; private boolean _offHeap = true; @@ -160,8 +167,13 @@ public Builder withConsumerDir(File consumerDir) { return this; } + public Builder withFixedLengthBytes(int fixedLengthBytes) { + _fixedLengthBytes = fixedLengthBytes; + return this; + } + public MutableIndexContext build() { - return new MutableIndexContext(Objects.requireNonNull(_fieldSpec), _hasDictionary, + return new MutableIndexContext(Objects.requireNonNull(_fieldSpec), _fixedLengthBytes, _hasDictionary, Objects.requireNonNull(_segmentName), Objects.requireNonNull(_memoryManager), _capacity, _offHeap, _estimatedColSize, _estimatedCardinality, _avgNumMultiValues, _consumerDir); } From 57377496f28156efac4c93479e85a3651dd85697 Mon Sep 17 00:00:00 2001 From: Priyen Patel Date: Wed, 26 Jul 2023 17:39:58 -0400 Subject: [PATCH 07/21] remove unused --- .../local/segment/index/forward/ForwardIndexType.java | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java index ae202826ea8b..60c5e2dead65 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java +++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java @@ -278,7 +278,7 @@ public MutableIndex createMutableIndex(MutableIndexContext context, ForwardIndex if (isSingleValue) { String allocationContext = IndexUtil.buildAllocationContext(context.getSegmentName(), context.getFieldSpec().getName(), V1Constants.Indexes.RAW_SV_FORWARD_INDEX_FILE_EXTENSION); - if (isFieldFixed(context.getFieldSpec(), fixedLengthBytes)) { + if (fixedLengthBytes > 0) { return new FixedByteSVMutableForwardIndex(false, storedType, fixedLengthBytes, context.getCapacity(), context.getMemoryManager(), allocationContext); } else { @@ -320,13 +320,4 @@ public MutableIndex createMutableIndex(MutableIndexContext context, ForwardIndex } } } - - // We consider fields whose values have a fixed size to be fixed width fields. - private boolean isFieldFixed(FieldSpec fieldSpec, int fixedLengthBytes) { - FieldSpec.DataType storedType = fieldSpec.getDataType().getStoredType(); - return (storedType.isFixedWidth() || ( - (storedType.getStoredType() == BYTES || storedType.getStoredType() == BIG_DECIMAL) && fixedLengthBytes > 0 - && fixedLengthBytes != FieldSpec.DEFAULT_MAX_LENGTH) - ); - } } From 529cb046c89e3511e2b1666ee0dbef9e29d80384 Mon Sep 17 00:00:00 2001 From: Priyen Patel Date: Wed, 26 Jul 2023 17:44:23 -0400 Subject: [PATCH 08/21] util --- .../v2/SumPrecisionStarTreeV2Test.java | 3 ++- .../SumPrecisionValueAggregator.java | 5 +--- pinot-spi/pom.xml | 5 ++++ .../pinot/spi/utils/HyperLogLogUtils.java | 23 +++++++++++++++++++ 4 files changed, 31 insertions(+), 5 deletions(-) create mode 100644 pinot-spi/src/main/java/org/apache/pinot/spi/utils/HyperLogLogUtils.java diff --git a/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/SumPrecisionStarTreeV2Test.java b/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/SumPrecisionStarTreeV2Test.java index f99aa3df1f0f..06390faafeb0 100644 --- 
a/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/SumPrecisionStarTreeV2Test.java +++ b/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/SumPrecisionStarTreeV2Test.java @@ -19,6 +19,7 @@ package org.apache.pinot.core.startree.v2; import java.math.BigDecimal; +import java.util.ArrayList; import java.util.Random; import org.apache.pinot.segment.local.aggregator.SumPrecisionValueAggregator; import org.apache.pinot.segment.local.aggregator.ValueAggregator; @@ -31,7 +32,7 @@ public class SumPrecisionStarTreeV2Test extends BaseStarTreeV2Test getValueAggregator() { - return new SumPrecisionValueAggregator(); + return new SumPrecisionValueAggregator(new ArrayList<>()); } @Override diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumPrecisionValueAggregator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumPrecisionValueAggregator.java index d974f096f8ad..5b6337baf39e 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumPrecisionValueAggregator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumPrecisionValueAggregator.java @@ -34,11 +34,8 @@ public class SumPrecisionValueAggregator implements ValueAggregatororg.reflections reflections + + com.clearspring.analytics + stream + 2.7.0 + diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/HyperLogLogUtils.java b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/HyperLogLogUtils.java new file mode 100644 index 000000000000..ddc98181c525 --- /dev/null +++ b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/HyperLogLogUtils.java @@ -0,0 +1,23 @@ +package org.apache.pinot.spi.utils; + +import com.clearspring.analytics.stream.cardinality.HyperLogLog; +import com.clearspring.analytics.stream.cardinality.RegisterSet; + +public class HyperLogLogUtils { + private HyperLogLogUtils() { + } + + /** + * Returns the byte size of the given HyperLogLog. 
+ */ + public static int byteSize(HyperLogLog value) { + return value.getBytes().length; + } + + /** + * Returns the byte size of hyperloglog of a given log2m. + */ + public static int byteSize(int log2m) { + return (RegisterSet.getSizeForCount(1 << log2m) + 2) * Integer.BYTES; + } +} \ No newline at end of file From f69845ceecdcc350b4e32b5e4f74a6439da20afc Mon Sep 17 00:00:00 2001 From: Priyen Patel Date: Wed, 26 Jul 2023 17:49:44 -0400 Subject: [PATCH 09/21] util and test --- .../pinot/spi/utils/HyperLogLogUtils.java | 20 ++++++++- .../pinot/spi/utils/HyperLogLogUtilsTest.java | 45 +++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 pinot-spi/src/test/java/org/apache/pinot/spi/utils/HyperLogLogUtilsTest.java diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/HyperLogLogUtils.java b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/HyperLogLogUtils.java index ddc98181c525..bde35e2cbff9 100644 --- a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/HyperLogLogUtils.java +++ b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/HyperLogLogUtils.java @@ -1,3 +1,21 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ package org.apache.pinot.spi.utils; import com.clearspring.analytics.stream.cardinality.HyperLogLog; @@ -11,7 +29,7 @@ private HyperLogLogUtils() { * Returns the byte size of the given HyperLogLog. */ public static int byteSize(HyperLogLog value) { - return value.getBytes().length; + return value.sizeof(); } /** diff --git a/pinot-spi/src/test/java/org/apache/pinot/spi/utils/HyperLogLogUtilsTest.java b/pinot-spi/src/test/java/org/apache/pinot/spi/utils/HyperLogLogUtilsTest.java new file mode 100644 index 000000000000..6dfc1a88ed77 --- /dev/null +++ b/pinot-spi/src/test/java/org/apache/pinot/spi/utils/HyperLogLogUtilsTest.java @@ -0,0 +1,45 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.pinot.spi.utils; + +import com.clearspring.analytics.stream.cardinality.RegisterSet; +import org.testng.annotations.Test; + +import static org.testng.Assert.assertEquals; + +public class HyperLogLogUtilsTest { + + @Test + public void testByteSizeLog2M() { + int[] testCases = { 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + for (int log2m : testCases) { + int expectedByteSize = (RegisterSet.getSizeForCount(1 << log2m) + 2) * Integer.BYTES; + assertEquals(HyperLogLogUtils.byteSize(log2m), expectedByteSize); + } + } + + @Test + public void testByteSizeWithHLLObject() { + int[] testCases = { 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + for (int log2m : testCases) { + int expectedByteSize = (RegisterSet.getSizeForCount(1 << log2m) + 2) * Integer.BYTES; + assertEquals(HyperLogLogUtils.byteSize(new com.clearspring.analytics.stream.cardinality.HyperLogLog(log2m)), expectedByteSize); + } + } +} From ee0fa5ac6a65e168a2ebe14a2c442dda954278e2 Mon Sep 17 00:00:00 2001 From: Priyen Patel Date: Wed, 26 Jul 2023 17:54:28 -0400 Subject: [PATCH 10/21] spotless apply --- .../v2/PreAggregatedDistinctCountHLLStarTreeV2Test.java | 2 -- .../local/aggregator/DistinctCountHLLValueAggregator.java | 3 --- .../segment/local/aggregator/SumPrecisionValueAggregator.java | 2 -- .../segment/local/segment/index/forward/ForwardIndexType.java | 2 -- 4 files changed, 9 deletions(-) diff --git a/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/PreAggregatedDistinctCountHLLStarTreeV2Test.java b/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/PreAggregatedDistinctCountHLLStarTreeV2Test.java index 398b5a816920..b9a05a62e528 100644 --- a/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/PreAggregatedDistinctCountHLLStarTreeV2Test.java +++ b/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/PreAggregatedDistinctCountHLLStarTreeV2Test.java @@ -20,9 +20,7 @@ import com.clearspring.analytics.stream.cardinality.HyperLogLog; import java.util.ArrayList; -import java.util.List; 
import java.util.Random; -import org.apache.pinot.common.request.context.ExpressionContext; import org.apache.pinot.core.common.ObjectSerDeUtils; import org.apache.pinot.segment.local.aggregator.DistinctCountHLLValueAggregator; import org.apache.pinot.segment.local.aggregator.ValueAggregator; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java index 3248b0ff088b..d30c5511d8d6 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java @@ -20,10 +20,7 @@ import com.clearspring.analytics.stream.cardinality.CardinalityMergeException; import com.clearspring.analytics.stream.cardinality.HyperLogLog; -import com.google.common.base.Preconditions; -import java.io.IOException; import java.util.List; -import org.apache.commons.lang3.StringUtils; import org.apache.pinot.common.request.context.ExpressionContext; import org.apache.pinot.segment.local.utils.CustomSerDeUtils; import org.apache.pinot.segment.spi.AggregationFunctionType; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumPrecisionValueAggregator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumPrecisionValueAggregator.java index 5b6337baf39e..6e7262611750 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumPrecisionValueAggregator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumPrecisionValueAggregator.java @@ -18,10 +18,8 @@ */ package org.apache.pinot.segment.local.aggregator; -import com.google.common.base.Preconditions; import java.math.BigDecimal; import java.util.List; -import 
org.apache.commons.lang3.StringUtils; import org.apache.pinot.common.request.context.ExpressionContext; import org.apache.pinot.segment.spi.AggregationFunctionType; import org.apache.pinot.spi.data.FieldSpec.DataType; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java index 60c5e2dead65..aa00538250d6 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java @@ -58,8 +58,6 @@ import org.apache.pinot.spi.data.FieldSpec; import org.apache.pinot.spi.data.Schema; -import static org.apache.pinot.spi.data.FieldSpec.DataType.BIG_DECIMAL; -import static org.apache.pinot.spi.data.FieldSpec.DataType.BYTES; public class ForwardIndexType From 38160a405e16c0ce75ccf9e564dfea27810090e6 Mon Sep 17 00:00:00 2001 From: Priyen Patel Date: Wed, 26 Jul 2023 17:57:18 -0400 Subject: [PATCH 11/21] lint fix --- .../local/indexsegment/mutable/MutableSegmentImpl.java | 9 +++++++-- .../impl/forward/FixedByteSVMutableForwardIndex.java | 6 +++++- .../org/apache/pinot/spi/utils/HyperLogLogUtils.java | 2 +- .../org/apache/pinot/spi/utils/HyperLogLogUtilsTest.java | 5 ++++- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java index 9b0e2b7b76ab..f9e99ed9f57f 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java @@ -268,8 +268,13 @@ 
public boolean isMutableSegment() { } // We consider fields whose values have a fixed size to be fixed width fields. FieldSpec.DataType storedType = fieldSpec.getDataType().getStoredType(); - boolean isFieldFixed = (storedType.isFixedWidth() || ( - (storedType.getStoredType() == BYTES || storedType.getStoredType() == BIG_DECIMAL) && fixedByteSize > 0 && consumingAggregatedMetric) + boolean isFieldFixed = ( + storedType.isFixedWidth() + || ( + (storedType.getStoredType() == BYTES + || storedType.getStoredType() == BIG_DECIMAL + ) && fixedByteSize > 0 && consumingAggregatedMetric + ) ); FieldIndexConfigs indexConfigs = Optional.ofNullable(config.getIndexConfigByCol().get(column)) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/forward/FixedByteSVMutableForwardIndex.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/forward/FixedByteSVMutableForwardIndex.java index 8e0982e0497b..9a42deb60332 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/forward/FixedByteSVMutableForwardIndex.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/forward/FixedByteSVMutableForwardIndex.java @@ -215,7 +215,11 @@ public byte[] getBytes(int docId) { @Override public void setBytes(int docId, byte[] value) { - Preconditions.checkArgument(value.length == _valueSizeInBytes, "Expected value size to be: %s but got: %s ", _valueSizeInBytes, value.length); + Preconditions.checkArgument( + value.length == _valueSizeInBytes, + "Expected value size to be: %s but got: %s ", + _valueSizeInBytes, value.length + ); addBufferIfNeeded(docId); getWriterForRow(docId).setBytes(docId, value); diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/HyperLogLogUtils.java b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/HyperLogLogUtils.java index bde35e2cbff9..9197cc8e3237 100644 --- 
a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/HyperLogLogUtils.java +++ b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/HyperLogLogUtils.java @@ -38,4 +38,4 @@ public static int byteSize(HyperLogLog value) { public static int byteSize(int log2m) { return (RegisterSet.getSizeForCount(1 << log2m) + 2) * Integer.BYTES; } -} \ No newline at end of file +} diff --git a/pinot-spi/src/test/java/org/apache/pinot/spi/utils/HyperLogLogUtilsTest.java b/pinot-spi/src/test/java/org/apache/pinot/spi/utils/HyperLogLogUtilsTest.java index 6dfc1a88ed77..2001aa7f49ab 100644 --- a/pinot-spi/src/test/java/org/apache/pinot/spi/utils/HyperLogLogUtilsTest.java +++ b/pinot-spi/src/test/java/org/apache/pinot/spi/utils/HyperLogLogUtilsTest.java @@ -39,7 +39,10 @@ public void testByteSizeWithHLLObject() { int[] testCases = { 1, 2, 3, 4, 5, 6, 7, 8, 9 }; for (int log2m : testCases) { int expectedByteSize = (RegisterSet.getSizeForCount(1 << log2m) + 2) * Integer.BYTES; - assertEquals(HyperLogLogUtils.byteSize(new com.clearspring.analytics.stream.cardinality.HyperLogLog(log2m)), expectedByteSize); + assertEquals( + HyperLogLogUtils.byteSize(new com.clearspring.analytics.stream.cardinality.HyperLogLog(log2m)), + expectedByteSize + ); } } } From 6c964e99d79ccc41e40ec41b1b0e0af928a7ae04 Mon Sep 17 00:00:00 2001 From: Priyen Patel Date: Wed, 26 Jul 2023 18:32:44 -0400 Subject: [PATCH 12/21] fix test --- .../pinot/spi/utils/HyperLogLogUtils.java | 9 +++++--- .../pinot/spi/utils/HyperLogLogUtilsTest.java | 23 +++++++++++-------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/HyperLogLogUtils.java b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/HyperLogLogUtils.java index 9197cc8e3237..e1f37664a5e1 100644 --- a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/HyperLogLogUtils.java +++ b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/HyperLogLogUtils.java @@ -20,6 +20,8 @@ import 
com.clearspring.analytics.stream.cardinality.HyperLogLog; import com.clearspring.analytics.stream.cardinality.RegisterSet; +import java.io.IOException; + public class HyperLogLogUtils { private HyperLogLogUtils() { @@ -28,14 +30,15 @@ private HyperLogLogUtils() { /** * Returns the byte size of the given HyperLogLog. */ - public static int byteSize(HyperLogLog value) { - return value.sizeof(); + public static int byteSize(HyperLogLog value) + throws IOException { + return value.getBytes().length; } /** * Returns the byte size of hyperloglog of a given log2m. */ public static int byteSize(int log2m) { - return (RegisterSet.getSizeForCount(1 << log2m) + 2) * Integer.BYTES; + return ((RegisterSet.getSizeForCount(1 << log2m) + 2) * Integer.BYTES); } } diff --git a/pinot-spi/src/test/java/org/apache/pinot/spi/utils/HyperLogLogUtilsTest.java b/pinot-spi/src/test/java/org/apache/pinot/spi/utils/HyperLogLogUtilsTest.java index 2001aa7f49ab..030b84d427d3 100644 --- a/pinot-spi/src/test/java/org/apache/pinot/spi/utils/HyperLogLogUtilsTest.java +++ b/pinot-spi/src/test/java/org/apache/pinot/spi/utils/HyperLogLogUtilsTest.java @@ -18,30 +18,33 @@ */ package org.apache.pinot.spi.utils; -import com.clearspring.analytics.stream.cardinality.RegisterSet; +import com.clearspring.analytics.stream.cardinality.HyperLogLog; +import java.io.IOException; import org.testng.annotations.Test; - import static org.testng.Assert.assertEquals; public class HyperLogLogUtilsTest { @Test - public void testByteSizeLog2M() { - int[] testCases = { 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + public void testByteSizeLog2M() + throws IOException { + int[] testCases = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }; for (int log2m : testCases) { - int expectedByteSize = (RegisterSet.getSizeForCount(1 << log2m) + 2) * Integer.BYTES; - assertEquals(HyperLogLogUtils.byteSize(log2m), expectedByteSize); + assertEquals( + HyperLogLogUtils.byteSize(log2m), + (new 
com.clearspring.analytics.stream.cardinality.HyperLogLog(log2m)).getBytes().length + ); } } @Test - public void testByteSizeWithHLLObject() { - int[] testCases = { 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + public void testByteSizeWithHLLObject() + throws IOException { + int[] testCases = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }; for (int log2m : testCases) { - int expectedByteSize = (RegisterSet.getSizeForCount(1 << log2m) + 2) * Integer.BYTES; assertEquals( HyperLogLogUtils.byteSize(new com.clearspring.analytics.stream.cardinality.HyperLogLog(log2m)), - expectedByteSize + (new HyperLogLog(log2m)).getBytes().length ); } } From 48053424e1c0729be37a106838463d7a8d305efa Mon Sep 17 00:00:00 2001 From: Priyen Patel Date: Wed, 26 Jul 2023 18:36:59 -0400 Subject: [PATCH 13/21] more linting --- .../java/org/apache/pinot/spi/utils/HyperLogLogUtilsTest.java | 1 + 1 file changed, 1 insertion(+) diff --git a/pinot-spi/src/test/java/org/apache/pinot/spi/utils/HyperLogLogUtilsTest.java b/pinot-spi/src/test/java/org/apache/pinot/spi/utils/HyperLogLogUtilsTest.java index 030b84d427d3..711b037e43cc 100644 --- a/pinot-spi/src/test/java/org/apache/pinot/spi/utils/HyperLogLogUtilsTest.java +++ b/pinot-spi/src/test/java/org/apache/pinot/spi/utils/HyperLogLogUtilsTest.java @@ -21,6 +21,7 @@ import com.clearspring.analytics.stream.cardinality.HyperLogLog; import java.io.IOException; import org.testng.annotations.Test; + import static org.testng.Assert.assertEquals; public class HyperLogLogUtilsTest { From 02df60ca15cc8d78bf4f5862aef40d32ab954089 Mon Sep 17 00:00:00 2001 From: Priyen Patel Date: Thu, 27 Jul 2023 11:17:15 -0400 Subject: [PATCH 14/21] canonical names, and hll doesnt need max length set --- .../mutable/MutableSegmentImpl.java | 2 +- .../segment/local/utils/TableConfigUtils.java | 41 ++++--------- ...leSegmentImplIngestionAggregationTest.java | 8 +-- .../local/utils/TableConfigUtilsTest.java | 59 +++++++++++-------- 4 files changed, 51 insertions(+), 59 deletions(-) diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java index f9e99ed9f57f..51205c499913 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java @@ -585,10 +585,10 @@ private void updateDictionary(GenericRow row) { for (Map.Entry entry : _indexContainerMap.entrySet()) { IndexContainer indexContainer = entry.getValue(); MutableDictionary dictionary = indexContainer._dictionary; - if (dictionary == null) { continue; } + Object value = row.getValue(entry.getKey()); if (value == null) { recordIndexingError("DICTIONARY"); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java index 723f17a76197..8139ec578c1d 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java @@ -18,12 +18,10 @@ */ package org.apache.pinot.segment.local.utils; -import com.clearspring.analytics.stream.cardinality.HyperLogLog; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.ImmutableSet; -import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -385,51 +383,38 @@ public static void validateIngestionConfig(TableConfig tableConfig, @Nullable Sc "aggregation function must be a function for: %s", aggregationConfig); FunctionContext functionContext = 
expressionContext.getFunction(); + AggregationFunctionType inputFunctionType = + AggregationFunctionType.getAggregationFunctionType(functionContext.getFunctionName()); validateIngestionAggregation(functionContext.getFunctionName()); List arguments = functionContext.getArguments(); - if ((DISTINCTCOUNTHLL.name().toLowerCase().equals(functionContext.getFunctionName())) - || (SUMPRECISION.name().toLowerCase().equals(functionContext.getFunctionName()))) { + if (inputFunctionType.equals(DISTINCTCOUNTHLL) || inputFunctionType + .equals(SUMPRECISION)) { String destinationColumn = aggregationConfig.getColumnName(); FieldSpec destinationFieldSpec = schema.getFieldSpecFor(destinationColumn); if (destinationFieldSpec == null) { throw new RuntimeException("couldn't find field config for " + destinationColumn - + ". Unable to validate aggregation config for " + functionContext.getFunctionName()); + + ". Unable to validate aggregation config for " + inputFunctionType); } int maxLength = destinationFieldSpec.getMaxLength(); - if (DISTINCTCOUNTHLL.name().toLowerCase().equals(functionContext.getFunctionName())) { + if (inputFunctionType.equals(DISTINCTCOUNTHLL)) { Preconditions.checkState( functionContext.getArguments().size() >= 1 && functionContext.getArguments().size() <= 2, - "distinctcounthll function can have max two arguments: %s", aggregationConfig); + "%s can have max two arguments: %s", inputFunctionType, aggregationConfig); - int log2m = CommonConstants.Helix.DEFAULT_HYPERLOGLOG_LOG2M; if (functionContext.getArguments().size() == 2) { Preconditions.checkState( StringUtils.isNumeric(functionContext.getArguments().get(1).getLiteral().getStringValue() ), - "distinctcounthll function second argument must be a number"); - - log2m = Integer.parseInt(functionContext.getArguments().get(1).getLiteral().getStringValue()); + "%s function second argument must be a number", inputFunctionType); } - - int expectedBytesForHLL; - try { - expectedBytesForHLL = (new 
HyperLogLog(log2m)).getBytes().length; - } catch (IOException e) { - throw new RuntimeException(e); - } - - Preconditions.checkState(maxLength == expectedBytesForHLL, - "destination field for distinctcounthll must have maxLength property set to " - + expectedBytesForHLL - + ", the size of a HLL object with log2m of " + log2m); - } else if (SUMPRECISION.name().toLowerCase().equals(functionContext.getFunctionName())) { + } else if (inputFunctionType.equals(SUMPRECISION)) { Preconditions.checkState(functionContext.getArguments().size() >= 2 && functionContext.getArguments().size() <= 3, - "sumprecision must specify precision (required), scale (optional)", + SUMPRECISION + " must specify precision (required), scale (optional)", aggregationConfig); } } else { @@ -441,12 +426,10 @@ public static void validateIngestionConfig(TableConfig tableConfig, @Nullable Sc Preconditions.checkState(argument.getType() == ExpressionContext.Type.IDENTIFIER, "aggregator function argument must be an identifier: %s", aggregationConfig); - AggregationFunctionType functionType = - AggregationFunctionType.getAggregationFunctionType(functionContext.getFunctionName()); - ValueAggregator valueAggregator = ValueAggregatorFactory.getValueAggregator(functionType, arguments); + ValueAggregator valueAggregator = ValueAggregatorFactory.getValueAggregator(inputFunctionType, arguments); if (schema != null && fieldSpec != null) { - if (functionType.equals(SUMPRECISION)) { + if (inputFunctionType.equals(SUMPRECISION)) { Preconditions.checkState(fieldSpec.getDataType() == DataType.BIG_DECIMAL, "SUMPRECISION consuming aggregation requires the schema data type to be BIG_DECIMAL"); } else { diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImplIngestionAggregationTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImplIngestionAggregationTest.java index 607174f7d88a..06a47e5af0bf 100644 
--- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImplIngestionAggregationTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImplIngestionAggregationTest.java @@ -188,9 +188,6 @@ public void testValuesAreNull() public void testDISTINCTCOUNTHLL() throws Exception { String m1 = "metric_DISTINCTCOUNTHLL"; Schema schema = getSchemaBuilder().addMetric(m1, FieldSpec.DataType.BYTES).build(); - // Size of hll object for when log2m is 12 - schema.getFieldSpecFor(m1).setMaxLength(2740); - MutableSegmentImpl mutableSegmentImpl = MutableSegmentImplTestUtils.createMutableSegmentImpl(schema, new HashSet<>(Arrays.asList(m1)), VAR_LENGTH_SET, INVERTED_INDEX_SET, @@ -213,9 +210,8 @@ public void testDISTINCTCOUNTHLL() throws Exception { /* Assert that the distinct count is within an error margin. We assert on the cardinality of the HLL in the docID - and the - HLL we made, but also on the cardinality of the HLL in the docID and the actual cardinality from the set of - integers. + and the HLL we made, but also on the cardinality of the HLL in the docID and the actual cardinality from + the set of integers. 
*/ GenericRow reuse = new GenericRow(); for (int docId = 0; docId < expected.size(); docId++) { diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java index a31e4a0cc1ca..4949646ab47b 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java @@ -495,8 +495,6 @@ public void ingestionAggregationConfigsTest() { } schema = new Schema.SchemaBuilder().setSchemaName(TABLE_NAME).addMetric("d1", FieldSpec.DataType.BYTES).build(); - schema.getFieldSpecFor("d1").setMaxLength(180); - // distinctcounthllmv is not supported, we expect this to not validate List aggregationConfigs = Arrays.asList(new AggregationConfig("d1", "DISTINCTCOUNTHLLMV(s1)")); ingestionConfig.setAggregationConfigs(aggregationConfigs); @@ -511,31 +509,27 @@ public void ingestionAggregationConfigsTest() { // expected } -// test the distinctcounthll validation for various log2m sizes - HashMap log2mToExpectedSize = new HashMap<>(); - log2mToExpectedSize.put(8, 180); - log2mToExpectedSize.put(12, 2740); - - for (Map.Entry entry : log2mToExpectedSize.entrySet()) { - // set the max length to the HLL expected bytes size - schema.getFieldSpecFor("d1").setMaxLength(entry.getValue()); + // distinctcounthll, expect that the function name in various forms (with and without underscores) still validates + aggregationConfigs = Arrays.asList( + new AggregationConfig("d1", "distinct_count_hll(s1)"), + new AggregationConfig("d1", "DISTINCTCOUNTHLL(s1)"), + new AggregationConfig("d1", "distinctcounthll(s1)"), + new AggregationConfig("d1", "DISTINCTCOUNT_HLL(s1)"), + new AggregationConfig("d1", "DISTINCT_COUNT_HLL(s1)") + ); - aggregationConfigs = Arrays.asList(new AggregationConfig("d1", "DISTINCTCOUNTHLL(s1, " + 
entry.getKey() + ")")); - ingestionConfig.setAggregationConfigs(aggregationConfigs); - tableConfig = - new TableConfigBuilder(TableType.REALTIME).setTableName("myTable_REALTIME").setTimeColumnName("timeColumn") - .setIngestionConfig(ingestionConfig).build(); + ingestionConfig.setAggregationConfigs(aggregationConfigs); + tableConfig = + new TableConfigBuilder(TableType.REALTIME).setTableName("myTable_REALTIME").setTimeColumnName("timeColumn") + .setIngestionConfig(ingestionConfig).build(); - try { - TableConfigUtils.validateIngestionConfig(tableConfig, schema); - } catch (IllegalStateException e) { - Assert.fail( - "The HLL object size based on the log2m doesn't match the destination BYTES field's maxLength property"); - } + try { + TableConfigUtils.validateIngestionConfig(tableConfig, schema); + } catch (IllegalStateException e) { + Assert.fail("Should not fail due to valid aggregation function"); } // distinctcounthll, expect not specified log2m argument to default to 8 - schema.getFieldSpecFor("d1").setMaxLength(180); aggregationConfigs = Arrays.asList(new AggregationConfig("d1", "DISTINCTCOUNTHLL(s1)")); ingestionConfig.setAggregationConfigs(aggregationConfigs); tableConfig = @@ -545,7 +539,7 @@ public void ingestionAggregationConfigsTest() { try { TableConfigUtils.validateIngestionConfig(tableConfig, schema); } catch (IllegalStateException e) { - Assert.fail("Log2m defaulted to 8 but BYTES schema maxLength is not the expected size of 180"); + Assert.fail("Log2m should have defaulted to 8"); } aggregationConfigs = Arrays.asList(new AggregationConfig("d1", "s1 + s2")); @@ -560,6 +554,25 @@ public void ingestionAggregationConfigsTest() { } catch (IllegalStateException e) { // expected } + + // sumprecision, expect that the function name in various forms (with and without underscores) still validates + aggregationConfigs = Arrays.asList( + new AggregationConfig("d1", "sum_precision(s1, 10, 32)"), + new AggregationConfig("d1", "SUM_PRECISION(s1), 1"), + new 
AggregationConfig("d1", "sumprecision(s1, 2)"), + new AggregationConfig("d1", "SUMPRECISION(s1),10") + ); + + ingestionConfig.setAggregationConfigs(aggregationConfigs); + tableConfig = + new TableConfigBuilder(TableType.REALTIME).setTableName("myTable_REALTIME").setTimeColumnName("timeColumn") + .setIngestionConfig(ingestionConfig).build(); + + try { + TableConfigUtils.validateIngestionConfig(tableConfig, schema); + } catch (IllegalStateException e) { + Assert.fail("Should not fail due to valid aggregation function"); + } } @Test From 69dfcc5c5ce7cf462480095717c430a89fa6b5a1 Mon Sep 17 00:00:00 2001 From: Priyen Patel Date: Thu, 27 Jul 2023 11:48:04 -0400 Subject: [PATCH 15/21] more tests for sum precision and make hll not rely on setting max length on schema --- .../local/utils/TableConfigUtilsTest.java | 73 ++++++++++++++++--- 1 file changed, 62 insertions(+), 11 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java index 4949646ab47b..e53c61e94459 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java @@ -510,12 +510,22 @@ public void ingestionAggregationConfigsTest() { } // distinctcounthll, expect that the function name in various forms (with and without underscores) still validates + schema = new Schema + .SchemaBuilder() + .setSchemaName(TABLE_NAME) + .addMetric("d1", FieldSpec.DataType.BYTES) + .addMetric("d2", FieldSpec.DataType.BYTES) + .addMetric("d3", FieldSpec.DataType.BYTES) + .addMetric("d4", FieldSpec.DataType.BYTES) + .addMetric("d5", FieldSpec.DataType.BYTES) + .build(); + aggregationConfigs = Arrays.asList( new AggregationConfig("d1", "distinct_count_hll(s1)"), - new AggregationConfig("d1", "DISTINCTCOUNTHLL(s1)"), - new 
AggregationConfig("d1", "distinctcounthll(s1)"), - new AggregationConfig("d1", "DISTINCTCOUNT_HLL(s1)"), - new AggregationConfig("d1", "DISTINCT_COUNT_HLL(s1)") + new AggregationConfig("d2", "DISTINCTCOUNTHLL(s1)"), + new AggregationConfig("d3", "distinctcounthll(s1)"), + new AggregationConfig("d4", "DISTINCTCOUNT_HLL(s1)"), + new AggregationConfig("d5", "DISTINCT_COUNT_HLL(s1)") ); ingestionConfig.setAggregationConfigs(aggregationConfigs); @@ -526,10 +536,16 @@ public void ingestionAggregationConfigsTest() { try { TableConfigUtils.validateIngestionConfig(tableConfig, schema); } catch (IllegalStateException e) { - Assert.fail("Should not fail due to valid aggregation function"); + Assert.fail("Should not fail due to valid aggregation function", e); } // distinctcounthll, expect not specified log2m argument to default to 8 + schema = new Schema + .SchemaBuilder() + .setSchemaName(TABLE_NAME) + .addMetric("d1", FieldSpec.DataType.BYTES) + .build(); + aggregationConfigs = Arrays.asList(new AggregationConfig("d1", "DISTINCTCOUNTHLL(s1)")); ingestionConfig.setAggregationConfigs(aggregationConfigs); tableConfig = @@ -539,7 +555,7 @@ public void ingestionAggregationConfigsTest() { try { TableConfigUtils.validateIngestionConfig(tableConfig, schema); } catch (IllegalStateException e) { - Assert.fail("Log2m should have defaulted to 8"); + Assert.fail("Log2m should have defaulted to 8", e); } aggregationConfigs = Arrays.asList(new AggregationConfig("d1", "s1 + s2")); @@ -551,16 +567,49 @@ public void ingestionAggregationConfigsTest() { try { TableConfigUtils.validateIngestionConfig(tableConfig, schema); Assert.fail("Should fail due to multiple arguments"); - } catch (IllegalStateException e) { + } catch (IllegalArgumentException e) { // expected } // sumprecision, expect that the function name in various forms (with and without underscores) still validates + schema = new Schema + .SchemaBuilder() + .setSchemaName(TABLE_NAME) + .addSingleValueDimension("s1", 
FieldSpec.DataType.BIG_DECIMAL) + .addMetric("d1", FieldSpec.DataType.BIG_DECIMAL) + .addMetric("d2", FieldSpec.DataType.BIG_DECIMAL) + .addMetric("d3", FieldSpec.DataType.BIG_DECIMAL) + .addMetric("d4", FieldSpec.DataType.BIG_DECIMAL) + .build(); + aggregationConfigs = Arrays.asList( new AggregationConfig("d1", "sum_precision(s1, 10, 32)"), - new AggregationConfig("d1", "SUM_PRECISION(s1), 1"), - new AggregationConfig("d1", "sumprecision(s1, 2)"), - new AggregationConfig("d1", "SUMPRECISION(s1),10") + new AggregationConfig("d2", "SUM_PRECISION(s1, 1)"), + new AggregationConfig("d3", "sumprecision(s1, 2)"), + new AggregationConfig("d4", "SUMPRECISION(s1, 10, 99)") + ); + + ingestionConfig.setAggregationConfigs(aggregationConfigs); + tableConfig = + new TableConfigBuilder(TableType.REALTIME).setTableName("myTable_REALTIME").setTimeColumnName("timeColumn") + .setIngestionConfig(ingestionConfig).build(); + + try { + TableConfigUtils.validateIngestionConfig(tableConfig, schema); + } catch (IllegalStateException e) { + Assert.fail("Should not fail due to valid aggregation function", e); + } + + // with too many arguments should fail + schema = new Schema + .SchemaBuilder() + .setSchemaName(TABLE_NAME) + .addSingleValueDimension("s1", FieldSpec.DataType.BIG_DECIMAL) + .addMetric("d1", FieldSpec.DataType.BIG_DECIMAL) + .build(); + + aggregationConfigs = Arrays.asList( + new AggregationConfig("d1", "sum_precision(s1, 10, 32, 99)") ); ingestionConfig.setAggregationConfigs(aggregationConfigs); @@ -570,8 +619,10 @@ public void ingestionAggregationConfigsTest() { try { TableConfigUtils.validateIngestionConfig(tableConfig, schema); + Assert.fail("Should have failed with too many arguments but didn't"); } catch (IllegalStateException e) { - Assert.fail("Should not fail due to valid aggregation function"); + Assert.assertTrue(e.getMessage().contains("SUMPRECISION must specify precision (required), scale (optional)" + + " 
[{\"columnName\":\"d1\",\"aggregationFunction\":\"sum_precision(s1, 10, 32, 99)\"}]")); } } From 5ff143405afd87298a7a7cd40edb71fde288b7a3 Mon Sep 17 00:00:00 2001 From: Priyen Patel Date: Thu, 27 Jul 2023 11:56:21 -0400 Subject: [PATCH 16/21] add test that assert null values cause throw --- ...leSegmentImplIngestionAggregationTest.java | 32 ++++++------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImplIngestionAggregationTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImplIngestionAggregationTest.java index 06a47e5af0bf..60932af9f91f 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImplIngestionAggregationTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImplIngestionAggregationTest.java @@ -140,7 +140,7 @@ public void testSameAggregationDifferentSrc() @Test - public void testValuesAreNull() + public void testValuesAreNullThrowsException() throws Exception { String m1 = "sum1"; @@ -158,27 +158,15 @@ public void testValuesAreNull() Random random = new Random(seed); StreamMessageMetadata defaultMetadata = new StreamMessageMetadata(System.currentTimeMillis(), null); - for (int i = 0; i < NUM_ROWS; i++) { - // Generate random int to prevent overflow - GenericRow row = getRow(random, 1); - row.putValue(METRIC, null); - mutableSegmentImpl.index(row, defaultMetadata); - - String key = buildKey(row); - keys.add(key); - } - - int numDocsIndexed = mutableSegmentImpl.getNumDocsIndexed(); - Assert.assertEquals(numDocsIndexed, keys.size()); - - // Assert that aggregation happened. 
- Assert.assertTrue(numDocsIndexed < NUM_ROWS); - - GenericRow reuse = new GenericRow(); - for (int docId = 0; docId < keys.size(); docId++) { - GenericRow row = mutableSegmentImpl.getRecord(docId, reuse); - String key = buildKey(row); - Assert.assertEquals(row.getValue(m1), 0, key); + // Generate random int to prevent overflow + GenericRow row = getRow(random, 1); + row.putValue(METRIC, null); + try { +mutableSegmentImpl.index(row, defaultMetadata); + Assert.fail(); + } catch (NullPointerException e) { + Assert.assertTrue(e.getMessage().contains("Cannot invoke \"java.lang.Number.doubleValue()\" because" + + " \"rawValue\" is null")); } mutableSegmentImpl.destroy(); From b77a505a1c6f7891fe52983308b23179b4720aca Mon Sep 17 00:00:00 2001 From: Priyen Patel Date: Thu, 27 Jul 2023 12:12:19 -0400 Subject: [PATCH 17/21] pinot-spi-jdk8 pom updated --- .../pinot-segment-spi-jdk8/pom.xml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pinot-connectors/prestodb-pinot-dependencies/pinot-segment-spi-jdk8/pom.xml b/pinot-connectors/prestodb-pinot-dependencies/pinot-segment-spi-jdk8/pom.xml index 732b9017fca0..a7253f7dca20 100644 --- a/pinot-connectors/prestodb-pinot-dependencies/pinot-segment-spi-jdk8/pom.xml +++ b/pinot-connectors/prestodb-pinot-dependencies/pinot-segment-spi-jdk8/pom.xml @@ -72,6 +72,11 @@ org.apache.calcite calcite-core + + com.clearspring.analytics + stream + 2.7.0 + From 4cb2a963c60b89bcd86a1f5d5b0c0ec82e1b19d4 Mon Sep 17 00:00:00 2001 From: Priyen Patel Date: Thu, 27 Jul 2023 12:15:23 -0400 Subject: [PATCH 18/21] fix wrong pom addition --- .../pinot-segment-spi-jdk8/pom.xml | 5 ----- .../prestodb-pinot-dependencies/pinot-spi-jdk8/pom.xml | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pinot-connectors/prestodb-pinot-dependencies/pinot-segment-spi-jdk8/pom.xml b/pinot-connectors/prestodb-pinot-dependencies/pinot-segment-spi-jdk8/pom.xml index a7253f7dca20..732b9017fca0 100644 --- 
a/pinot-connectors/prestodb-pinot-dependencies/pinot-segment-spi-jdk8/pom.xml +++ b/pinot-connectors/prestodb-pinot-dependencies/pinot-segment-spi-jdk8/pom.xml @@ -72,11 +72,6 @@ org.apache.calcite calcite-core - - com.clearspring.analytics - stream - 2.7.0 - diff --git a/pinot-connectors/prestodb-pinot-dependencies/pinot-spi-jdk8/pom.xml b/pinot-connectors/prestodb-pinot-dependencies/pinot-spi-jdk8/pom.xml index 6ac4e5514abb..5d29caec14b5 100644 --- a/pinot-connectors/prestodb-pinot-dependencies/pinot-spi-jdk8/pom.xml +++ b/pinot-connectors/prestodb-pinot-dependencies/pinot-spi-jdk8/pom.xml @@ -129,5 +129,10 @@ org.reflections reflections + + com.clearspring.analytics + stream + 2.7.0 + From e9cbedcb9a30d61a0f65388c8ae6a9d6fc72d1c4 Mon Sep 17 00:00:00 2001 From: Priyen Patel Date: Thu, 27 Jul 2023 12:48:35 -0400 Subject: [PATCH 19/21] fix assertion --- .../mutable/MutableSegmentImplIngestionAggregationTest.java | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImplIngestionAggregationTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImplIngestionAggregationTest.java index 60932af9f91f..53e25f47017a 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImplIngestionAggregationTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImplIngestionAggregationTest.java @@ -151,9 +151,6 @@ public void testValuesAreNullThrowsException() VAR_LENGTH_SET, INVERTED_INDEX_SET, Arrays.asList(new AggregationConfig(m1, "SUM(metric)"))); - - Set keys = new HashSet<>(); - long seed = 2; Random random = new Random(seed); StreamMessageMetadata defaultMetadata = new StreamMessageMetadata(System.currentTimeMillis(), null); @@ -165,8 +162,7 @@ public void testValuesAreNullThrowsException() 
mutableSegmentImpl.index(row, defaultMetadata); Assert.fail(); } catch (NullPointerException e) { - Assert.assertTrue(e.getMessage().contains("Cannot invoke \"java.lang.Number.doubleValue()\" because" - + " \"rawValue\" is null")); + // expected } mutableSegmentImpl.destroy(); From 3d29fbd846c68487410f8fbcd22d32516b207654 Mon Sep 17 00:00:00 2001 From: Priyen Patel Date: Thu, 27 Jul 2023 16:45:55 -0400 Subject: [PATCH 20/21] dont accidentally spawn a var type index --- .../indexsegment/mutable/MutableSegmentImpl.java | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java index 51205c499913..3eecbedfb912 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java @@ -268,14 +268,9 @@ public boolean isMutableSegment() { } // We consider fields whose values have a fixed size to be fixed width fields. FieldSpec.DataType storedType = fieldSpec.getDataType().getStoredType(); - boolean isFieldFixed = ( - storedType.isFixedWidth() - || ( - (storedType.getStoredType() == BYTES - || storedType.getStoredType() == BIG_DECIMAL - ) && fixedByteSize > 0 && consumingAggregatedMetric - ) - ); + if (storedType.isFixedWidth()) { + fixedByteSize = storedType.size(); + } FieldIndexConfigs indexConfigs = Optional.ofNullable(config.getIndexConfigByCol().get(column)) .orElse(FieldIndexConfigs.EMPTY); @@ -287,7 +282,7 @@ public boolean isMutableSegment() { .withEstimatedColSize(_statsHistory.getEstimatedAvgColSize(column)) .withAvgNumMultiValues(_statsHistory.getEstimatedAvgColSize(column)) .withConsumerDir(config.getConsumerDir() != null ? 
new File(config.getConsumerDir()) : null) - .withFixedLengthBytes(isFieldFixed ? fixedByteSize : -1) + .withFixedLengthBytes(fixedByteSize) .build(); // Partition info From 5e75cf1435a5d993e7e43dcd04628b81b804c782 Mon Sep 17 00:00:00 2001 From: "Xiaotian (Jackie) Jiang" Date: Fri, 28 Jul 2023 00:16:49 -0700 Subject: [PATCH 21/21] Some fixes and cleanup - Avoid HLL dependency in pinot-spi - Simplify byte size computation for HLL - Fix the table config validation logic. We allow type mismatch as long as it can be converted (e.g. numbers are compatible) - Cleanup and reformat --- .../v2/DistinctCountHLLStarTreeV2Test.java | 4 +- ...regatedDistinctCountHLLStarTreeV2Test.java | 4 +- .../v2/SumPrecisionStarTreeV2Test.java | 4 +- .../DistinctCountHLLValueAggregator.java | 25 ++-- .../SumPrecisionValueAggregator.java | 45 ++++---- .../mutable/MutableSegmentImpl.java | 59 +++++----- .../FixedByteSVMutableForwardIndex.java | 13 +-- .../index/forward/ForwardIndexType.java | 11 +- .../v2/builder/BaseSingleTreeBuilder.java | 3 +- .../local}/utils/HyperLogLogUtils.java | 14 +-- .../segment/local/utils/TableConfigUtils.java | 109 +++++++----------- ...leSegmentImplIngestionAggregationTest.java | 85 ++++++-------- .../FixedByteSVMutableForwardIndexTest.java | 6 +- .../local}/utils/HyperLogLogUtilsTest.java | 28 ++--- .../local/utils/TableConfigUtilsTest.java | 71 +++--------- pinot-spi/pom.xml | 5 - .../org/apache/pinot/spi/data/FieldSpec.java | 2 +- 17 files changed, 194 insertions(+), 294 deletions(-) rename {pinot-spi/src/main/java/org/apache/pinot/spi => pinot-segment-local/src/main/java/org/apache/pinot/segment/local}/utils/HyperLogLogUtils.java (73%) rename {pinot-spi/src/test/java/org/apache/pinot/spi => pinot-segment-local/src/test/java/org/apache/pinot/segment/local}/utils/HyperLogLogUtilsTest.java (60%) diff --git a/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/DistinctCountHLLStarTreeV2Test.java 
b/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/DistinctCountHLLStarTreeV2Test.java index 32ad90434556..9bc103ffd247 100644 --- a/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/DistinctCountHLLStarTreeV2Test.java +++ b/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/DistinctCountHLLStarTreeV2Test.java @@ -19,7 +19,7 @@ package org.apache.pinot.core.startree.v2; import com.clearspring.analytics.stream.cardinality.HyperLogLog; -import java.util.ArrayList; +import java.util.Collections; import java.util.Random; import org.apache.pinot.segment.local.aggregator.DistinctCountHLLValueAggregator; import org.apache.pinot.segment.local.aggregator.ValueAggregator; @@ -32,7 +32,7 @@ public class DistinctCountHLLStarTreeV2Test extends BaseStarTreeV2Test getValueAggregator() { - return new DistinctCountHLLValueAggregator(new ArrayList<>()); + return new DistinctCountHLLValueAggregator(Collections.emptyList()); } @Override diff --git a/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/PreAggregatedDistinctCountHLLStarTreeV2Test.java b/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/PreAggregatedDistinctCountHLLStarTreeV2Test.java index b9a05a62e528..531348c77157 100644 --- a/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/PreAggregatedDistinctCountHLLStarTreeV2Test.java +++ b/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/PreAggregatedDistinctCountHLLStarTreeV2Test.java @@ -19,7 +19,7 @@ package org.apache.pinot.core.startree.v2; import com.clearspring.analytics.stream.cardinality.HyperLogLog; -import java.util.ArrayList; +import java.util.Collections; import java.util.Random; import org.apache.pinot.core.common.ObjectSerDeUtils; import org.apache.pinot.segment.local.aggregator.DistinctCountHLLValueAggregator; @@ -35,7 +35,7 @@ public class PreAggregatedDistinctCountHLLStarTreeV2Test extends BaseStarTreeV2T @Override ValueAggregator getValueAggregator() { - return new 
DistinctCountHLLValueAggregator(new ArrayList<>()); + return new DistinctCountHLLValueAggregator(Collections.emptyList()); } @Override diff --git a/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/SumPrecisionStarTreeV2Test.java b/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/SumPrecisionStarTreeV2Test.java index 06390faafeb0..87c74aa59152 100644 --- a/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/SumPrecisionStarTreeV2Test.java +++ b/pinot-core/src/test/java/org/apache/pinot/core/startree/v2/SumPrecisionStarTreeV2Test.java @@ -19,7 +19,7 @@ package org.apache.pinot.core.startree.v2; import java.math.BigDecimal; -import java.util.ArrayList; +import java.util.Collections; import java.util.Random; import org.apache.pinot.segment.local.aggregator.SumPrecisionValueAggregator; import org.apache.pinot.segment.local.aggregator.ValueAggregator; @@ -32,7 +32,7 @@ public class SumPrecisionStarTreeV2Test extends BaseStarTreeV2Test getValueAggregator() { - return new SumPrecisionValueAggregator(new ArrayList<>()); + return new SumPrecisionValueAggregator(Collections.emptyList()); } @Override diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java index d30c5511d8d6..ada9d5a170cc 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/DistinctCountHLLValueAggregator.java @@ -23,31 +23,26 @@ import java.util.List; import org.apache.pinot.common.request.context.ExpressionContext; import org.apache.pinot.segment.local.utils.CustomSerDeUtils; +import org.apache.pinot.segment.local.utils.HyperLogLogUtils; import org.apache.pinot.segment.spi.AggregationFunctionType; import org.apache.pinot.spi.data.FieldSpec.DataType; 
import org.apache.pinot.spi.utils.CommonConstants; -import org.apache.pinot.spi.utils.HyperLogLogUtils; public class DistinctCountHLLValueAggregator implements ValueAggregator { public static final DataType AGGREGATED_VALUE_TYPE = DataType.BYTES; - private int _log2m = CommonConstants.Helix.DEFAULT_HYPERLOGLOG_LOG2M; - private int _log2mByteSize = 180; + + private final int _log2m; + // Byte size won't change once we get the initial aggregated value private int _maxByteSize; public DistinctCountHLLValueAggregator(List arguments) { // length 1 means we use the default _log2m of 8 if (arguments.size() <= 1) { - return; - } - - try { + _log2m = CommonConstants.Helix.DEFAULT_HYPERLOGLOG_LOG2M; + } else { _log2m = arguments.get(1).getLiteral().getIntValue(); - _log2mByteSize = HyperLogLogUtils.byteSize(_log2m); - _maxByteSize = _log2mByteSize; - } catch (Exception e) { - throw new RuntimeException(e); } } @@ -67,11 +62,11 @@ public HyperLogLog getInitialAggregatedValue(Object rawValue) { if (rawValue instanceof byte[]) { byte[] bytes = (byte[]) rawValue; initialValue = deserializeAggregatedValue(bytes); - _maxByteSize = Math.max(_maxByteSize, bytes.length); + _maxByteSize = bytes.length; } else { initialValue = new HyperLogLog(_log2m); initialValue.offer(rawValue); - _maxByteSize = Math.max(_maxByteSize, _log2mByteSize); + _maxByteSize = HyperLogLogUtils.byteSize(initialValue); } return initialValue; } @@ -107,7 +102,9 @@ public HyperLogLog cloneAggregatedValue(HyperLogLog value) { @Override public int getMaxAggregatedValueByteSize() { - return _maxByteSize; + // NOTE: For aggregated metrics, initial aggregated value might have not been generated. Returns the byte size + // based on log2m. + return _maxByteSize > 0 ? 
_maxByteSize : HyperLogLogUtils.byteSize(_log2m); } @Override diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumPrecisionValueAggregator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumPrecisionValueAggregator.java index 6e7262611750..0257057e24f2 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumPrecisionValueAggregator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/aggregator/SumPrecisionValueAggregator.java @@ -18,6 +18,7 @@ */ package org.apache.pinot.segment.local.aggregator; +import com.google.common.base.Preconditions; import java.math.BigDecimal; import java.util.List; import org.apache.pinot.common.request.context.ExpressionContext; @@ -29,23 +30,21 @@ public class SumPrecisionValueAggregator implements ValueAggregator { public static final DataType AGGREGATED_VALUE_TYPE = DataType.BYTES; + private final int _fixedSize; + private int _maxByteSize; - private int _fixedSize = -1; - - /* - Aggregate with an optimal maximum precision in mind. Scale is always only 1 32-bit - int and the storing of the scale value does not affect the size of the big decimal. - Given this, we won't care about scale in terms of the aggregations. - During query time, the optional scale parameter can be provided, but during aggregation, - we don't limit it. + + /** + * Optional second argument is the maximum precision. Scale is always stored as 2 bytes. During query time, the + * optional scale parameter can be provided, but during ingestion, we don't limit it. 
*/ public SumPrecisionValueAggregator(List arguments) { // length 1 means we don't have any caps on maximum precision nor do we have a fixed size then if (arguments.size() <= 1) { - return; + _fixedSize = -1; + } else { + _fixedSize = BigDecimalUtils.byteSizeForFixedPrecision(arguments.get(1).getLiteral().getIntValue()); } - - _fixedSize = BigDecimalUtils.byteSizeForFixedPrecision(arguments.get(1).getLiteral().getIntValue()); } @Override @@ -61,14 +60,18 @@ public DataType getAggregatedValueType() { @Override public BigDecimal getInitialAggregatedValue(Object rawValue) { BigDecimal initialValue = toBigDecimal(rawValue); - _maxByteSize = Math.max(_maxByteSize, BigDecimalUtils.byteSize(initialValue)); + if (_fixedSize < 0) { + _maxByteSize = Math.max(_maxByteSize, BigDecimalUtils.byteSize(initialValue)); + } return initialValue; } @Override public BigDecimal applyRawValue(BigDecimal value, Object rawValue) { value = value.add(toBigDecimal(rawValue)); - _maxByteSize = Math.max(_maxByteSize, BigDecimalUtils.byteSize(value)); + if (_fixedSize < 0) { + _maxByteSize = Math.max(_maxByteSize, BigDecimalUtils.byteSize(value)); + } return value; } @@ -85,7 +88,9 @@ private static BigDecimal toBigDecimal(Object rawValue) { @Override public BigDecimal applyAggregatedValue(BigDecimal value, BigDecimal aggregatedValue) { value = value.add(aggregatedValue); - _maxByteSize = Math.max(_maxByteSize, BigDecimalUtils.byteSize(value)); + if (_fixedSize < 0) { + _maxByteSize = Math.max(_maxByteSize, BigDecimalUtils.byteSize(value)); + } return value; } @@ -97,18 +102,14 @@ public BigDecimal cloneAggregatedValue(BigDecimal value) { @Override public int getMaxAggregatedValueByteSize() { - if (_fixedSize > 0) { - return _fixedSize; - } - return _maxByteSize; + Preconditions.checkState(_fixedSize > 0 || _maxByteSize > 0, + "Unknown max aggregated value byte size, please provide maximum precision as the second argument"); + return _fixedSize > 0 ? 
_fixedSize : _maxByteSize; } @Override public byte[] serializeAggregatedValue(BigDecimal value) { - if (_fixedSize > 0) { - return BigDecimalUtils.serializeWithSize(value, _fixedSize); - } - return BigDecimalUtils.serialize(value); + return _fixedSize > 0 ? BigDecimalUtils.serializeWithSize(value, _fixedSize) : BigDecimalUtils.serialize(value); } @Override diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java index 3eecbedfb912..f798b62783f9 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImpl.java @@ -111,7 +111,6 @@ import org.slf4j.LoggerFactory; import static java.nio.charset.StandardCharsets.UTF_8; -import static org.apache.pinot.spi.data.FieldSpec.DataType.BIG_DECIMAL; import static org.apache.pinot.spi.data.FieldSpec.DataType.BYTES; import static org.apache.pinot.spi.data.FieldSpec.DataType.STRING; @@ -252,28 +251,27 @@ public boolean isMutableSegment() { metricsAggregators = getMetricsAggregators(config); } - Set specialIndexes = Sets.newHashSet( - StandardIndexes.dictionary(), // dictionaries implement other contract - StandardIndexes.nullValueVector()); // null value vector implement other contract + Set specialIndexes = + Sets.newHashSet(StandardIndexes.dictionary(), // dictionaries implement other contract + StandardIndexes.nullValueVector()); // null value vector implement other contract // Initialize for each column for (FieldSpec fieldSpec : _physicalFieldSpecs) { String column = fieldSpec.getName(); int fixedByteSize = -1; - boolean consumingAggregatedMetric = false; - if (metricsAggregators.containsKey(column)) { - fixedByteSize = 
metricsAggregators.get(column).getRight().getMaxAggregatedValueByteSize(); - consumingAggregatedMetric = true; - } - // We consider fields whose values have a fixed size to be fixed width fields. - FieldSpec.DataType storedType = fieldSpec.getDataType().getStoredType(); - if (storedType.isFixedWidth()) { - fixedByteSize = storedType.size(); + DataType dataType = fieldSpec.getDataType(); + DataType storedType = dataType.getStoredType(); + if (!storedType.isFixedWidth()) { + // For aggregated metrics, we need to store values with fixed byte size so that in-place replacement is possible + Pair aggregatorPair = metricsAggregators.get(column); + if (aggregatorPair != null) { + fixedByteSize = aggregatorPair.getRight().getMaxAggregatedValueByteSize(); + } } - FieldIndexConfigs indexConfigs = Optional.ofNullable(config.getIndexConfigByCol().get(column)) - .orElse(FieldIndexConfigs.EMPTY); + FieldIndexConfigs indexConfigs = + Optional.ofNullable(config.getIndexConfigByCol().get(column)).orElse(FieldIndexConfigs.EMPTY); boolean isDictionary = !isNoDictionaryColumn(indexConfigs, fieldSpec, column); MutableIndexContext context = MutableIndexContext.builder().withFieldSpec(fieldSpec).withMemoryManager(_memoryManager) @@ -282,8 +280,7 @@ public boolean isMutableSegment() { .withEstimatedColSize(_statsHistory.getEstimatedAvgColSize(column)) .withAvgNumMultiValues(_statsHistory.getEstimatedAvgColSize(column)) .withConsumerDir(config.getConsumerDir() != null ? 
new File(config.getConsumerDir()) : null) - .withFixedLengthBytes(fixedByteSize) - .build(); + .withFixedLengthBytes(fixedByteSize).build(); // Partition info PartitionFunction partitionFunction = null; @@ -321,8 +318,7 @@ public boolean isMutableSegment() { dictionary = null; if (!fieldSpec.isSingleValueField()) { // Raw MV columns - DataType dataType = fieldSpec.getDataType().getStoredType(); - switch (dataType) { + switch (storedType) { case INT: case LONG: case FLOAT: @@ -431,15 +427,14 @@ private boolean isNoDictionaryColumn(FieldIndexConfigs indexConfigs, FieldSpec f // if the column is part of noDictionary set from table config if (fieldSpec instanceof DimensionFieldSpec && isAggregateMetricsEnabled() && (dataType == STRING || dataType == BYTES)) { - _logger.info( - "Aggregate metrics is enabled. Will create dictionary in consuming segment for column {} of type {}", + _logger.info("Aggregate metrics is enabled. Will create dictionary in consuming segment for column {} of type {}", column, dataType); return false; } // So don't create dictionary if the column (1) is member of noDictionary, and (2) is single-value or multi-value // with a fixed-width field, and (3) doesn't have an inverted index - return (fieldSpec.isSingleValueField() || fieldSpec.getDataType().isFixedWidth()) - && indexConfigs.getConfig(StandardIndexes.inverted()).isDisabled(); + return (fieldSpec.isSingleValueField() || fieldSpec.getDataType().isFixedWidth()) && indexConfigs.getConfig( + StandardIndexes.inverted()).isDisabled(); } public SegmentPartitionConfig getSegmentPartitionConfig() { @@ -813,9 +808,9 @@ private void aggregateMetrics(GenericRow row, int docId) { } break; case BYTES: - Object currentValue = valueAggregator.deserializeAggregatedValue(forwardIndex.getBytes(docId)); - Object newVal = valueAggregator.applyRawValue(currentValue, value); - forwardIndex.setBytes(docId, valueAggregator.serializeAggregatedValue(newVal)); + Object oldValue = 
valueAggregator.deserializeAggregatedValue(forwardIndex.getBytes(docId)); + Object newValue = valueAggregator.applyRawValue(oldValue, value); + forwardIndex.setBytes(docId, valueAggregator.serializeAggregatedValue(newValue)); break; default: throw new UnsupportedOperationException( @@ -1236,15 +1231,13 @@ private static Map> fromAggregationConfig( Preconditions.checkState(expressionContext.getType() == ExpressionContext.Type.FUNCTION, "aggregation function must be a function: %s", config); FunctionContext functionContext = expressionContext.getFunction(); - TableConfigUtils.validateIngestionAggregation(functionContext.getFunctionName()); - + AggregationFunctionType functionType = + AggregationFunctionType.getAggregationFunctionType(functionContext.getFunctionName()); + TableConfigUtils.validateIngestionAggregation(functionType); ExpressionContext argument = functionContext.getArguments().get(0); Preconditions.checkState(argument.getType() == ExpressionContext.Type.IDENTIFIER, "aggregator function argument must be a identifier: %s", config); - AggregationFunctionType functionType = - AggregationFunctionType.getAggregationFunctionType(functionContext.getFunctionName()); - columnNameToAggregator.put(config.getColumnName(), Pair.of(argument.getIdentifier(), ValueAggregatorFactory.getValueAggregator(functionType, functionContext.getArguments()))); } @@ -1310,8 +1303,8 @@ public void close() { closeable.close(); } } catch (Exception e) { - _logger.error("Caught exception while closing {} index for column: {}, continuing with error", - indexType, column, e); + _logger.error("Caught exception while closing {} index for column: {}, continuing with error", indexType, + column, e); } }; diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/forward/FixedByteSVMutableForwardIndex.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/forward/FixedByteSVMutableForwardIndex.java index 9a42deb60332..529e80ef5ba6 
100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/forward/FixedByteSVMutableForwardIndex.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/forward/FixedByteSVMutableForwardIndex.java @@ -73,8 +73,8 @@ public FixedByteSVMutableForwardIndex(boolean dictionaryEncoded, DataType stored int numRowsPerChunk, PinotDataBufferMemoryManager memoryManager, String allocationContext) { _dictionaryEncoded = dictionaryEncoded; _storedType = storedType; - if (storedType == DataType.BYTES || storedType == DataType.BIG_DECIMAL) { - Preconditions.checkState(fixedLength > 0, "Fixed length must be positive for BYTES and BIG_DECIMAL"); + if (!storedType.isFixedWidth()) { + Preconditions.checkState(fixedLength > 0, "Fixed length must be provided for type: %s", storedType); _valueSizeInBytes = fixedLength; } else { _valueSizeInBytes = storedType.size(); @@ -87,7 +87,7 @@ public FixedByteSVMutableForwardIndex(boolean dictionaryEncoded, DataType stored } public FixedByteSVMutableForwardIndex(boolean dictionaryEncoded, DataType valueType, int numRowsPerChunk, - PinotDataBufferMemoryManager memoryManager, String allocationContext) { + PinotDataBufferMemoryManager memoryManager, String allocationContext) { this(dictionaryEncoded, valueType, -1, numRowsPerChunk, memoryManager, allocationContext); } @@ -215,11 +215,8 @@ public byte[] getBytes(int docId) { @Override public void setBytes(int docId, byte[] value) { - Preconditions.checkArgument( - value.length == _valueSizeInBytes, - "Expected value size to be: %s but got: %s ", - _valueSizeInBytes, value.length - ); + Preconditions.checkArgument(value.length == _valueSizeInBytes, "Expected value size to be: %s but got: %s ", + _valueSizeInBytes, value.length); addBufferIfNeeded(docId); getWriterForRow(docId).setBytes(docId, value); diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java index aa00538250d6..365d553b5886 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java @@ -59,9 +59,7 @@ import org.apache.pinot.spi.data.Schema; - -public class ForwardIndexType - extends AbstractIndexType +public class ForwardIndexType extends AbstractIndexType implements ConfigurableFromIndexLoadingConfig { public static final String INDEX_DISPLAY_NAME = "forward"; // For multi-valued column, forward-index. @@ -274,9 +272,10 @@ public MutableIndex createMutableIndex(MutableIndexContext context, ForwardIndex boolean isSingleValue = context.getFieldSpec().isSingleValueField(); if (!context.hasDictionary()) { if (isSingleValue) { - String allocationContext = IndexUtil.buildAllocationContext(context.getSegmentName(), - context.getFieldSpec().getName(), V1Constants.Indexes.RAW_SV_FORWARD_INDEX_FILE_EXTENSION); - if (fixedLengthBytes > 0) { + String allocationContext = + IndexUtil.buildAllocationContext(context.getSegmentName(), context.getFieldSpec().getName(), + V1Constants.Indexes.RAW_SV_FORWARD_INDEX_FILE_EXTENSION); + if (storedType.isFixedWidth() || fixedLengthBytes > 0) { return new FixedByteSVMutableForwardIndex(false, storedType, fixedLengthBytes, context.getCapacity(), context.getMemoryManager(), allocationContext); } else { diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/startree/v2/builder/BaseSingleTreeBuilder.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/startree/v2/builder/BaseSingleTreeBuilder.java index da74a873f7ca..54da52c07040 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/startree/v2/builder/BaseSingleTreeBuilder.java +++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/startree/v2/builder/BaseSingleTreeBuilder.java @@ -143,8 +143,9 @@ static class Record { for (AggregationFunctionColumnPair functionColumnPair : functionColumnPairs) { _metrics[index] = functionColumnPair.toColumnName(); _functionColumnPairs[index] = functionColumnPair; + // TODO: Allow extra arguments in star-tree (e.g. log2m, precision) _valueAggregators[index] = - ValueAggregatorFactory.getValueAggregator(functionColumnPair.getFunctionType(), Collections.EMPTY_LIST); + ValueAggregatorFactory.getValueAggregator(functionColumnPair.getFunctionType(), Collections.emptyList()); // Ignore the column for COUNT aggregation function if (_valueAggregators[index].getAggregationType() != AggregationFunctionType.COUNT) { diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/HyperLogLogUtils.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/HyperLogLogUtils.java similarity index 73% rename from pinot-spi/src/main/java/org/apache/pinot/spi/utils/HyperLogLogUtils.java rename to pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/HyperLogLogUtils.java index e1f37664a5e1..15058e9639c1 100644 --- a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/HyperLogLogUtils.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/HyperLogLogUtils.java @@ -16,11 +16,10 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.pinot.spi.utils; +package org.apache.pinot.segment.local.utils; import com.clearspring.analytics.stream.cardinality.HyperLogLog; import com.clearspring.analytics.stream.cardinality.RegisterSet; -import java.io.IOException; public class HyperLogLogUtils { @@ -30,15 +29,16 @@ private HyperLogLogUtils() { /** * Returns the byte size of the given HyperLogLog. 
*/ - public static int byteSize(HyperLogLog value) - throws IOException { - return value.getBytes().length; + public static int byteSize(HyperLogLog value) { + // 8 bytes header (log2m & register set size) & register set data + return value.sizeof() + 2 * Integer.BYTES; } /** - * Returns the byte size of hyperloglog of a given log2m. + * Returns the byte size of HyperLogLog of a given log2m. */ public static int byteSize(int log2m) { - return ((RegisterSet.getSizeForCount(1 << log2m) + 2) * Integer.BYTES); + // 8 bytes header (log2m & register set size) & register set data + return (RegisterSet.getSizeForCount(1 << log2m) + 2) * Integer.BYTES; } } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java index 8139ec578c1d..0557d592195a 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/TableConfigUtils.java @@ -42,8 +42,6 @@ import org.apache.pinot.common.request.context.RequestContextUtils; import org.apache.pinot.common.tier.TierFactory; import org.apache.pinot.common.utils.config.TagNameUtils; -import org.apache.pinot.segment.local.aggregator.ValueAggregator; -import org.apache.pinot.segment.local.aggregator.ValueAggregatorFactory; import org.apache.pinot.segment.local.function.FunctionEvaluator; import org.apache.pinot.segment.local.function.FunctionEvaluatorFactory; import org.apache.pinot.segment.local.segment.creator.impl.inv.BitSlicedRangeIndexCreator; @@ -354,13 +352,13 @@ public static void validateIngestionConfig(TableConfig tableConfig, @Nullable Sc Set aggregationColumns = new HashSet<>(); for (AggregationConfig aggregationConfig : aggregationConfigs) { String columnName = aggregationConfig.getColumnName(); - FieldSpec fieldSpec = null; String aggregationFunction = 
aggregationConfig.getAggregationFunction(); if (columnName == null || aggregationFunction == null) { throw new IllegalStateException( "columnName/aggregationFunction cannot be null in AggregationConfig " + aggregationConfig); } + FieldSpec fieldSpec = null; if (schema != null) { fieldSpec = schema.getFieldSpecFor(columnName); Preconditions.checkState(fieldSpec != null, "The destination column '" + columnName @@ -383,63 +381,52 @@ public static void validateIngestionConfig(TableConfig tableConfig, @Nullable Sc "aggregation function must be a function for: %s", aggregationConfig); FunctionContext functionContext = expressionContext.getFunction(); - AggregationFunctionType inputFunctionType = + AggregationFunctionType functionType = AggregationFunctionType.getAggregationFunctionType(functionContext.getFunctionName()); - validateIngestionAggregation(functionContext.getFunctionName()); + validateIngestionAggregation(functionType); List arguments = functionContext.getArguments(); - - if (inputFunctionType.equals(DISTINCTCOUNTHLL) || inputFunctionType - .equals(SUMPRECISION)) { - String destinationColumn = aggregationConfig.getColumnName(); - FieldSpec destinationFieldSpec = schema.getFieldSpecFor(destinationColumn); - - if (destinationFieldSpec == null) { - throw new RuntimeException("couldn't find field config for " + destinationColumn - + ". 
Unable to validate aggregation config for " + inputFunctionType); + int numArguments = arguments.size(); + if (functionType == DISTINCTCOUNTHLL) { + Preconditions.checkState(numArguments >= 1 && numArguments <= 2, + "DISTINCT_COUNT_HLL can have at most two arguments: %s", aggregationConfig); + if (numArguments == 2) { + ExpressionContext secondArgument = arguments.get(1); + Preconditions.checkState(secondArgument.getType() == ExpressionContext.Type.LITERAL, + "Second argument of DISTINCT_COUNT_HLL must be literal: %s", aggregationConfig); + String literal = secondArgument.getLiteral().getStringValue(); + Preconditions.checkState(StringUtils.isNumeric(literal), + "Second argument of DISTINCT_COUNT_HLL must be a number: %s", aggregationConfig); } - int maxLength = destinationFieldSpec.getMaxLength(); - - if (inputFunctionType.equals(DISTINCTCOUNTHLL)) { - Preconditions.checkState( - functionContext.getArguments().size() >= 1 && functionContext.getArguments().size() <= 2, - "%s can have max two arguments: %s", inputFunctionType, aggregationConfig); - - if (functionContext.getArguments().size() == 2) { - Preconditions.checkState( - StringUtils.isNumeric(functionContext.getArguments().get(1).getLiteral().getStringValue() - ), - "%s function second argument must be a number", inputFunctionType); - } - } else if (inputFunctionType.equals(SUMPRECISION)) { - Preconditions.checkState(functionContext.getArguments().size() >= 2 - && functionContext.getArguments().size() <= 3, - SUMPRECISION + " must specify precision (required), scale (optional)", - aggregationConfig); + if (fieldSpec != null) { + DataType dataType = fieldSpec.getDataType(); + Preconditions.checkState(dataType == DataType.BYTES, + "Result type for DISTINCT_COUNT_HLL must be BYTES: %s", aggregationConfig); } - } else { - Preconditions.checkState(functionContext.getArguments().size() == 1, - "aggregation function can only have one argument: %s", aggregationConfig); - } - - ExpressionContext argument = 
functionContext.getArguments().get(0); - Preconditions.checkState(argument.getType() == ExpressionContext.Type.IDENTIFIER, - "aggregator function argument must be an identifier: %s", aggregationConfig); - - ValueAggregator valueAggregator = ValueAggregatorFactory.getValueAggregator(inputFunctionType, arguments); - - if (schema != null && fieldSpec != null) { - if (inputFunctionType.equals(SUMPRECISION)) { - Preconditions.checkState(fieldSpec.getDataType() == DataType.BIG_DECIMAL, - "SUMPRECISION consuming aggregation requires the schema data type to be BIG_DECIMAL"); - } else { - Preconditions.checkState(valueAggregator.getAggregatedValueType() == fieldSpec.getDataType(), - "aggregator function datatype (%s) must be the same as the schema datatype (%s) for %s", - valueAggregator.getAggregatedValueType(), fieldSpec.getDataType(), fieldSpec.getName()); + } else if (functionType == SUMPRECISION) { + Preconditions.checkState(numArguments >= 2 && numArguments <= 3, + "SUM_PRECISION must specify precision (required), scale (optional): %s", aggregationConfig); + ExpressionContext secondArgument = arguments.get(1); + Preconditions.checkState(secondArgument.getType() == ExpressionContext.Type.LITERAL, + "Second argument of SUM_PRECISION must be literal: %s", aggregationConfig); + String literal = secondArgument.getLiteral().getStringValue(); + Preconditions.checkState(StringUtils.isNumeric(literal), + "Second argument of SUM_PRECISION must be a number: %s", aggregationConfig); + if (fieldSpec != null) { + DataType dataType = fieldSpec.getDataType(); + Preconditions.checkState(dataType == DataType.BIG_DECIMAL || dataType == DataType.BYTES, + "Result type for SUM_PRECISION must be BIG_DECIMAL or BYTES: %s", aggregationConfig); + } + } else { + Preconditions.checkState(numArguments == 1, "%s can only have one argument: %s", functionType, + aggregationConfig); + } + ExpressionContext firstArgument = arguments.get(0); + Preconditions.checkState(firstArgument.getType() == 
ExpressionContext.Type.IDENTIFIER, + "First argument of aggregation function: %s must be identifier, got: %s", functionType, + firstArgument.getType()); - aggregationSourceColumns.add(argument.getIdentifier()); + aggregationSourceColumns.add(firstArgument.getIdentifier()); } if (schema != null) { Preconditions.checkState(new HashSet<>(schema.getMetricNames()).equals(aggregationColumns), @@ -507,21 +494,9 @@ public static void validateIngestionConfig(TableConfig tableConfig, @Nullable Sc } } - /** - * Currently only, ValueAggregators with fixed width types are allowed, so MIN, MAX, SUM, and COUNT. The reason - * is that only the {@link org.apache.pinot.segment.local.realtime.impl.forward.FixedByteSVMutableForwardIndex} - * supports random inserts and lookups. The - * {@link org.apache.pinot.segment.local.realtime.impl.forward.VarByteSVMutableForwardIndex only supports - * sequential inserts. - */ - public static void validateIngestionAggregation(String name) { - for (AggregationFunctionType functionType : SUPPORTED_INGESTION_AGGREGATIONS) { - if (functionType.getName().toLowerCase().equals(name)) { - return; - } - } - throw new IllegalStateException( - String.format("aggregation function %s must be one of %s", name, SUPPORTED_INGESTION_AGGREGATIONS)); + public static void validateIngestionAggregation(AggregationFunctionType functionType) { + Preconditions.checkState(SUPPORTED_INGESTION_AGGREGATIONS.contains(functionType), + "Aggregation function: %s must be one of: %s", functionType, SUPPORTED_INGESTION_AGGREGATIONS); } @VisibleForTesting diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImplIngestionAggregationTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImplIngestionAggregationTest.java index 53e25f47017a..5e048520c0ed 100644 --- 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImplIngestionAggregationTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/indexsegment/mutable/MutableSegmentImplIngestionAggregationTest.java @@ -55,7 +55,7 @@ public class MutableSegmentImplIngestionAggregationTest { private static final String KEY_SEPARATOR = "\t\t"; private static final int NUM_ROWS = 10001; - private static final Schema.SchemaBuilder getSchemaBuilder() { + private static Schema.SchemaBuilder getSchemaBuilder() { return new Schema.SchemaBuilder().setSchemaName("testSchema") .addSingleValueDimension(DIMENSION_1, FieldSpec.DataType.INT) .addSingleValueDimension(DIMENSION_2, FieldSpec.DataType.STRING) @@ -138,18 +138,15 @@ public void testSameAggregationDifferentSrc() mutableSegmentImpl.destroy(); } - @Test public void testValuesAreNullThrowsException() throws Exception { String m1 = "sum1"; - Schema schema = - getSchemaBuilder().addMetric(m1, FieldSpec.DataType.INT).build(); + Schema schema = getSchemaBuilder().addMetric(m1, FieldSpec.DataType.INT).build(); MutableSegmentImpl mutableSegmentImpl = - MutableSegmentImplTestUtils.createMutableSegmentImpl(schema, new HashSet<>(Arrays.asList(m1)), - VAR_LENGTH_SET, INVERTED_INDEX_SET, - Arrays.asList(new AggregationConfig(m1, "SUM(metric)"))); + MutableSegmentImplTestUtils.createMutableSegmentImpl(schema, Collections.singleton(m1), VAR_LENGTH_SET, + INVERTED_INDEX_SET, Collections.singletonList(new AggregationConfig(m1, "SUM(metric)"))); long seed = 2; Random random = new Random(seed); @@ -159,7 +156,7 @@ public void testValuesAreNullThrowsException() GenericRow row = getRow(random, 1); row.putValue(METRIC, null); try { -mutableSegmentImpl.index(row, defaultMetadata); + mutableSegmentImpl.index(row, defaultMetadata); Assert.fail(); } catch (NullPointerException e) { // expected @@ -169,13 +166,14 @@ public void testValuesAreNullThrowsException() } @Test - public void 
testDISTINCTCOUNTHLL() throws Exception { - String m1 = "metric_DISTINCTCOUNTHLL"; + public void testDistinctCountHLL() + throws Exception { + String m1 = "hll1"; + Schema schema = getSchemaBuilder().addMetric(m1, FieldSpec.DataType.BYTES).build(); MutableSegmentImpl mutableSegmentImpl = - MutableSegmentImplTestUtils.createMutableSegmentImpl(schema, new HashSet<>(Arrays.asList(m1)), - VAR_LENGTH_SET, INVERTED_INDEX_SET, - Arrays.asList(new AggregationConfig(m1, "DISTINCTCOUNTHLL(metric, 12)"))); + MutableSegmentImplTestUtils.createMutableSegmentImpl(schema, Collections.singleton(m1), VAR_LENGTH_SET, + INVERTED_INDEX_SET, Collections.singletonList(new AggregationConfig(m1, "distinctCountHLL(metric, 12)"))); Map expected = new HashMap<>(); List metrics = addRowsDistinctCountHLL(998, mutableSegmentImpl); @@ -183,20 +181,15 @@ public void testDISTINCTCOUNTHLL() throws Exception { expected.put(metric.getKey(), (HLLTestData) metric.getValue()); } - List arguments = - List.of( - ExpressionContext.forIdentifier("distinctcounthll"), - ExpressionContext.forLiteralContext(Literal.stringValue("12")) - ); + List arguments = Arrays.asList(ExpressionContext.forIdentifier("metric"), + ExpressionContext.forLiteralContext(Literal.stringValue("12"))); DistinctCountHLLValueAggregator valueAggregator = new DistinctCountHLLValueAggregator(arguments); Set integers = new HashSet<>(); - /* - Assert that the distinct count is within an error margin. We assert on the cardinality of the HLL in the docID - and the HLL we made, but also on the cardinality of the HLL in the docID and the actual cardinality from - the set of integers. - */ + // Assert that the distinct count is within an error margin. We assert on the cardinality of the HLL in the docID + // and the HLL we made, but also on the cardinality of the HLL in the docID and the actual cardinality from the set + // of integers. 
GenericRow reuse = new GenericRow(); for (int docId = 0; docId < expected.size(); docId++) { GenericRow row = mutableSegmentImpl.getRecord(docId, reuse); @@ -216,9 +209,7 @@ public void testDISTINCTCOUNTHLL() throws Exception { + "the integers."); } - /* - Assert that the aggregated HyperLogLog is also within the error margin - */ + // Assert that the aggregated HyperLogLog is also within the error margin HyperLogLog togetherHLL = new HyperLogLog(12); expected.forEach((key, value) -> { try { @@ -236,7 +227,7 @@ public void testDISTINCTCOUNTHLL() throws Exception { } @Test - public void testCOUNT() + public void testCount() throws Exception { String m1 = "count1"; String m2 = "count2"; @@ -250,8 +241,7 @@ public void testCOUNT() Map expectedCount = new HashMap<>(); for (List metrics : addRows(3, mutableSegmentImpl)) { - expectedCount.put(metrics.get(0).getKey(), - expectedCount.getOrDefault(metrics.get(0).getKey(), 0L) + 1L); + expectedCount.put(metrics.get(0).getKey(), expectedCount.getOrDefault(metrics.get(0).getKey(), 0L) + 1L); } GenericRow reuse = new GenericRow(); @@ -361,7 +351,7 @@ private List addRowsDistinctCountHLL(long seed, MutableSegmentImpl mutab return metrics; } - private List addRowsSUMPRECISION(long seed, MutableSegmentImpl mutableSegmentImpl) + private List addRowsSumPrecision(long seed, MutableSegmentImpl mutableSegmentImpl) throws Exception { List metrics = new ArrayList<>(); @@ -371,9 +361,8 @@ private List addRowsSUMPRECISION(long seed, MutableSegmentImpl mutableSe HashMap bdMap = new HashMap<>(); HashMap> bdIndividualMap = new HashMap<>(); - Integer rows = 50000; - - for (int i = 0; i < (rows); i++) { + int numRows = 50000; + for (int i = 0; i < numRows; i++) { GenericRow row = getRow(random, 1); String key = buildKey(row); @@ -411,7 +400,6 @@ private List> addRows(long seed, MutableSegmentImpl mutableSegmentI List> metrics = new ArrayList<>(); Set keys = new HashSet<>(); - Random random = new Random(seed); StreamMessageMetadata 
defaultMetadata = new StreamMessageMetadata(System.currentTimeMillis(), new GenericRow()); @@ -440,25 +428,24 @@ private List> addRows(long seed, MutableSegmentImpl mutableSegmentI } @Test - public void testSUMPRECISION() throws Exception { - String m1 = "metric_SUMPRECISION"; + public void testSumPrecision() + throws Exception { + String m1 = "sumPrecision1"; Schema schema = getSchemaBuilder().addMetric(m1, FieldSpec.DataType.BIG_DECIMAL).build(); MutableSegmentImpl mutableSegmentImpl = - MutableSegmentImplTestUtils.createMutableSegmentImpl(schema, new HashSet<>(Arrays.asList(m1)), - VAR_LENGTH_SET, INVERTED_INDEX_SET, - // Setting precision to 38 in the arguments for SUMPRECISION - Arrays.asList(new AggregationConfig(m1, "SUMPRECISION(metric, 38)"))); + MutableSegmentImplTestUtils.createMutableSegmentImpl(schema, Collections.singleton(m1), VAR_LENGTH_SET, + INVERTED_INDEX_SET, + // Setting precision to 38 in the arguments for SUM_PRECISION + Collections.singletonList(new AggregationConfig(m1, "SUM_PRECISION(metric, 38)"))); Map expected = new HashMap<>(); - List metrics = addRowsSUMPRECISION(998, mutableSegmentImpl); + List metrics = addRowsSumPrecision(998, mutableSegmentImpl); for (Metric metric : metrics) { expected.put(metric.getKey(), (BigDecimal) metric.getValue()); } - /* - Assert that the aggregated values are correct - */ + // Assert that the aggregated values are correct GenericRow reuse = new GenericRow(); for (int docId = 0; docId < expected.size(); docId++) { GenericRow row = mutableSegmentImpl.getRecord(docId, reuse); @@ -467,15 +454,14 @@ public void testSUMPRECISION() throws Exception { BigDecimal expectedBigDecimal = expected.get(key); BigDecimal actualBigDecimal = (BigDecimal) row.getValue(m1); - Assert.assertEquals(actualBigDecimal, expectedBigDecimal, - "The aggregated SUM does not match the expected SUM"); + Assert.assertEquals(actualBigDecimal, expectedBigDecimal, "The aggregated SUM does not match the expected SUM"); } 
mutableSegmentImpl.destroy(); } @Test public void testBigDecimalTooBig() { - String m1 = "metric_SUMPRECISION"; + String m1 = "sumPrecision1"; Schema schema = getSchemaBuilder().addMetric(m1, FieldSpec.DataType.BIG_DECIMAL).build(); int seed = 1; @@ -483,9 +469,8 @@ public void testBigDecimalTooBig() { StreamMessageMetadata defaultMetadata = new StreamMessageMetadata(System.currentTimeMillis(), null); MutableSegmentImpl mutableSegmentImpl = - MutableSegmentImplTestUtils.createMutableSegmentImpl(schema, new HashSet<>(Arrays.asList(m1)), - VAR_LENGTH_SET, INVERTED_INDEX_SET, - Arrays.asList(new AggregationConfig(m1, "SUMPRECISION(metric, 3)"))); + MutableSegmentImplTestUtils.createMutableSegmentImpl(schema, Collections.singleton(m1), VAR_LENGTH_SET, + INVERTED_INDEX_SET, Collections.singletonList(new AggregationConfig(m1, "SUM_PRECISION(metric, 3)"))); // Make a big decimal larger than 3 precision and try to index it BigDecimal large = BigDecimalUtils.generateMaximumNumberWithPrecision(5); diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/forward/mutable/FixedByteSVMutableForwardIndexTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/forward/mutable/FixedByteSVMutableForwardIndexTest.java index aa12c792ace6..6b17e906a55a 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/forward/mutable/FixedByteSVMutableForwardIndexTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/forward/mutable/FixedByteSVMutableForwardIndexTest.java @@ -115,7 +115,8 @@ private void testDictId(final Random random, final int rows, final int div) } @Test - public void testBytes() throws IOException { + public void testBytes() + throws IOException { int rows = 10; Random r = new Random(); final long seed = r.nextLong(); @@ -125,7 +126,8 @@ public void testBytes() throws IOException { } } - private void testBytes(final Random random, 
final int rows, final int div) throws IOException { + private void testBytes(final Random random, final int rows, final int div) + throws IOException { int hllLog2M12Size = 2740; int log2m = 12; diff --git a/pinot-spi/src/test/java/org/apache/pinot/spi/utils/HyperLogLogUtilsTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/HyperLogLogUtilsTest.java similarity index 60% rename from pinot-spi/src/test/java/org/apache/pinot/spi/utils/HyperLogLogUtilsTest.java rename to pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/HyperLogLogUtilsTest.java index 711b037e43cc..44f7b741e580 100644 --- a/pinot-spi/src/test/java/org/apache/pinot/spi/utils/HyperLogLogUtilsTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/HyperLogLogUtilsTest.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.pinot.spi.utils; +package org.apache.pinot.segment.local.utils; import com.clearspring.analytics.stream.cardinality.HyperLogLog; import java.io.IOException; @@ -24,29 +24,17 @@ import static org.testng.Assert.assertEquals; -public class HyperLogLogUtilsTest { - @Test - public void testByteSizeLog2M() - throws IOException { - int[] testCases = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }; - for (int log2m : testCases) { - assertEquals( - HyperLogLogUtils.byteSize(log2m), - (new com.clearspring.analytics.stream.cardinality.HyperLogLog(log2m)).getBytes().length - ); - } - } +public class HyperLogLogUtilsTest { @Test - public void testByteSizeWithHLLObject() + public void testByteSize() throws IOException { - int[] testCases = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }; - for (int log2m : testCases) { - assertEquals( - HyperLogLogUtils.byteSize(new com.clearspring.analytics.stream.cardinality.HyperLogLog(log2m)), - (new HyperLogLog(log2m)).getBytes().length - ); + for (int log2m = 0; log2m < 16; log2m++) { + HyperLogLog hll = new 
HyperLogLog(log2m); + int expectedByteSize = hll.getBytes().length; + assertEquals(HyperLogLogUtils.byteSize(log2m), expectedByteSize); + assertEquals(HyperLogLogUtils.byteSize(hll), expectedByteSize); } } } diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java index e53c61e94459..4ec499d58be3 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/TableConfigUtilsTest.java @@ -510,23 +510,13 @@ public void ingestionAggregationConfigsTest() { } // distinctcounthll, expect that the function name in various forms (with and without underscores) still validates - schema = new Schema - .SchemaBuilder() - .setSchemaName(TABLE_NAME) - .addMetric("d1", FieldSpec.DataType.BYTES) - .addMetric("d2", FieldSpec.DataType.BYTES) - .addMetric("d3", FieldSpec.DataType.BYTES) - .addMetric("d4", FieldSpec.DataType.BYTES) - .addMetric("d5", FieldSpec.DataType.BYTES) - .build(); + schema = new Schema.SchemaBuilder().setSchemaName(TABLE_NAME).addMetric("d1", FieldSpec.DataType.BYTES) + .addMetric("d2", FieldSpec.DataType.BYTES).addMetric("d3", FieldSpec.DataType.BYTES) + .addMetric("d4", FieldSpec.DataType.BYTES).addMetric("d5", FieldSpec.DataType.BYTES).build(); - aggregationConfigs = Arrays.asList( - new AggregationConfig("d1", "distinct_count_hll(s1)"), - new AggregationConfig("d2", "DISTINCTCOUNTHLL(s1)"), - new AggregationConfig("d3", "distinctcounthll(s1)"), - new AggregationConfig("d4", "DISTINCTCOUNT_HLL(s1)"), - new AggregationConfig("d5", "DISTINCT_COUNT_HLL(s1)") - ); + aggregationConfigs = Arrays.asList(new AggregationConfig("d1", "distinct_count_hll(s1)"), + new AggregationConfig("d2", "DISTINCTCOUNTHLL(s1)"), new AggregationConfig("d3", "distinctcounthll(s1)"), + new AggregationConfig("d4", 
"DISTINCTCOUNT_HLL(s1)"), new AggregationConfig("d5", "DISTINCT_COUNT_HLL(s1)")); ingestionConfig.setAggregationConfigs(aggregationConfigs); tableConfig = @@ -540,11 +530,7 @@ public void ingestionAggregationConfigsTest() { } // distinctcounthll, expect not specified log2m argument to default to 8 - schema = new Schema - .SchemaBuilder() - .setSchemaName(TABLE_NAME) - .addMetric("d1", FieldSpec.DataType.BYTES) - .build(); + schema = new Schema.SchemaBuilder().setSchemaName(TABLE_NAME).addMetric("d1", FieldSpec.DataType.BYTES).build(); aggregationConfigs = Arrays.asList(new AggregationConfig("d1", "DISTINCTCOUNTHLL(s1)")); ingestionConfig.setAggregationConfigs(aggregationConfigs); @@ -572,45 +558,27 @@ public void ingestionAggregationConfigsTest() { } // sumprecision, expect that the function name in various forms (with and without underscores) still validates - schema = new Schema - .SchemaBuilder() - .setSchemaName(TABLE_NAME) - .addSingleValueDimension("s1", FieldSpec.DataType.BIG_DECIMAL) - .addMetric("d1", FieldSpec.DataType.BIG_DECIMAL) - .addMetric("d2", FieldSpec.DataType.BIG_DECIMAL) - .addMetric("d3", FieldSpec.DataType.BIG_DECIMAL) - .addMetric("d4", FieldSpec.DataType.BIG_DECIMAL) - .build(); + schema = new Schema.SchemaBuilder().setSchemaName(TABLE_NAME) + .addSingleValueDimension("s1", FieldSpec.DataType.BIG_DECIMAL).addMetric("d1", FieldSpec.DataType.BIG_DECIMAL) + .addMetric("d2", FieldSpec.DataType.BIG_DECIMAL).addMetric("d3", FieldSpec.DataType.BIG_DECIMAL) + .addMetric("d4", FieldSpec.DataType.BIG_DECIMAL).build(); - aggregationConfigs = Arrays.asList( - new AggregationConfig("d1", "sum_precision(s1, 10, 32)"), - new AggregationConfig("d2", "SUM_PRECISION(s1, 1)"), - new AggregationConfig("d3", "sumprecision(s1, 2)"), - new AggregationConfig("d4", "SUMPRECISION(s1, 10, 99)") - ); + aggregationConfigs = Arrays.asList(new AggregationConfig("d1", "sum_precision(s1, 10, 32)"), + new AggregationConfig("d2", "SUM_PRECISION(s1, 1)"), new 
AggregationConfig("d3", "sumprecision(s1, 2)"), + new AggregationConfig("d4", "SUMPRECISION(s1, 10, 99)")); ingestionConfig.setAggregationConfigs(aggregationConfigs); tableConfig = new TableConfigBuilder(TableType.REALTIME).setTableName("myTable_REALTIME").setTimeColumnName("timeColumn") .setIngestionConfig(ingestionConfig).build(); - - try { - TableConfigUtils.validateIngestionConfig(tableConfig, schema); - } catch (IllegalStateException e) { - Assert.fail("Should not fail due to valid aggregation function", e); - } + TableConfigUtils.validateIngestionConfig(tableConfig, schema); // with too many arguments should fail - schema = new Schema - .SchemaBuilder() - .setSchemaName(TABLE_NAME) - .addSingleValueDimension("s1", FieldSpec.DataType.BIG_DECIMAL) - .addMetric("d1", FieldSpec.DataType.BIG_DECIMAL) + schema = new Schema.SchemaBuilder().setSchemaName(TABLE_NAME) + .addSingleValueDimension("s1", FieldSpec.DataType.BIG_DECIMAL).addMetric("d1", FieldSpec.DataType.BIG_DECIMAL) .build(); - aggregationConfigs = Arrays.asList( - new AggregationConfig("d1", "sum_precision(s1, 10, 32, 99)") - ); + aggregationConfigs = Arrays.asList(new AggregationConfig("d1", "sum_precision(s1, 10, 32, 99)")); ingestionConfig.setAggregationConfigs(aggregationConfigs); tableConfig = @@ -621,8 +589,7 @@ public void ingestionAggregationConfigsTest() { TableConfigUtils.validateIngestionConfig(tableConfig, schema); Assert.fail("Should have failed with too many arguments but didn't"); } catch (IllegalStateException e) { - Assert.assertTrue(e.getMessage().contains("SUMPRECISION must specify precision (required), scale (optional)" - + " [{\"columnName\":\"d1\",\"aggregationFunction\":\"sum_precision(s1, 10, 32, 99)\"}]")); + // Expected } } diff --git a/pinot-spi/pom.xml b/pinot-spi/pom.xml index 2818afb92ed4..c63b68e95a6b 100644 --- a/pinot-spi/pom.xml +++ b/pinot-spi/pom.xml @@ -149,11 +149,6 @@ org.reflections reflections - - com.clearspring.analytics - stream - 2.7.0 - diff --git 
a/pinot-spi/src/main/java/org/apache/pinot/spi/data/FieldSpec.java b/pinot-spi/src/main/java/org/apache/pinot/spi/data/FieldSpec.java index 0f16359255d8..978862e6006f 100644 --- a/pinot-spi/src/main/java/org/apache/pinot/spi/data/FieldSpec.java +++ b/pinot-spi/src/main/java/org/apache/pinot/spi/data/FieldSpec.java @@ -100,7 +100,7 @@ public abstract class FieldSpec implements Comparable, Serializable { protected DataType _dataType; protected boolean _isSingleValueField = true; - // NOTE: for STRING column, this is the max number of characters; for BYTES column, this is the fixed number of bytes + // NOTE: This only applies to STRING column, which is the max number of characters private int _maxLength = DEFAULT_MAX_LENGTH; protected Object _defaultNullValue;