Skip to content

Commit

Permalink
Fix string length in MutableColumnStatistics (#9059)
Browse files Browse the repository at this point in the history
- Correctly count the byte length of string instead of character length
- Support BIG_DECIMAL
- Reduce the unnecessary dictionary scan
  • Loading branch information
Jackie-Jiang authored Jul 14, 2022
1 parent a2cde56 commit 84478b6
Show file tree
Hide file tree
Showing 2 changed files with 114 additions and 34 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,18 @@
*/
package org.apache.pinot.segment.local.realtime.converter.stats;

import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nullable;
import org.apache.pinot.segment.spi.creator.ColumnStatistics;
import org.apache.pinot.segment.spi.datasource.DataSource;
import org.apache.pinot.segment.spi.datasource.DataSourceMetadata;
import org.apache.pinot.segment.spi.index.mutable.MutableForwardIndex;
import org.apache.pinot.segment.spi.index.reader.Dictionary;
import org.apache.pinot.segment.spi.partition.PartitionFunction;
import org.apache.pinot.spi.data.FieldSpec;
import org.apache.pinot.spi.data.FieldSpec.DataType;
import org.apache.pinot.spi.utils.BigDecimalUtils;


/**
Expand All @@ -42,7 +45,10 @@ public class MutableColumnStatistics implements ColumnStatistics {
// dictionary.
private final Dictionary _dictionary;

public MutableColumnStatistics(DataSource dataSource, int[] sortedDocIdIterationOrder) {
private int _minElementLength = -1;
private int _maxElementLength = -1;

public MutableColumnStatistics(DataSource dataSource, @Nullable int[] sortedDocIdIterationOrder) {
_dataSource = dataSource;
_sortedDocIdIterationOrder = sortedDocIdIterationOrder;
_dictionary = dataSource.getDictionary();
Expand Down Expand Up @@ -70,46 +76,57 @@ public int getCardinality() {

@Override
public int getLengthOfShortestElement() {
// Length of longest string
int minStringLength = Integer.MAX_VALUE;

// If this column is a string/bytes column, iterate over the dictionary to find the maximum length
FieldSpec.DataType dataType = _dataSource.getDataSourceMetadata().getDataType();
int length = _dictionary.length();

if (dataType.equals(FieldSpec.DataType.STRING)) {
for (int i = 0; i < length; i++) {
minStringLength = Math.min(_dictionary.getStringValue(i).length(), minStringLength);
}
} else if (dataType.equals(FieldSpec.DataType.BYTES)) {
for (int i = 0; i < length; i++) {
minStringLength = Math.min(_dictionary.getBytesValue(i).length, minStringLength);
}
}

return minStringLength;
collectElementLengthIfNeeded();
return _minElementLength;
}

@Override
public int getLengthOfLargestElement() {
// Length of longest string
int maximumStringLength = 0;
collectElementLengthIfNeeded();
return _maxElementLength;
}

// If this column is a string/bytes column, iterate over the dictionary to find the maximum length
FieldSpec.DataType dataType = _dataSource.getDataSourceMetadata().getDataType();
int length = _dictionary.length();
private void collectElementLengthIfNeeded() {
if (_minElementLength >= 0) {
return;
}

if (dataType.equals(FieldSpec.DataType.STRING)) {
for (int i = 0; i < length; i++) {
maximumStringLength = Math.max(_dictionary.getStringValue(i).length(), maximumStringLength);
}
} else if (dataType.equals(FieldSpec.DataType.BYTES)) {
for (int i = 0; i < length; i++) {
maximumStringLength = Math.max(_dictionary.getBytesValue(i).length, maximumStringLength);
}
DataType storedType = _dictionary.getValueType();
if (storedType.isFixedWidth()) {
_minElementLength = storedType.size();
_maxElementLength = storedType.size();
return;
}

return maximumStringLength;
// If the stored type is not fixed width, iterate over the dictionary to find the min/max element length
_minElementLength = Integer.MAX_VALUE;
_maxElementLength = 0;
int length = _dictionary.length();
switch (storedType) {
case BIG_DECIMAL:
for (int i = 0; i < length; i++) {
int elementLength = BigDecimalUtils.byteSize(_dictionary.getBigDecimalValue(i));
_minElementLength = Math.min(_minElementLength, elementLength);
_maxElementLength = Math.max(_maxElementLength, elementLength);
}
break;
case STRING:
for (int i = 0; i < length; i++) {
int elementLength = _dictionary.getStringValue(i).getBytes(StandardCharsets.UTF_8).length;
_minElementLength = Math.min(_minElementLength, elementLength);
_maxElementLength = Math.max(_maxElementLength, elementLength);
}
break;
case BYTES:
for (int i = 0; i < length; i++) {
int elementLength = _dictionary.getBytesValue(i).length;
_minElementLength = Math.min(_minElementLength, elementLength);
_maxElementLength = Math.max(_maxElementLength, elementLength);
}
break;
default:
throw new IllegalStateException("Unsupported stored type: " + storedType);
}
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.pinot.segment.local.realtime.converter.stats;

import java.nio.charset.StandardCharsets;
import org.apache.commons.lang3.RandomStringUtils;
import org.apache.pinot.segment.spi.datasource.DataSource;
import org.apache.pinot.segment.spi.index.reader.Dictionary;
import org.apache.pinot.spi.data.FieldSpec.DataType;
import org.mockito.stubbing.Answer;
import org.testng.annotations.Test;

import static org.mockito.ArgumentMatchers.anyInt;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import static org.testng.Assert.assertEquals;


public class MutableColumnStatisticsTest {

@Test
public void testElementLength() {
int numElements = 10;
String[] elements = new String[numElements];
int minElementLength = Integer.MAX_VALUE;
int maxElementLength = 0;
for (int i = 0; i < numElements; i++) {
String randomString = RandomStringUtils.random(100);
elements[i] = randomString;
int elementLength = randomString.getBytes(StandardCharsets.UTF_8).length;
minElementLength = Math.min(minElementLength, elementLength);
maxElementLength = Math.max(maxElementLength, elementLength);
}

DataSource dataSource = mock(DataSource.class);
Dictionary dictionary = mock(Dictionary.class);
when(dataSource.getDictionary()).thenReturn(dictionary);
when(dictionary.getValueType()).thenReturn(DataType.STRING);
when(dictionary.length()).thenReturn(numElements);
when(dictionary.getStringValue(anyInt())).thenAnswer(
(Answer<String>) invocation -> elements[(int) invocation.getArgument(0)]);

MutableColumnStatistics columnStatistics = new MutableColumnStatistics(dataSource, null);
assertEquals(columnStatistics.getLengthOfShortestElement(), minElementLength);
assertEquals(columnStatistics.getLengthOfLargestElement(), maxElementLength);
}
}

0 comments on commit 84478b6

Please sign in to comment.