Skip to content
This repository was archived by the owner on Feb 18, 2024. It is now read-only.

Commit ce3b0e9

Browse files
authored
Added support to read dict-encoded required primitive types from parquet (#402)
1 parent 227ab3b commit ce3b0e9

File tree

2 files changed

+46
-0
lines changed

2 files changed

+46
-0
lines changed

src/io/parquet/read/primitive/basic.rs

+36
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,32 @@ fn read_dict_buffer_optional<T, A, F>(
7272
}
7373
}
7474

75+
fn read_dict_buffer_required<T, A, F>(
76+
indices_buffer: &[u8],
77+
additional: usize,
78+
dict: &PrimitivePageDict<T>,
79+
values: &mut MutableBuffer<A>,
80+
validity: &mut MutableBitmap,
81+
op: F,
82+
) where
83+
T: NativeType,
84+
A: ArrowNativeType,
85+
F: Fn(T) -> A,
86+
{
87+
let dict_values = dict.values();
88+
89+
// SPEC: Data page format: the bit width used to encode the entry ids stored as 1 byte (max bit width = 32),
90+
// SPEC: followed by the values encoded using RLE/Bit packed described above (with the given bit width).
91+
let bit_width = indices_buffer[0];
92+
let indices_buffer = &indices_buffer[1..];
93+
94+
let indices = hybrid_rle::HybridRleDecoder::new(indices_buffer, bit_width as u32, additional);
95+
96+
values.extend(indices.map(|index| op(dict_values[index as usize])));
97+
98+
validity.extend_constant(additional, true);
99+
}
100+
75101
fn read_nullable<T, A, F>(
76102
validity_buffer: &[u8],
77103
values_buffer: &[u8],
@@ -170,6 +196,16 @@ where
170196
op,
171197
)
172198
}
199+
(Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false) => {
200+
read_dict_buffer_required(
201+
values_buffer,
202+
additional,
203+
dict.as_any().downcast_ref().unwrap(),
204+
values,
205+
validity,
206+
op,
207+
)
208+
}
173209
// it can happen that there is a dictionary but the encoding is plain because
174210
// it fell back.
175211
(Encoding::Plain, _, true) => read_nullable(

tests/it/io/parquet/read.rs

+10
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,16 @@ fn v1_int64_nullable_dict() -> Result<()> {
112112
test_pyarrow_integration(0, 1, "basic", true, false)
113113
}
114114

115+
/// Round-trips the dictionary-encoded, required int64 column of the "basic"
/// pyarrow file written with data page version 2.
#[test]
fn v2_int64_required_dict() -> Result<()> {
    // NOTE(review): argument meanings are inferred from the sibling tests'
    // names and values (column, page version, dictionary, required) - confirm
    // against `test_pyarrow_integration`.
    let (column, version) = (0, 2);
    let (use_dict, required) = (true, true);
    test_pyarrow_integration(column, version, "basic", use_dict, required)
}
119+
120+
/// Round-trips the dictionary-encoded, required int64 column of the "basic"
/// pyarrow file written with data page version 1.
#[test]
fn v1_int64_required_dict() -> Result<()> {
    // NOTE(review): argument meanings are inferred from the sibling tests'
    // names and values (column, page version, dictionary, required) - confirm
    // against `test_pyarrow_integration`.
    let (column, version) = (0, 1);
    let (use_dict, required) = (true, true);
    test_pyarrow_integration(column, version, "basic", use_dict, required)
}
124+
115125
#[test]
116126
fn v2_utf8_nullable() -> Result<()> {
117127
test_pyarrow_integration(2, 2, "basic", false, false)

0 commit comments

Comments
 (0)