Skip to content

Commit 4b7eabc

Browse files
committed
Revert dictionary lookup for indexed fields
1 parent 40f8eb5 commit 4b7eabc

File tree

4 files changed

+11
-94
lines changed

4 files changed

+11
-94
lines changed

datafusion/Cargo.toml

+2-2
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ unicode_expressions = ["unicode-segmentation"]
4545
# Used for testing ONLY: causes all values to hash to the same value (test for collisions)
4646
force_hash_collisions = []
4747
# Used to enable the avro format
48-
avro = ["avro-rs"]
48+
avro = ["avro-rs", "num-traits"]
4949

5050
[dependencies]
5151
ahash = "0.7"
@@ -72,7 +72,7 @@ lazy_static = { version = "^1.4.0", optional = true }
7272
smallvec = { version = "1.6", features = ["union"] }
7373
rand = "0.8"
7474
avro-rs = { version = "0.13", features = ["snappy"], optional = true }
75-
num-traits = { version = "0.2" }
75+
num-traits = { version = "0.2", optional = true }
7676

7777
[dev-dependencies]
7878
criterion = "0.3"

datafusion/src/field_util.rs

-15
Original file line numberDiff line numberDiff line change
@@ -29,21 +29,6 @@ use crate::scalar::ScalarValue;
2929
/// * there is no field key is not of the required index type
3030
pub fn get_indexed_field(data_type: &DataType, key: &ScalarValue) -> Result<Field> {
3131
match (data_type, key) {
32-
(DataType::Dictionary(ref kt, ref _vt), ScalarValue::Utf8(Some(k))) => {
33-
match kt.as_ref() {
34-
DataType::Int8 | DataType::Int16 |DataType::Int32 |DataType::Int64 |DataType::UInt8
35-
| DataType::UInt16 | DataType::UInt32 | DataType::UInt64 => {
36-
Ok(Field::new(k, *kt.clone(), true))
37-
},
38-
_ => Err(DataFusionError::Plan(format!("The key for a dictionary has to be a primitive type, was : \"{}\"", key))),
39-
}
40-
},
41-
(DataType::Dictionary(_, _), _) => {
42-
Err(DataFusionError::Plan(
43-
"Only utf8 types are valid for dictionary lookup"
44-
.to_string(),
45-
))
46-
}
4732
(DataType::List(lt), ScalarValue::Int64(Some(i))) => {
4833
Ok(Field::new(&i.to_string(), lt.data_type().clone(), false))
4934
}

datafusion/src/physical_plan/expressions/get_indexed_field.rs

+2-46
Original file line numberDiff line numberDiff line change
@@ -26,19 +26,14 @@ use arrow::{
2626

2727
use crate::arrow::array::Array;
2828
use crate::arrow::compute::concat;
29-
use crate::arrow::datatypes::{
30-
Int16Type, Int32Type, Int64Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type,
31-
};
3229
use crate::scalar::ScalarValue;
3330
use crate::{
3431
error::DataFusionError,
3532
error::Result,
3633
field_util::get_indexed_field as get_data_type_field,
3734
physical_plan::{ColumnarValue, PhysicalExpr},
3835
};
39-
use arrow::array::{ArrayRef, DictionaryArray, ListArray};
40-
use arrow::datatypes::{ArrowPrimitiveType, Int8Type};
41-
use num_traits::ToPrimitive;
36+
use arrow::array::ListArray;
4237
use std::fmt::Debug;
4338

4439
/// expression to get a field of a struct array.
@@ -96,25 +91,8 @@ impl PhysicalExpr for GetIndexedFieldExpr {
9691
let iter = concat(vec.as_slice()).unwrap();
9792
Ok(ColumnarValue::Array(iter))
9893
}
99-
(DataType::Dictionary(ref kt, _), ScalarValue::Utf8(Some(s))) => {
100-
match **kt {
101-
DataType::Int8 => dict_lookup::<Int8Type>(array, s),
102-
DataType::Int16 => dict_lookup::<Int16Type>(array, s),
103-
DataType::Int32 => dict_lookup::<Int32Type>(array, s),
104-
DataType::Int64 => dict_lookup::<Int64Type>(array, s),
105-
DataType::UInt8 => dict_lookup::<UInt8Type>(array, s),
106-
DataType::UInt16 => dict_lookup::<UInt16Type>(array, s),
107-
DataType::UInt32 => dict_lookup::<UInt32Type>(array, s),
108-
DataType::UInt64 => dict_lookup::<UInt64Type>(array, s),
109-
_ => Err(DataFusionError::NotImplemented(
110-
"dictionary lookup only available for numeric keys"
111-
.to_string(),
112-
)),
113-
}
114-
}
11594
_ => Err(DataFusionError::NotImplemented(
116-
"get indexed field is only possible on dictionary and list"
117-
.to_string(),
95+
"get indexed field is only possible on lists".to_string(),
11896
)),
11997
},
12098
ColumnarValue::Scalar(_) => Err(DataFusionError::NotImplemented(
@@ -131,25 +109,3 @@ pub fn get_indexed_field(
131109
) -> Result<Arc<dyn PhysicalExpr>> {
132110
Ok(Arc::new(GetIndexedFieldExpr::new(arg, key)))
133111
}
134-
135-
fn dict_lookup<T: ArrowPrimitiveType>(
136-
array: ArrayRef,
137-
lookup: &str,
138-
) -> Result<ColumnarValue>
139-
where
140-
T::Native: num_traits::cast::ToPrimitive,
141-
{
142-
let as_dict_array = array.as_any().downcast_ref::<DictionaryArray<T>>().unwrap();
143-
if let Some(index) = as_dict_array.lookup_key(lookup) {
144-
Ok(ColumnarValue::Array(
145-
as_dict_array
146-
.keys()
147-
.slice(ToPrimitive::to_usize(&index).unwrap(), 1),
148-
))
149-
} else {
150-
Err(DataFusionError::NotImplemented(format!(
151-
"key not found in dictionary for : {}",
152-
lookup
153-
)))
154-
}
155-
}

datafusion/tests/sql.rs

+7-31
Original file line numberDiff line numberDiff line change
@@ -5168,18 +5168,12 @@ async fn query_nested_get_indexed_field() -> Result<()> {
51685168
let mut ctx = ExecutionContext::new();
51695169
let nested_dt = DataType::List(Box::new(Field::new("item", DataType::Int64, true)));
51705170
// Nested schema of { "some_list": [[i64]] }
5171-
let schema = Arc::new(Schema::new(vec![
5172-
Field::new(
5173-
"some_list",
5174-
DataType::List(Box::new(Field::new("item", nested_dt.clone(), true))),
5175-
false,
5176-
),
5177-
Field::new(
5178-
"some_dict",
5179-
DataType::Dictionary(Box::new(DataType::Int64), Box::new(DataType::Utf8)),
5180-
false,
5181-
),
5182-
]));
5171+
let schema = Arc::new(Schema::new(vec![Field::new(
5172+
"some_list",
5173+
DataType::List(Box::new(Field::new("item", nested_dt.clone(), true))),
5174+
false,
5175+
)]));
5176+
51835177
let builder = PrimitiveBuilder::<Int64Type>::new(3);
51845178
let nested_lb = ListBuilder::new(builder);
51855179
let mut lb = ListBuilder::new(nested_lb);
@@ -5199,20 +5193,7 @@ async fn query_nested_get_indexed_field() -> Result<()> {
51995193
lb.append(true).unwrap();
52005194
}
52015195

5202-
let dictionary_values = StringArray::from(vec![Some("a"), Some("b"), Some("c")]);
5203-
let mut sb = StringDictionaryBuilder::new_with_dictionary(
5204-
PrimitiveBuilder::<Int64Type>::new(3),
5205-
&dictionary_values,
5206-
)
5207-
.unwrap();
5208-
for s in &["b", "a", "c"] {
5209-
sb.append(s).unwrap();
5210-
}
5211-
5212-
let data = RecordBatch::try_new(
5213-
schema.clone(),
5214-
vec![Arc::new(lb.finish()), Arc::new(sb.finish())],
5215-
)?;
5196+
let data = RecordBatch::try_new(schema.clone(), vec![Arc::new(lb.finish())])?;
52165197
let table = MemTable::try_new(schema, vec![vec![data]])?;
52175198
let table_a = Arc::new(table);
52185199

@@ -5227,9 +5208,4 @@ async fn query_nested_get_indexed_field() -> Result<()> {
52275208
let actual = execute(&mut ctx, sql).await;
52285209
let expected = vec![vec!["0"], vec!["5"], vec!["11"]];
52295210
assert_eq!(expected, actual);
5230-
let sql = r#"SELECT some_dict["b"], some_dict["a"] FROM ints LIMIT 3"#;
5231-
let actual = execute(&mut ctx, sql).await;
5232-
let expected = vec![vec!["0", "1"]];
5233-
assert_eq!(expected, actual);
5234-
Ok(())
52355211
}

0 commit comments

Comments
 (0)