Commit 0264171

[fix] replace enum with trait for extensibility
1 parent 442023b commit 0264171

6 files changed: +330 −232 lines

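The commit title refers to swapping a closed enum for an open trait. As a self-contained illustration of the difference (the enum shape below is hypothetical, loosely modeled on the `datasource::listing::FormatOptions` referenced in a removed comment in the proto diff; the real trait, `FileFormat`, appears in the mod.rs diff further down):

use std::sync::Arc;

// Before (hypothetical shape): a closed enum. Supporting a new format
// means adding a variant here and a match arm at every use site,
// which downstream crates cannot do.
#[allow(dead_code)]
enum FormatOptions {
    Parquet { enable_pruning: bool },
    // csv, json, ... each requires changes to core
}

// After: behavior hangs off a trait, so any crate can plug in a format.
trait Format {
    fn extension(&self) -> &'static str;
}

struct Parquet;
impl Format for Parquet {
    fn extension(&self) -> &'static str {
        ".parquet"
    }
}

fn main() {
    // Dispatch through trait objects; no central enum to extend.
    let formats: Vec<Arc<dyn Format>> = vec![Arc::new(Parquet)];
    for f in &formats {
        println!("{}", f.extension());
    }
}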

ballista/rust/core/proto/ballista.proto

+3 −3

@@ -943,14 +943,14 @@ message GetFileMetadataParams {
   FileType file_type = 2;
 }
 
-message ParquetConfig {
-  // fields of datasource::listing::FormatOptions::Parquet
+message ParquetFormat {
+  // fields of datasource::format::parquet::ParquetFormat
 }
 
 message ListingConfig {
   string extension = 1;
   oneof format {
-    ParquetConfig parquet = 2;
+    ParquetFormat parquet = 2;
     // csv, json, ...
   }
 }
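Worth noting: the wire format stays a closed `oneof` even though the engine side is now trait-based, so each new format still needs a proto variant plus serde glue. Standard prost codegen would turn the `oneof format` into a Rust enum roughly like this sketch (names follow prost conventions and are assumptions, not code from this commit):

// Assumed prost output for ListingConfig (illustrative only):
pub struct ParquetFormat {}

pub struct ListingConfig {
    pub extension: String,
    pub format: Option<listing_config::Format>,
}

pub mod listing_config {
    pub enum Format {
        Parquet(super::ParquetFormat),
        // csv, json, ... would be added here
    }
}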
datafusion/src/datasource/format/csv.rs (new file)

+66

@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! CSV format abstractions
+
+use std::sync::Arc;
+
+use arrow::datatypes::SchemaRef;
+use async_trait::async_trait;
+
+use super::FileFormat;
+use crate::datasource::PartitionedFile;
+use crate::error::Result;
+use crate::logical_plan::Expr;
+use crate::physical_plan::ExecutionPlan;
+use crate::physical_plan::Statistics;
+
+/// Character Separated Value `FileFormat` implementation.
+pub struct CsvFormat {
+    /// Set true to indicate that the first line is a header.
+    pub has_header: bool,
+    /// The character separating values within a row.
+    pub delimiter: u8,
+    /// If no schema was provided for the table, it will be
+    /// inferred from the data itself; this limits the number
+    /// of lines used in the process.
+    pub schema_infer_max_rec: Option<u64>,
+}
+
+#[async_trait]
+impl FileFormat for CsvFormat {
+    async fn infer_schema(&self, _path: &str) -> Result<SchemaRef> {
+        todo!()
+    }
+
+    async fn infer_stats(&self, _path: &str) -> Result<Statistics> {
+        Ok(Statistics::default())
+    }
+
+    async fn create_executor(
+        &self,
+        _schema: SchemaRef,
+        _files: Vec<Vec<PartitionedFile>>,
+        _statistics: Statistics,
+        _projection: &Option<Vec<usize>>,
+        _batch_size: usize,
+        _filters: &[Expr],
+        _limit: Option<usize>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        todo!()
+    }
+}
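As a quick usage sketch, constructing the new `CsvFormat` looks like the following. All field values here are illustrative, and the import path assumes this commit's module layout is publicly exported:

use datafusion::datasource::format::csv::CsvFormat;

fn main() {
    // A headerless, tab-separated table; cap schema inference
    // at 1000 records.
    let _format = CsvFormat {
        has_header: false,
        delimiter: b'\t',
        schema_infer_max_rec: Some(1000),
    };
}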
datafusion/src/datasource/format/json.rs (new file)

+62

@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Line delimited JSON format abstractions
+
+use std::sync::Arc;
+
+use arrow::datatypes::SchemaRef;
+use async_trait::async_trait;
+
+use super::FileFormat;
+use crate::datasource::PartitionedFile;
+use crate::error::Result;
+use crate::logical_plan::Expr;
+use crate::physical_plan::ExecutionPlan;
+use crate::physical_plan::Statistics;
+
+/// Newline-delimited JSON `FileFormat` implementation.
+pub struct JsonFormat {
+    /// If no schema was provided for the table, it will be
+    /// inferred from the data itself; this limits the number
+    /// of lines used in the process.
+    pub schema_infer_max_rec: Option<u64>,
+}
+
+#[async_trait]
+impl FileFormat for JsonFormat {
+    async fn infer_schema(&self, _path: &str) -> Result<SchemaRef> {
+        todo!()
+    }
+
+    async fn infer_stats(&self, _path: &str) -> Result<Statistics> {
+        Ok(Statistics::default())
+    }
+
+    async fn create_executor(
+        &self,
+        _schema: SchemaRef,
+        _files: Vec<Vec<PartitionedFile>>,
+        _statistics: Statistics,
+        _projection: &Option<Vec<usize>>,
+        _batch_size: usize,
+        _filters: &[Expr],
+        _limit: Option<usize>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        todo!()
+    }
+}

datafusion/src/datasource/format/mod.rs

+37 −3

@@ -17,16 +17,50 @@
 
 //! Module containing helper methods for the various file formats
 
-// pub mod csv;
-// pub mod json;
+pub mod csv;
+pub mod json;
 pub mod parquet;
 
+use std::sync::Arc;
+
 use crate::arrow::datatypes::{Schema, SchemaRef};
+use crate::error::Result;
+use crate::logical_plan::Expr;
 use crate::physical_plan::expressions::{MaxAccumulator, MinAccumulator};
-use crate::physical_plan::{Accumulator, ColumnStatistics, Statistics};
+use crate::physical_plan::{Accumulator, ColumnStatistics, ExecutionPlan, Statistics};
 
 use super::PartitionedFile;
 
+use async_trait::async_trait;
+
+/// This trait abstracts all the file format specific implementations
+/// from the `TableProvider`. This helps with code reuse across
+/// providers that support the same file formats.
+#[async_trait]
+pub trait FileFormat: Send + Sync {
+    /// Open the file at the given path and infer its schema
+    async fn infer_schema(&self, path: &str) -> Result<SchemaRef>;
+
+    /// Open the file at the given path and infer its statistics
+    async fn infer_stats(&self, path: &str) -> Result<Statistics>;
+
+    /// Take a list of files and convert it to the appropriate executor
+    /// according to this file format.
+    /// TODO: group params into TableDescription (schema, files, stats) and
+    /// ScanOptions (projection, batch_size, filters) to avoid too_many_arguments
+    #[allow(clippy::too_many_arguments)]
+    async fn create_executor(
+        &self,
+        schema: SchemaRef,
+        files: Vec<Vec<PartitionedFile>>,
+        statistics: Statistics,
+        projection: &Option<Vec<usize>>,
+        batch_size: usize,
+        filters: &[Expr],
+        limit: Option<usize>,
+    ) -> Result<Arc<dyn ExecutionPlan>>;
+}
+
 /// Get all files as well as the summary statistic
 /// if the optional `limit` is provided, includes only sufficient files
 /// needed to read up to `limit` number of rows
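To illustrate the extensibility the commit message claims, here is a sketch of a third-party format plugging into the new trait from outside core. `AvroFormat` is hypothetical, the bodies are stubs, and the `datafusion::...` import paths assume this module layout is publicly exported:

use std::sync::Arc;

use arrow::datatypes::SchemaRef;
use async_trait::async_trait;
use datafusion::datasource::format::FileFormat;
use datafusion::datasource::PartitionedFile;
use datafusion::error::Result;
use datafusion::logical_plan::Expr;
use datafusion::physical_plan::{ExecutionPlan, Statistics};

/// A hypothetical Avro format living in a downstream crate.
pub struct AvroFormat;

#[async_trait]
impl FileFormat for AvroFormat {
    async fn infer_schema(&self, _path: &str) -> Result<SchemaRef> {
        todo!("map the Avro header to an Arrow schema")
    }

    async fn infer_stats(&self, _path: &str) -> Result<Statistics> {
        // No cheap statistics for Avro; fall back to defaults.
        Ok(Statistics::default())
    }

    async fn create_executor(
        &self,
        _schema: SchemaRef,
        _files: Vec<Vec<PartitionedFile>>,
        _statistics: Statistics,
        _projection: &Option<Vec<usize>>,
        _batch_size: usize,
        _filters: &[Expr],
        _limit: Option<usize>,
    ) -> Result<Arc<dyn ExecutionPlan>> {
        todo!("build an Avro-specific ExecutionPlan")
    }
}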

datafusion/src/datasource/format/parquet.rs

+61 −2

@@ -15,21 +15,29 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Parquet format helper methods
+//! Parquet format abstractions
 
 use std::fs::File;
 use std::sync::Arc;
 
 use arrow::datatypes::Schema;
+use arrow::datatypes::SchemaRef;
+use async_trait::async_trait;
 use parquet::arrow::ArrowReader;
 use parquet::arrow::ParquetFileArrowReader;
 use parquet::file::serialized_reader::SerializedFileReader;
 use parquet::file::statistics::Statistics as ParquetStatistics;
 
+use super::FileFormat;
 use super::{create_max_min_accs, get_col_stats};
 use crate::arrow::datatypes::{DataType, Field};
+use crate::datasource::PartitionedFile;
 use crate::error::Result;
+use crate::logical_plan::combine_filters;
+use crate::logical_plan::Expr;
 use crate::physical_plan::expressions::{MaxAccumulator, MinAccumulator};
+use crate::physical_plan::parquet::ParquetExec;
+use crate::physical_plan::ExecutionPlan;
 use crate::physical_plan::{Accumulator, Statistics};
 use crate::scalar::ScalarValue;
 
@@ -156,7 +164,7 @@ fn summarize_min_max(
 }
 
 /// Read and parse the metadata of the Parquet file at location `path`
-pub fn fetch_metadata(path: &str) -> Result<(Schema, Statistics)> {
+fn fetch_metadata(path: &str) -> Result<(Schema, Statistics)> {
     let file = File::open(path)?;
     let file_reader = Arc::new(SerializedFileReader::new(file)?);
     let mut arrow_reader = ParquetFileArrowReader::new(file_reader);
@@ -214,6 +222,57 @@
     Ok((schema, statistics))
 }
 
+/// The Apache Parquet `FileFormat` implementation
+pub struct ParquetFormat {
+    /// Activate statistics based row group level pruning
+    pub enable_pruning: bool,
+}
+
+#[async_trait]
+impl FileFormat for ParquetFormat {
+    async fn infer_schema(&self, path: &str) -> Result<SchemaRef> {
+        let (schema, _) = fetch_metadata(path)?;
+        Ok(Arc::new(schema))
+    }
+
+    async fn infer_stats(&self, path: &str) -> Result<Statistics> {
+        let (_, stats) = fetch_metadata(path)?;
+        Ok(stats)
+    }
+
+    async fn create_executor(
+        &self,
+        schema: SchemaRef,
+        files: Vec<Vec<PartitionedFile>>,
+        statistics: Statistics,
+        projection: &Option<Vec<usize>>,
+        batch_size: usize,
+        filters: &[Expr],
+        limit: Option<usize>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        // If pruning is enabled, combine the filters to build the predicate.
+        // If pruning is disabled, set the predicate to None so that readers
+        // will not prune data based on the statistics.
+        let predicate = if self.enable_pruning {
+            combine_filters(filters)
+        } else {
+            None
+        };
+
+        Ok(Arc::new(ParquetExec::try_new_refacto(
+            files,
+            statistics,
+            schema,
+            projection.clone(),
+            predicate,
+            limit
+                .map(|l| std::cmp::min(l, batch_size))
+                .unwrap_or(batch_size),
+            limit,
+        )?))
+    }
+}
+
 // #[cfg(test)]
 // mod tests {
 //     use super::*;
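With `ParquetFormat` implementing the trait, callers can be written against `Arc<dyn FileFormat>` and stay format-agnostic. A minimal sketch, assuming the same public paths as above (the `num_rows` field matches DataFusion's `Statistics` struct of this era):

use std::sync::Arc;

use datafusion::datasource::format::FileFormat;
use datafusion::error::Result;

// Works identically for ParquetFormat, CsvFormat, JsonFormat, or any
// third-party implementation: dispatch happens through the trait object.
async fn describe(format: Arc<dyn FileFormat>, path: &str) -> Result<()> {
    let schema = format.infer_schema(path).await?;
    let stats = format.infer_stats(path).await?;
    println!("{} columns, rows: {:?}", schema.fields().len(), stats.num_rows);
    Ok(())
}

// Example call site (requires an async runtime):
// let parquet: Arc<dyn FileFormat> = Arc::new(
//     datafusion::datasource::format::parquet::ParquetFormat { enable_pruning: true },
// );
// describe(parquet, "data/example.parquet").await?;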
