@@ -15,21 +15,29 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Parquet format helper methods
+//! Parquet format abstractions
 
 use std::fs::File;
 use std::sync::Arc;
 
 use arrow::datatypes::Schema;
+use arrow::datatypes::SchemaRef;
+use async_trait::async_trait;
 use parquet::arrow::ArrowReader;
 use parquet::arrow::ParquetFileArrowReader;
 use parquet::file::serialized_reader::SerializedFileReader;
 use parquet::file::statistics::Statistics as ParquetStatistics;
 
+use super::FileFormat;
 use super::{create_max_min_accs, get_col_stats};
 use crate::arrow::datatypes::{DataType, Field};
+use crate::datasource::PartitionedFile;
 use crate::error::Result;
+use crate::logical_plan::combine_filters;
+use crate::logical_plan::Expr;
 use crate::physical_plan::expressions::{MaxAccumulator, MinAccumulator};
+use crate::physical_plan::parquet::ParquetExec;
+use crate::physical_plan::ExecutionPlan;
 use crate::physical_plan::{Accumulator, Statistics};
 use crate::scalar::ScalarValue;
 
@@ -156,7 +164,7 @@ fn summarize_min_max(
 }
 
 /// Read and parse the metadata of the Parquet file at location `path`
-pub fn fetch_metadata(path: &str) -> Result<(Schema, Statistics)> {
+fn fetch_metadata(path: &str) -> Result<(Schema, Statistics)> {
     let file = File::open(path)?;
     let file_reader = Arc::new(SerializedFileReader::new(file)?);
     let mut arrow_reader = ParquetFileArrowReader::new(file_reader);
@@ -214,6 +222,57 @@ pub fn fetch_metadata(path: &str) -> Result<(Schema, Statistics)> {
     Ok((schema, statistics))
 }
 
+/// The Apache Parquet `FileFormat` implementation
+pub struct ParquetFormat {
+    /// Activate statistics-based row group pruning
+    pub enable_pruning: bool,
+}
+
+#[async_trait]
+impl FileFormat for ParquetFormat {
+    async fn infer_schema(&self, path: &str) -> Result<SchemaRef> {
+        let (schema, _) = fetch_metadata(path)?;
+        Ok(Arc::new(schema))
+    }
+
+    async fn infer_stats(&self, path: &str) -> Result<Statistics> {
+        let (_, stats) = fetch_metadata(path)?;
+        Ok(stats)
+    }
+
+    async fn create_executor(
+        &self,
+        schema: SchemaRef,
+        files: Vec<Vec<PartitionedFile>>,
+        statistics: Statistics,
+        projection: &Option<Vec<usize>>,
+        batch_size: usize,
+        filters: &[Expr],
+        limit: Option<usize>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        // If pruning is enabled, combine the filters to build the predicate.
+        // If pruning is disabled, leave the predicate as None so that readers
+        // will not prune data based on the statistics.
+        let predicate = if self.enable_pruning {
+            combine_filters(filters)
+        } else {
+            None
+        };
+
+        Ok(Arc::new(ParquetExec::try_new_refacto(
+            files,
+            statistics,
+            schema,
+            projection.clone(),
+            predicate,
+            limit
+                .map(|l| std::cmp::min(l, batch_size))
+                .unwrap_or(batch_size),
+            limit,
+        )?))
+    }
+}
+
 // #[cfg(test)]
 // mod tests {
 //     use super::*;
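
For context, here is a minimal sketch of how a caller might exercise the `ParquetFormat` added above. It is illustrative only: the import paths, the `example.parquet` file, and the `tokio` entry point are assumptions rather than part of this diff, and `create_executor` is omitted because building its `Vec<Vec<PartitionedFile>>` input is outside the scope of the sketch.

use datafusion::datasource::format::{parquet::ParquetFormat, FileFormat}; // module path assumed
use datafusion::error::Result;
use datafusion::logical_plan::{col, combine_filters, lit};

#[tokio::main] // assumes tokio with the "rt" and "macros" features
async fn main() -> Result<()> {
    // Statistics-based row group pruning is toggled per format instance.
    let format = ParquetFormat { enable_pruning: true };

    // Both inference methods parse the same Parquet footer via `fetch_metadata`.
    let schema = format.infer_schema("example.parquet").await?; // hypothetical file
    let stats = format.infer_stats("example.parquet").await?;
    println!("fields: {}, rows: {:?}", schema.fields().len(), stats.num_rows);

    // When pruning is enabled, `create_executor` AND-combines the pushed-down
    // filters: a non-empty slice yields `Some(f1 AND f2 ...)`, an empty one `None`.
    let filters = vec![col("a").gt(lit(1)), col("b").lt(lit(10))];
    assert!(combine_filters(&filters).is_some());
    assert!(combine_filters(&[]).is_none());
    Ok(())
}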