@@ -23,6 +23,7 @@ use std::{any::Any, convert::TryInto};
 
 use crate::datasource::file_format::parquet::ChunkObjectReader;
 use crate::datasource::object_store::ObjectStore;
+use crate::datasource::PartitionedFile;
 use crate::{
     error::{DataFusionError, Result},
     logical_plan::{Column, Expr},
@@ -59,14 +60,13 @@ use tokio::{
 
 use async_trait::async_trait;
 
-use crate::datasource::{FilePartition, PartitionedFile};
-
 /// Execution plan for scanning one or more Parquet partitions
 #[derive(Debug, Clone)]
 pub struct ParquetExec {
     object_store: Arc<dyn ObjectStore>,
-    /// Parquet partitions to read
-    partitions: Vec<ParquetPartition>,
+    /// Grouped list of files. Each group will be processed together by one
+    /// partition of the `ExecutionPlan`.
+    file_groups: Vec<Vec<PartitionedFile>>,
     /// Schema after projection is applied
     schema: SchemaRef,
     /// Projection for which columns to load
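
This hunk is the heart of the change: `ParquetExec` now stores a plain `Vec<Vec<PartitionedFile>>`, where each inner vector is one unit of parallelism, instead of wrapping each group in the `ParquetPartition` struct removed below. As a hedged sketch of how a planner could build such groups (the round-robin policy and helper name are illustrative, not code from this patch):

```rust
// Hypothetical helper, not part of this patch: distribute files round-robin
// into `n` groups, one per target partition of the `ExecutionPlan`. The
// grouping logic is independent of `PartitionedFile`'s fields, so it is
// written generically over `T`.
fn round_robin_groups<T>(files: Vec<T>, n: usize) -> Vec<Vec<T>> {
    assert!(n > 0, "need at least one target partition");
    let mut groups: Vec<Vec<T>> = (0..n).map(|_| Vec::new()).collect();
    for (i, file) in files.into_iter().enumerate() {
        groups[i % n].push(file);
    }
    // Drop empty groups so the plan does not report empty partitions.
    groups.retain(|g| !g.is_empty());
    groups
}
```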
@@ -83,23 +83,6 @@ pub struct ParquetExec {
     limit: Option<usize>,
 }
 
-/// Represents one partition of a Parquet data set and this currently means one Parquet file.
-///
-/// In the future it would be good to support subsets of files based on ranges of row groups
-/// so that we can better parallelize reads of large files across available cores (see
-/// [ARROW-10995](https://issues.apache.org/jira/browse/ARROW-10995)).
-///
-/// We may also want to support reading Parquet files that are partitioned based on a key and
-/// in this case we would want this partition struct to represent multiple files for a given
-/// partition key (see [ARROW-11019](https://issues.apache.org/jira/browse/ARROW-11019)).
-#[derive(Debug, Clone)]
-pub struct ParquetPartition {
-    /// The Parquet filename for this partition
-    pub file_partition: FilePartition,
-    /// Execution metrics
-    metrics: ExecutionPlanMetricsSet,
-}
-
 /// Stores metrics about the parquet execution for a particular parquet file
 #[derive(Debug, Clone)]
 struct ParquetFileMetrics {
@@ -115,24 +98,16 @@ impl ParquetExec {
     #[allow(clippy::too_many_arguments)]
     pub fn new(
         object_store: Arc<dyn ObjectStore>,
-        files: Vec<Vec<PartitionedFile>>,
+        file_groups: Vec<Vec<PartitionedFile>>,
         statistics: Statistics,
         schema: SchemaRef,
         projection: Option<Vec<usize>>,
         predicate: Option<Expr>,
         batch_size: usize,
         limit: Option<usize>,
     ) -> Self {
-        debug!("Creating ParquetExec, desc: {:?}, projection {:?}, predicate: {:?}, limit: {:?}",
-        files, projection, predicate, limit);
-
-        let metrics = ExecutionPlanMetricsSet::new();
-
-        let partitions = files
-            .into_iter()
-            .enumerate()
-            .map(|(i, f)| ParquetPartition::new(f, i, metrics.clone()))
-            .collect::<Vec<_>>();
+        debug!("Creating ParquetExec, files: {:?}, projection {:?}, predicate: {:?}, limit: {:?}",
+        file_groups, projection, predicate, limit);
 
         let metrics = ExecutionPlanMetricsSet::new();
         let predicate_creation_errors =
@@ -162,7 +137,7 @@ impl ParquetExec {
 
         Self {
             object_store,
-            partitions,
+            file_groups,
            schema: projected_schema,
             projection,
             metrics,
@@ -204,11 +179,8 @@ impl ParquetExec {
     }
 
     /// List of data files
-    pub fn partitions(&self) -> Vec<&[PartitionedFile]> {
-        self.partitions
-            .iter()
-            .map(|fp| fp.file_partition.files.as_slice())
-            .collect()
+    pub fn file_groups(&self) -> &[Vec<PartitionedFile>] {
+        &self.file_groups
     }
 
     /// Optional projection for which columns to load
     pub fn projection(&self) -> &[usize] {
@@ -225,20 +197,6 @@ impl ParquetExec {
     }
 }
 
-impl ParquetPartition {
-    /// Create a new parquet partition
-    pub fn new(
-        files: Vec<PartitionedFile>,
-        index: usize,
-        metrics: ExecutionPlanMetricsSet,
-    ) -> Self {
-        Self {
-            file_partition: FilePartition { index, files },
-            metrics,
-        }
-    }
-}
-
 impl ParquetFileMetrics {
     /// Create new metrics
     pub fn new(
@@ -279,7 +237,7 @@ impl ExecutionPlan for ParquetExec {
 
     /// Get the output partitioning of this plan
     fn output_partitioning(&self) -> Partitioning {
-        Partitioning::UnknownPartitioning(self.partitions.len())
+        Partitioning::UnknownPartitioning(self.file_groups.len())
     }
 
     fn with_new_children(
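
With the struct flattened, the plan's fan-out falls directly out of the data structure: one execution partition per file group. A usage sketch, assuming this DataFusion version's `ExecutionPlan` API where `execute` is async and takes only the partition index:

```rust
use std::sync::Arc;

use datafusion::error::Result;
use datafusion::physical_plan::ExecutionPlan;

// Sketch only: drive every partition of a plan. After this patch,
// `partition_count()` for ParquetExec equals `file_groups.len()`, so each
// `execute(i)` call reads exactly one group of files.
async fn run_all_partitions(exec: Arc<dyn ExecutionPlan>) -> Result<()> {
    for i in 0..exec.output_partitioning().partition_count() {
        let _stream = exec.execute(i).await?;
        // ... consume the stream ...
    }
    Ok(())
}
```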
@@ -304,7 +262,7 @@ impl ExecutionPlan for ParquetExec {
             Receiver<ArrowResult<RecordBatch>>,
         ) = channel(2);
 
-        let partition = self.partitions[partition_index].clone();
+        let partition = self.file_groups[partition_index].clone();
         let metrics = self.metrics.clone();
         let projection = self.projection.clone();
         let predicate_builder = self.predicate_builder.clone();
@@ -342,18 +300,12 @@ impl ExecutionPlan for ParquetExec {
     ) -> std::fmt::Result {
         match t {
             DisplayFormatType::Default => {
-                let files: Vec<_> = self
-                    .partitions
-                    .iter()
-                    .map(|pp| format!("{}", pp.file_partition))
-                    .collect();
-
                 write!(
                     f,
-                    "ParquetExec: batch_size={}, limit={:?}, partitions=[{}]",
+                    "ParquetExec: batch_size={}, limit={:?}, partitions={}",
                     self.batch_size,
                     self.limit,
-                    files.join(", ")
+                    super::FileGroupsDisplay(&self.file_groups)
                 )
             }
         }
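
`FileGroupsDisplay` lives in the parent module and is not shown in this diff; it follows the standard newtype-`Display` pattern so the nested groups can be formatted lazily inside `write!` without building an intermediate `Vec<String>` first. A minimal sketch of that pattern (the types are illustrative; the real wrapper formats `PartitionedFile`s):

```rust
use std::fmt;

// Illustrative stand-in for `FileGroupsDisplay`: a borrowing newtype whose
// `Display` impl renders each group in brackets, comma-separated.
struct GroupsDisplay<'a>(&'a [Vec<String>]);

impl fmt::Display for GroupsDisplay<'_> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{} groups: [", self.0.len())?;
        for (i, group) in self.0.iter().enumerate() {
            if i > 0 {
                write!(f, ", ")?;
            }
            write!(f, "[{}]", group.join(", "))?;
        }
        write!(f, "]")
    }
}
```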
@@ -497,7 +449,7 @@ fn build_row_group_predicate(
 fn read_partition(
     object_store: &dyn ObjectStore,
     partition_index: usize,
-    partition: ParquetPartition,
+    partition: Vec<PartitionedFile>,
     metrics: ExecutionPlanMetricsSet,
     projection: &[usize],
     predicate_builder: &Option<PruningPredicate>,
@@ -506,8 +458,7 @@ fn read_partition(
     limit: Option<usize>,
 ) -> Result<()> {
     let mut total_rows = 0;
-    let all_files = partition.file_partition.files;
-    'outer: for partitioned_file in all_files {
+    'outer: for partitioned_file in partition {
         let file_metrics = ParquetFileMetrics::new(
             partition_index,
             &*partitioned_file.file_meta.path(),
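
`read_partition` keeps its `'outer` label so that, once `total_rows` reaches the optional `limit`, it can break out of both the per-batch loop and the per-file loop in one jump. A self-contained sketch of that labeled-break pattern (the data shape here is illustrative):

```rust
// Sketch of the early-exit pattern used above: `rows_per_batch[file][batch]`
// stands in for the record batches read from each Parquet file in a group.
fn count_up_to(rows_per_batch: &[Vec<usize>], limit: usize) -> usize {
    let mut total_rows = 0;
    'outer: for file in rows_per_batch {
        for rows in file {
            total_rows += rows;
            if total_rows >= limit {
                // Exits both loops at once, like `read_partition` does.
                break 'outer;
            }
        }
    }
    total_rows
}
```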