Skip to content

Commit 69ba713

Browse files
authored
#2109 schema infer max (#2139)
* Set a default maximum record count for schema inference * Fix unrelated "error: format argument must be a string literal" failure during `cargo test`
1 parent fa5cef8 commit 69ba713

File tree

4 files changed

+22
-8
lines changed

4 files changed

+22
-8
lines changed

datafusion/core/src/datasource/file_format/csv.rs

+3-2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ use async_trait::async_trait;
2626
use futures::StreamExt;
2727

2828
use super::FileFormat;
29+
use crate::datasource::file_format::DEFAULT_SCHEMA_INFER_MAX_RECORD;
2930
use crate::error::Result;
3031
use crate::logical_plan::Expr;
3132
use crate::physical_plan::file_format::{CsvExec, FileScanConfig};
@@ -46,7 +47,7 @@ pub struct CsvFormat {
4647
impl Default for CsvFormat {
4748
fn default() -> Self {
4849
Self {
49-
schema_infer_max_rec: None,
50+
schema_infer_max_rec: Some(DEFAULT_SCHEMA_INFER_MAX_RECORD),
5051
has_header: true,
5152
delimiter: b',',
5253
}
@@ -55,7 +56,7 @@ impl Default for CsvFormat {
5556

5657
impl CsvFormat {
5758
/// Set a limit in terms of records to scan to infer the schema
58-
/// - default to `None` (no limit)
59+
/// - defaults to `DEFAULT_SCHEMA_INFER_MAX_RECORD`; pass `None` to scan without a limit
5960
pub fn with_schema_infer_max_rec(mut self, max_rec: Option<usize>) -> Self {
6061
self.schema_infer_max_rec = max_rec;
6162
self

datafusion/core/src/datasource/file_format/json.rs

+11-2
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ use futures::StreamExt;
3030

3131
use super::FileFormat;
3232
use super::FileScanConfig;
33+
use crate::datasource::file_format::DEFAULT_SCHEMA_INFER_MAX_RECORD;
3334
use crate::error::Result;
3435
use crate::logical_plan::Expr;
3536
use crate::physical_plan::file_format::NdJsonExec;
@@ -40,14 +41,22 @@ use datafusion_data_access::object_store::{ObjectReader, ObjectReaderStream};
4041
/// The default file extension of json files
4142
pub const DEFAULT_JSON_EXTENSION: &str = ".json";
4243
/// New line delimited JSON `FileFormat` implementation.
43-
#[derive(Debug, Default)]
44+
#[derive(Debug)]
4445
pub struct JsonFormat {
4546
schema_infer_max_rec: Option<usize>,
4647
}
4748

49+
impl Default for JsonFormat {
50+
fn default() -> Self {
51+
Self {
52+
schema_infer_max_rec: Some(DEFAULT_SCHEMA_INFER_MAX_RECORD),
53+
}
54+
}
55+
}
56+
4857
impl JsonFormat {
4958
/// Set a limit in terms of records to scan to infer the schema
50-
/// - defaults to `None` (no limit)
59+
/// - defaults to `DEFAULT_SCHEMA_INFER_MAX_RECORD`; pass `None` to scan without a limit
5160
pub fn with_schema_infer_max_rec(mut self, max_rec: Option<usize>) -> Self {
5261
self.schema_infer_max_rec = max_rec;
5362
self

datafusion/core/src/datasource/file_format/mod.rs

+3
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
1717

1818
//! Module containing helper methods for the various file formats
1919
20+
/// Default maximum number of records to scan when inferring a schema.
21+
pub const DEFAULT_SCHEMA_INFER_MAX_RECORD: usize = 1000;
22+
2023
pub mod avro;
2124
pub mod csv;
2225
pub mod json;

datafusion/core/src/execution/options.rs

+5-4
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use std::sync::Arc;
2121

2222
use arrow::datatypes::{Schema, SchemaRef};
2323

24+
use crate::datasource::file_format::DEFAULT_SCHEMA_INFER_MAX_RECORD;
2425
use crate::datasource::{
2526
file_format::{
2627
avro::{AvroFormat, DEFAULT_AVRO_EXTENSION},
@@ -44,7 +45,7 @@ pub struct CsvReadOptions<'a> {
4445
/// An optional schema representing the CSV files. If None, CSV reader will try to infer it
4546
/// based on data in file.
4647
pub schema: Option<&'a Schema>,
47-
/// Max number of rows to read from CSV files for schema inference if needed. Defaults to 1000.
48+
/// Max number of rows to read from CSV files for schema inference if needed. Defaults to `DEFAULT_SCHEMA_INFER_MAX_RECORD`.
4849
pub schema_infer_max_records: usize,
4950
/// File extension; only files with this extension are selected for data input.
5051
/// Defaults to DEFAULT_CSV_EXTENSION.
@@ -65,7 +66,7 @@ impl<'a> CsvReadOptions<'a> {
6566
Self {
6667
has_header: true,
6768
schema: None,
68-
schema_infer_max_records: 1000,
69+
schema_infer_max_records: DEFAULT_SCHEMA_INFER_MAX_RECORD,
6970
delimiter: b',',
7071
file_extension: DEFAULT_CSV_EXTENSION,
7172
table_partition_cols: vec![],
@@ -234,7 +235,7 @@ pub struct NdJsonReadOptions<'a> {
234235
/// The data source schema.
235236
pub schema: Option<SchemaRef>,
236237

237-
/// Max number of rows to read from CSV files for schema inference if needed. Defaults to 1000.
238+
/// Max number of rows to read from JSON files for schema inference if needed. Defaults to `DEFAULT_SCHEMA_INFER_MAX_RECORD`.
238239
pub schema_infer_max_records: usize,
239240

240241
/// File extension; only files with this extension are selected for data input.
@@ -248,7 +249,7 @@ impl<'a> Default for NdJsonReadOptions<'a> {
248249
fn default() -> Self {
249250
Self {
250251
schema: None,
251-
schema_infer_max_records: 1000,
252+
schema_infer_max_records: DEFAULT_SCHEMA_INFER_MAX_RECORD,
252253
file_extension: DEFAULT_JSON_EXTENSION,
253254
table_partition_cols: vec![],
254255
}

0 commit comments

Comments
 (0)