From 89179ee74569a40c2ca433aea8a851d4a75fee06 Mon Sep 17 00:00:00 2001 From: Jeffrey Barczewski Date: Mon, 1 Aug 2022 17:50:16 -0500 Subject: [PATCH 1/2] add cat --csv-no-header option `pqrs cat --csv-no-header parquetFile` will output a CSV without the header. --- README.md | 8 +++++++- src/commands/cat.rs | 10 ++++++++-- src/utils.rs | 27 +++++++++++++++++++++++++++ tests/integration.rs | 19 +++++++++++++++++-- 4 files changed, 59 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index d39558c..bdd1fd1 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ SUBCOMMANDS: ### Subcommand: cat Prints the contents of the given files and folders. Recursively traverses and prints all the files if the input is a directory. -Supports json-like, json or CSV format. Use `--json` for JSON output and `--csv` for CSV output. +Supports json-like, json or CSV format. Use `--json` for JSON output, `--csv` for CSV output with column names in the first row, and `--csv-no-header` for CSV output without the column names row. ```shell ❯ pqrs cat data/cities.parquet @@ -97,6 +97,12 @@ foo,bar 10,20 ``` +```shell +❯ pqrs cat data/simple.parquet --csv-no-header +1,2 +10,20 +``` + NOTE: CSV format is not supported for files that contain Struct or Byte fields. 
### Subcommand: head diff --git a/src/commands/cat.rs b/src/commands/cat.rs index b8be085..d3c3f96 100644 --- a/src/commands/cat.rs +++ b/src/commands/cat.rs @@ -13,11 +13,15 @@ use walkdir::WalkDir; #[derive(Parser, Debug)] pub struct CatCommandArgs { /// Use CSV format for printing - #[clap(short, long, conflicts_with = "json")] + #[clap(short, long, conflicts_with_all = &["csv-no-header","json"])] csv: bool, + /// Use CSV format without a header for printing + #[clap(long = "csv-no-header", conflicts_with_all = &["csv","json"])] + csv_no_header: bool, + /// Use JSON lines format for printing - #[clap(short, long, conflicts_with = "csv")] + #[clap(short, long, conflicts_with_all = &["csv", "csv-no-header"])] json: bool, /// Parquet files or folders to read from @@ -29,6 +33,8 @@ pub(crate) fn execute(opts: CatCommandArgs) -> Result<(), PQRSError> { Formats::Json } else if opts.csv { Formats::Csv + } else if opts.csv_no_header { + Formats::CsvNoHeader } else { Formats::Default }; diff --git a/src/utils.rs b/src/utils.rs index e1d66f6..4441261 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -26,6 +26,7 @@ static ONE_PI_B: i64 = ONE_TI_B * 1024; pub enum Formats { Default, Csv, + CsvNoHeader, Json, } @@ -119,6 +120,31 @@ pub fn print_rows( let batch_reader = arrow_reader.get_record_reader(8192)?; let mut writer = arrow::csv::Writer::new(std::io::stdout()); + for maybe_batch in batch_reader { + if left == Some(0) { + break; + } + + let mut batch = maybe_batch?; + if let Some(l) = left { + if batch.num_rows() <= l { + left = Some(l - batch.num_rows()); + } else { + let n = min(batch.num_rows(), l); + batch = batch.slice(0, n); + left = Some(0); + } + }; + + writer.write(&batch)?; + } + } + Formats::CsvNoHeader => { + let mut arrow_reader = ParquetFileArrowReader::new(parquet_reader); + let batch_reader = arrow_reader.get_record_reader(8192)?; + let writer_builder = arrow::csv::WriterBuilder::new().has_headers(false); + let mut writer = 
writer_builder.build(std::io::stdout()); + for maybe_batch in batch_reader { if left == Some(0) { break; @@ -263,6 +289,7 @@ fn print_row(row: &Row, format: Formats) { match format { Formats::Default => println!("{}", row), Formats::Csv => println!("Unsupported! {}", row), + Formats::CsvNoHeader => println!("Unsupported! {}", row), Formats::Json => println!("{}", row.to_json_value()), } } diff --git a/tests/integration.rs b/tests/integration.rs index 9fa95d1..691aa37 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -14,6 +14,8 @@ static CAT_JSON_OUTPUT: &str = r#"{"continent":"Europe","country":{"name":"Franc static CAT_CSV_OUTPUT: &str = r#"foo,bar 1,2 10,20"#; +static CAT_CSV_NO_HEADER_OUTPUT: &str = r#"1,2 +10,20"#; static SCHEMA_OUTPUT: &str = r#"message hive_schema { OPTIONAL BYTE_ARRAY continent (UTF8); OPTIONAL group country { @@ -32,8 +34,8 @@ static SAMPLE_PARTIAL_OUTPUT_2: &str = "country: {name:"; mod integration { // make sure any new commands added have a corresponding integration test here! 
use crate::{ - CAT_CSV_OUTPUT, CAT_JSON_OUTPUT, CAT_OUTPUT, CITIES_PARQUET_PATH, - MERGED_FILE_NAME, PEMS_1_PARQUET_PATH, PEMS_2_PARQUET_PATH, + CAT_CSV_NO_HEADER_OUTPUT, CAT_CSV_OUTPUT, CAT_JSON_OUTPUT, CAT_OUTPUT, + CITIES_PARQUET_PATH, MERGED_FILE_NAME, PEMS_1_PARQUET_PATH, PEMS_2_PARQUET_PATH, SAMPLE_PARTIAL_OUTPUT_1, SAMPLE_PARTIAL_OUTPUT_2, SCHEMA_OUTPUT, SIMPLE_PARQUET_PATH, }; @@ -74,6 +76,19 @@ mod integration { Ok(()) } + #[test] + fn validate_cat_csv_no_header() -> Result<(), Box> { + let mut cmd = Command::cargo_bin("pqrs")?; + cmd.arg("cat") + .arg(SIMPLE_PARQUET_PATH) + .arg("--csv-no-header"); + cmd.assert() + .success() + .stdout(predicate::str::starts_with(CAT_CSV_NO_HEADER_OUTPUT)); + + Ok(()) + } + #[test] fn validate_cat_directory() -> Result<(), Box> { let mut cmd = Command::cargo_bin("pqrs")?; From beb7187379ba0ab01712cdaba09893756c9e35bd Mon Sep 17 00:00:00 2001 From: Jeffrey Barczewski Date: Tue, 2 Aug 2022 13:57:00 -0500 Subject: [PATCH 2/2] make --no-header an additional argument to --csv Instead of combining the argument as `--csv-no-header`, make `--no-header` an additional flag for `--csv`. 
For example: `pqrs cat --csv --no-header parquetFile` --- README.md | 2 +- src/commands/cat.rs | 10 +++++----- tests/integration.rs | 3 ++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index bdd1fd1..68991e2 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,7 @@ foo,bar ``` ```shell -❯ pqrs cat data/simple.parquet --csv-no-header +❯ pqrs cat data/simple.parquet --csv --no-header 1,2 10,20 ``` diff --git a/src/commands/cat.rs b/src/commands/cat.rs index d3c3f96..8ccfaaa 100644 --- a/src/commands/cat.rs +++ b/src/commands/cat.rs @@ -13,15 +13,15 @@ use walkdir::WalkDir; #[derive(Parser, Debug)] pub struct CatCommandArgs { /// Use CSV format for printing - #[clap(short, long, conflicts_with_all = &["csv-no-header","json"])] + #[clap(short, long, conflicts_with = "json")] csv: bool, /// Use CSV format without a header for printing - #[clap(long = "csv-no-header", conflicts_with_all = &["csv","json"])] + #[clap(long = "no-header", requires = "csv", conflicts_with = "json")] csv_no_header: bool, /// Use JSON lines format for printing - #[clap(short, long, conflicts_with_all = &["csv", "csv-no-header"])] + #[clap(short, long, conflicts_with = "csv")] json: bool, /// Parquet files or folders to read from @@ -31,10 +31,10 @@ pub struct CatCommandArgs { pub(crate) fn execute(opts: CatCommandArgs) -> Result<(), PQRSError> { let format = if opts.json { Formats::Json - } else if opts.csv { - Formats::Csv } else if opts.csv_no_header { Formats::CsvNoHeader + } else if opts.csv { + Formats::Csv } else { Formats::Default }; diff --git a/tests/integration.rs b/tests/integration.rs index 691aa37..2e4e9aa 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -81,7 +81,8 @@ mod integration { let mut cmd = Command::cargo_bin("pqrs")?; cmd.arg("cat") .arg(SIMPLE_PARQUET_PATH) - .arg("--csv-no-header"); + .arg("--csv") + .arg("--no-header"); cmd.assert() .success() .stdout(predicate::str::starts_with(CAT_CSV_NO_HEADER_OUTPUT));