Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add cat --csv --no-header option #31

Merged
merged 6 commits into from
Aug 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 10 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
* Built using the Rust implementation of [Parquet](https://github.com/apache/arrow-rs/tree/master/parquet) and [Arrow](https://github.com/apache/arrow-rs/tree/master/arrow)
* `pqrs` roughly means "parquet-tools in rust"


## Installation

### Recommended Method
Expand All @@ -21,6 +20,7 @@ For macOS users, `pqrs` is available as a homebrew tap.
```shell
brew install manojkarthick/tap/pqrs
```

NOTE: For users upgrading from v0.2 or earlier, the location of the `pqrs` homebrew tap has been updated.
To update to v0.2.1+, please uninstall using `brew uninstall pqrs` and use the above command to re-install.

Expand Down Expand Up @@ -73,8 +73,8 @@ SUBCOMMANDS:

### Subcommand: cat

Prints the contents of the given files and folders. Recursively traverses and prints all the files if the input is a directory.
Supports json-like, json or CSV format. Use `--json` for JSON output and `--csv` for CSV output.
Prints the contents of the given files and folders. Recursively traverses and prints all the files if the input is a directory.
Supports json-like, json or CSV format. Use `--json` for JSON output, `--csv` for CSV output with column names in the first row, and `--csv --no-header` for CSV output without the column names row.

```shell
❯ pqrs cat data/cities.parquet
Expand All @@ -97,6 +97,12 @@ foo,bar
10,20
```

```shell
❯ pqrs cat data/simple.parquet --csv --no-header
1,2
10,20
```

NOTE: CSV format is not supported for files that contain Struct or Byte fields.

### Subcommand: head
Expand Down Expand Up @@ -207,7 +213,7 @@ statistics: {min: [69, 117, 114, 111, 112, 101], max: [78, 111, 114, 116, 104, 3
```

```shell
❯ pqrs schema --json data/cities.parquet
❯ pqrs schema --json data/cities.parquet
{"version":1,"num_rows":3,"created_by":"parquet-mr version 1.5.0-cdh5.7.0 (build ${buildNumber})","metadata":null,"columns":[{"optional":"true","physical_type":"BYTE_ARRAY","name":"continent","path":"continent","converted_type":"UTF8"},{"name":"name","converted_type":"UTF8","path":"country.name","physical_type":"BYTE_ARRAY","optional":"true"},{"optional":"true","name":"array_element","physical_type":"BYTE_ARRAY","path":"country.city.bag.array_element","converted_type":"UTF8"}],"message":"message hive_schema {\n OPTIONAL BYTE_ARRAY continent (UTF8);\n OPTIONAL group country {\n OPTIONAL BYTE_ARRAY name (UTF8);\n OPTIONAL group city (LIST) {\n REPEATED group bag {\n OPTIONAL BYTE_ARRAY array_element (UTF8);\n }\n }\n }\n}\n"}

```
Expand All @@ -232,8 +238,6 @@ File Name: data/pems-1.snappy.parquet
Compressed Size: 12 KiB
```



### TODO

* [ ] Test on Windows
6 changes: 6 additions & 0 deletions src/commands/cat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ pub struct CatCommandArgs {
#[clap(short, long, conflicts_with = "json")]
csv: bool,

/// Use CSV format without a header for printing
#[clap(long = "no-header", requires = "csv", conflicts_with = "json")]
csv_no_header: bool,

/// Use JSON lines format for printing
#[clap(short, long, conflicts_with = "csv")]
json: bool,
Expand All @@ -27,6 +31,8 @@ pub struct CatCommandArgs {
pub(crate) fn execute(opts: CatCommandArgs) -> Result<(), PQRSError> {
let format = if opts.json {
Formats::Json
} else if opts.csv_no_header {
Formats::CsvNoHeader
} else if opts.csv {
Formats::Csv
} else {
Expand Down
27 changes: 27 additions & 0 deletions src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ static ONE_PI_B: i64 = ONE_TI_B * 1024;
/// Output formats selectable when printing rows.
pub enum Formats {
/// Prints each row using its `Display` implementation.
Default,
/// CSV output written to stdout through `arrow::csv::Writer`.
Csv,
/// CSV output written to stdout through an `arrow::csv::WriterBuilder`
/// configured with `has_headers(false)`, i.e. without the column-name row.
CsvNoHeader,
/// Prints each row as the value returned by `to_json_value()`.
Json,
}

Expand Down Expand Up @@ -119,6 +120,31 @@ pub fn print_rows(
let batch_reader = arrow_reader.get_record_reader(8192)?;
let mut writer = arrow::csv::Writer::new(std::io::stdout());

for maybe_batch in batch_reader {
if left == Some(0) {
break;
}

let mut batch = maybe_batch?;
if let Some(l) = left {
if batch.num_rows() <= l {
left = Some(l - batch.num_rows());
} else {
let n = min(batch.num_rows(), l);
batch = batch.slice(0, n);
left = Some(0);
}
};

writer.write(&batch)?;
}
}
Formats::CsvNoHeader => {
let mut arrow_reader = ParquetFileArrowReader::new(parquet_reader);
let batch_reader = arrow_reader.get_record_reader(8192)?;
let writer_builder = arrow::csv::WriterBuilder::new().has_headers(false);
let mut writer = writer_builder.build(std::io::stdout());

for maybe_batch in batch_reader {
if left == Some(0) {
break;
Expand Down Expand Up @@ -263,6 +289,7 @@ fn print_row(row: &Row, format: Formats) {
match format {
Formats::Default => println!("{}", row),
Formats::Csv => println!("Unsupported! {}", row),
Formats::CsvNoHeader => println!("Unsupported! {}", row),
Formats::Json => println!("{}", row.to_json_value()),
}
}
Expand Down
20 changes: 18 additions & 2 deletions tests/integration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ static CAT_JSON_OUTPUT: &str = r#"{"continent":"Europe","country":{"name":"Franc
static CAT_CSV_OUTPUT: &str = r#"foo,bar
1,2
10,20"#;
// Expected leading stdout of `cat --csv --no-header` on the simple parquet
// fixture: the same data rows as CAT_CSV_OUTPUT, minus the `foo,bar` header line.
static CAT_CSV_NO_HEADER_OUTPUT: &str = r#"1,2
10,20"#;
static SCHEMA_OUTPUT: &str = r#"message hive_schema {
OPTIONAL BYTE_ARRAY continent (UTF8);
OPTIONAL group country {
Expand All @@ -32,8 +34,8 @@ static SAMPLE_PARTIAL_OUTPUT_2: &str = "country: {name:";
mod integration {
// make sure any new commands added have a corresponding integration test here!
use crate::{
CAT_CSV_OUTPUT, CAT_JSON_OUTPUT, CAT_OUTPUT, CITIES_PARQUET_PATH,
MERGED_FILE_NAME, PEMS_1_PARQUET_PATH, PEMS_2_PARQUET_PATH,
CAT_CSV_NO_HEADER_OUTPUT, CAT_CSV_OUTPUT, CAT_JSON_OUTPUT, CAT_OUTPUT,
CITIES_PARQUET_PATH, MERGED_FILE_NAME, PEMS_1_PARQUET_PATH, PEMS_2_PARQUET_PATH,
SAMPLE_PARTIAL_OUTPUT_1, SAMPLE_PARTIAL_OUTPUT_2, SCHEMA_OUTPUT,
SIMPLE_PARQUET_PATH,
};
Expand Down Expand Up @@ -74,6 +76,20 @@ mod integration {
Ok(())
}

/// Runs `pqrs cat <simple parquet> --csv --no-header` and checks that the
/// command succeeds and that stdout begins with the header-less CSV rows.
#[test]
fn validate_cat_csv_no_header() -> Result<(), Box<dyn std::error::Error>> {
    // The output is matched by prefix only, not by exact equality.
    let expected_prefix = predicate::str::starts_with(CAT_CSV_NO_HEADER_OUTPUT);

    let mut pqrs = Command::cargo_bin("pqrs")?;
    pqrs.arg("cat");
    pqrs.arg(SIMPLE_PARQUET_PATH);
    pqrs.arg("--csv");
    pqrs.arg("--no-header");

    let outcome = pqrs.assert();
    outcome.success().stdout(expected_prefix);

    Ok(())
}

#[test]
fn validate_cat_directory() -> Result<(), Box<dyn std::error::Error>> {
let mut cmd = Command::cargo_bin("pqrs")?;
Expand Down