From db239e5b3aa05985b0149187c8b93b88e2285b48 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 6 Aug 2024 17:13:15 -0400 Subject: [PATCH] Add (more) Parquet Metadata Documentation (#6184) * Minor: Add (more) Parquet Metadata Documentation * fix clippy --- parquet/src/file/metadata/mod.rs | 61 ++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 86c673bbdbc6..45ef0c546ffe 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -33,6 +33,67 @@ //! * [`ColumnChunkMetaData`]: Metadata for each column chunk (primitive leaf) //! within a Row Group including encoding and compression information, //! number of values, statistics, etc. +//! +//! # APIs for working with Parquet Metadata +//! +//! The Parquet readers and writers in this crate read and write +//! metadata into parquet files. To work with metadata directly, +//! the following APIs are available. +//! +//! Reading: +//! * Read from bytes to `ParquetMetaData`: [`decode_footer`] +//! and [`decode_metadata`] +//! * Read from an `async` source to `ParquetMetaData`: [`MetadataLoader`] +//! +//! [`MetadataLoader`]: https://docs.rs/parquet/latest/parquet/arrow/async_reader/struct.MetadataLoader.html +//! [`decode_footer`]: crate::file::footer::decode_footer +//! [`decode_metadata`]: crate::file::footer::decode_metadata +//! +//! Writing: +//! * Write `ParquetMetaData` to bytes in memory: Not yet supported (see [#6002]) +//! * Write `ParquetMetaData` to an async target: Not yet supported +//! +//! [#6002]: https://github.com/apache/arrow-rs/issues/6002 +//! +//! # Metadata Encodings and Structures +//! +//! There are three different encodings of Parquet Metadata in this crate: +//! +//! 1. `bytes`: encoded with the Thrift TCompactProtocol as defined in +//! [parquet.thrift] +//! +//! 2. [`format`]: Rust structures automatically generated by the thrift compiler +//! 
from [parquet.thrift]. These structures are low level and mirror +//! the thrift definitions. +//! +//! 3. [`file::metadata`] (this module): Easier to use Rust structures +//! with a more idiomatic API. Note that, confusingly, some but not all +//! of these structures have the same name as the [`format`] structures. +//! +//! [`format`]: crate::format +//! [`file::metadata`]: crate::file::metadata +//! [parquet.thrift]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift +//! +//! Graphically, this is how the different structures relate to each other: +//! +//! ```text +//! ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ +//! ┌──────────────┐ │ ┌───────────────────────┐ │ +//! │ │ ColumnIndex │ ││ ParquetMetaData │ +//! └──────────────┘ │ └───────────────────────┘ │ +//! ┌──────────────┐ │ ┌────────────────┐ │┌───────────────────────┐ +//! │ ..0x24.. │ ◀────▶ │ OffsetIndex │ │ ◀────▶ │ ParquetMetaData │ │ +//! └──────────────┘ │ └────────────────┘ │└───────────────────────┘ +//! ... │ ... │ +//! │ ┌──────────────────┐ │ ┌──────────────────┐ +//! bytes │ FileMetaData* │ │ │ FileMetaData* │ │ +//! (thrift encoded) │ └──────────────────┘ │ └──────────────────┘ +//! ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ +//! +//! format::meta structures file::metadata structures +//! +//! * Same name, different struct +//! ``` mod memory; use std::ops::Range;