Skip to content

Commit 53a31d1

Browse files
authored
ORC writer API changes for granular statistics (#10058)
Depends on #10041. The erstwhile ORC writer API exposed only a binary choice to choose the level of statistics: ENABLED/DISABLED. This commit allows the ORC writer to further choose whether statistics are collected at the ROW_GROUP or STRIPE level. This commit also includes the relevant changes to `java/` and `python/`. Authors: - MithunR (https://github.com/mythrocks) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Jason Lowe (https://github.com/jlowe) - GALI PREM SAGAR (https://github.com/galipremsagar) - Christopher Harris (https://github.com/cwharris) - Vukasin Milovanovic (https://github.com/vuule) URL: #10058
1 parent 1b93126 commit 53a31d1

File tree

9 files changed

+200
-64
lines changed

9 files changed

+200
-64
lines changed

cpp/benchmarks/io/orc/orc_writer_benchmark.cpp

+14-5
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
2+
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -14,6 +14,7 @@
1414
* limitations under the License.
1515
*/
1616

17+
#include "cudf/io/types.hpp"
1718
#include <benchmark/benchmark.h>
1819

1920
#include <benchmarks/common/generate_benchmark_input.hpp>
@@ -65,8 +66,14 @@ void BM_orc_write_varying_inout(benchmark::State& state)
6566

6667
void BM_orc_write_varying_options(benchmark::State& state)
6768
{
68-
auto const compression = static_cast<cudf::io::compression_type>(state.range(0));
69-
auto const enable_stats = state.range(1) != 0;
69+
auto const compression = static_cast<cudf::io::compression_type>(state.range(0));
70+
auto const stats_freq = [&] {
71+
switch (state.range(2)) {
72+
case 0: return cudf::io::STATISTICS_NONE;
73+
case 1: return cudf::io::ORC_STATISTICS_STRIPE;
74+
default: return cudf::io::ORC_STATISTICS_ROW_GROUP;
75+
}
76+
}();
7077

7178
auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
7279
int32_t(type_group_id::FLOATING_POINT),
@@ -85,7 +92,7 @@ void BM_orc_write_varying_options(benchmark::State& state)
8592
cudf_io::orc_writer_options const options =
8693
cudf_io::orc_writer_options::builder(source_sink.make_sink_info(), view)
8794
.compression(compression)
88-
.enable_statistics(enable_stats);
95+
.enable_statistics(stats_freq);
8996
cudf_io::write_orc(options);
9097
}
9198

@@ -113,6 +120,8 @@ BENCHMARK_DEFINE_F(OrcWrite, writer_options)
113120
BENCHMARK_REGISTER_F(OrcWrite, writer_options)
114121
->ArgsProduct({{int32_t(cudf::io::compression_type::NONE),
115122
int32_t(cudf::io::compression_type::SNAPPY)},
116-
{0, 1}})
123+
{int32_t{cudf::io::STATISTICS_NONE},
124+
int32_t{cudf::io::ORC_STATISTICS_STRIPE},
125+
int32_t{cudf::io::ORC_STATISTICS_ROW_GROUP}}})
117126
->Unit(benchmark::kMillisecond)
118127
->UseManualTime();

cpp/include/cudf/io/orc.hpp

+62-22
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
2+
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -434,6 +434,18 @@ table_with_metadata read_orc(
434434
*/
435435
class orc_writer_options_builder;
436436

437+
/**
438+
* @brief Constants to disambiguate statistics terminology for ORC.
439+
*
440+
* ORC refers to its finest granularity of row-grouping as "row group",
441+
* which corresponds to Parquet "pages".
442+
* Similarly, ORC's "stripe" corresponds to a Parquet "row group".
443+
* The following constants disambiguate the terminology for the statistics
444+
* collected at each level.
445+
*/
446+
static constexpr statistics_freq ORC_STATISTICS_STRIPE = statistics_freq::STATISTICS_ROWGROUP;
447+
static constexpr statistics_freq ORC_STATISTICS_ROW_GROUP = statistics_freq::STATISTICS_PAGE;
448+
437449
/**
438450
* @brief Settings to use for `write_orc()`.
439451
*/
@@ -442,8 +454,8 @@ class orc_writer_options {
442454
sink_info _sink;
443455
// Specify the compression format to use
444456
compression_type _compression = compression_type::AUTO;
445-
// Enable writing column statistics
446-
bool _enable_statistics = true;
457+
// Specify frequency of statistics collection
458+
statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
447459
// Maximum size of each stripe (unless smaller than a single row group)
448460
size_t _stripe_size_bytes = default_stripe_size_bytes;
449461
// Maximum number of rows in stripe (unless smaller than a single row group)
@@ -501,7 +513,15 @@ class orc_writer_options {
501513
/**
502514
* @brief Whether writing column statistics is enabled/disabled.
503515
*/
504-
[[nodiscard]] bool is_enabled_statistics() const { return _enable_statistics; }
516+
[[nodiscard]] bool is_enabled_statistics() const
517+
{
518+
return _stats_freq != statistics_freq::STATISTICS_NONE;
519+
}
520+
521+
/**
522+
* @brief Returns frequency of statistics collection.
523+
*/
524+
[[nodiscard]] statistics_freq get_statistics_freq() const { return _stats_freq; }
505525

506526
/**
507527
* @brief Returns maximum stripe size, in bytes.
@@ -550,11 +570,16 @@ class orc_writer_options {
550570
void set_compression(compression_type comp) { _compression = comp; }
551571

552572
/**
553-
* @brief Enable/Disable writing column statistics.
573+
* @brief Choose granularity of statistics collection.
554574
*
555-
* @param val Boolean value to enable/disable statistics.
575+
* The granularity can be set to:
576+
* - cudf::io::STATISTICS_NONE: No statistics are collected.
577+
* - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.
578+
* - cudf::io::ORC_STATISTICS_ROWGROUP: Statistics are collected for each ORC row group.
579+
*
580+
* @param val Frequency of statistics collection.
556581
*/
557-
void enable_statistics(bool val) { _enable_statistics = val; }
582+
void enable_statistics(statistics_freq val) { _stats_freq = val; }
558583

559584
/**
560585
* @brief Sets the maximum stripe size, in bytes.
@@ -647,14 +672,19 @@ class orc_writer_options_builder {
647672
}
648673

649674
/**
650-
* @brief Enable/Disable writing column statistics.
675+
* @brief Choose granularity of column statistics to be written
676+
*
677+
* The granularity can be set to:
678+
* - cudf::io::STATISTICS_NONE: No statistics are collected.
679+
* - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.
680+
* - cudf::io::ORC_STATISTICS_ROWGROUP: Statistics are collected for each ORC row group.
651681
*
652-
* @param val Boolean value to enable/disable.
682+
* @param val Level of statistics collection.
653683
* @return this for chaining.
654684
*/
655-
orc_writer_options_builder& enable_statistics(bool val)
685+
orc_writer_options_builder& enable_statistics(statistics_freq val)
656686
{
657-
options._enable_statistics = val;
687+
options._stats_freq = val;
658688
return *this;
659689
}
660690

@@ -775,8 +805,8 @@ class chunked_orc_writer_options {
775805
sink_info _sink;
776806
// Specify the compression format to use
777807
compression_type _compression = compression_type::AUTO;
778-
// Enable writing column statistics
779-
bool _enable_statistics = true;
808+
// Specify granularity of statistics collection
809+
statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
780810
// Maximum size of each stripe (unless smaller than a single row group)
781811
size_t _stripe_size_bytes = default_stripe_size_bytes;
782812
// Maximum number of rows in stripe (unless smaller than a single row group)
@@ -825,9 +855,9 @@ class chunked_orc_writer_options {
825855
[[nodiscard]] compression_type get_compression() const { return _compression; }
826856

827857
/**
828-
* @brief Whether writing column statistics is enabled/disabled.
858+
* @brief Returns granularity of statistics collection.
829859
*/
830-
[[nodiscard]] bool is_enabled_statistics() const { return _enable_statistics; }
860+
[[nodiscard]] statistics_freq get_statistics_freq() const { return _stats_freq; }
831861

832862
/**
833863
* @brief Returns maximum stripe size, in bytes.
@@ -871,11 +901,16 @@ class chunked_orc_writer_options {
871901
void set_compression(compression_type comp) { _compression = comp; }
872902

873903
/**
874-
* @brief Enable/Disable writing column statistics.
904+
* @brief Choose granularity of statistics collection
905+
*
906+
* The granularity can be set to:
907+
* - cudf::io::STATISTICS_NONE: No statistics are collected.
908+
* - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.
909+
* - cudf::io::ORC_STATISTICS_ROWGROUP: Statistics are collected for each ORC row group.
875910
*
876-
* @param val Boolean value to enable/disable.
911+
* @param val Frequency of statistics collection.
877912
*/
878-
void enable_statistics(bool val) { _enable_statistics = val; }
913+
void enable_statistics(statistics_freq val) { _stats_freq = val; }
879914

880915
/**
881916
* @brief Sets the maximum stripe size, in bytes.
@@ -958,14 +993,19 @@ class chunked_orc_writer_options_builder {
958993
}
959994

960995
/**
961-
* @brief Enable/Disable writing column statistics.
996+
* @brief Choose granularity of statistics collection
997+
*
998+
* The granularity can be set to:
999+
* - cudf::io::STATISTICS_NONE: No statistics are collected.
1000+
* - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.
1001+
* - cudf::io::ORC_STATISTICS_ROWGROUP: Statistics are collected for each ORC row group.
9621002
*
963-
* @param val Boolean value to enable/disable.
1003+
* @param val Frequency of statistics collection.
9641004
* @return this for chaining.
9651005
*/
966-
chunked_orc_writer_options_builder& enable_statistics(bool val)
1006+
chunked_orc_writer_options_builder& enable_statistics(statistics_freq val)
9671007
{
968-
options._enable_statistics = val;
1008+
options._stats_freq = val;
9691009
return *this;
9701010
}
9711011

cpp/src/io/orc/writer_impl.cu

+27-17
Original file line numberDiff line numberDiff line change
@@ -1063,15 +1063,15 @@ void set_stat_desc_leaf_cols(device_span<orc_column_device_view const> columns,
10631063
}
10641064

10651065
writer::impl::encoded_statistics writer::impl::gather_statistic_blobs(
1066-
bool are_statistics_enabled,
1066+
statistics_freq stats_freq,
10671067
orc_table_view const& orc_table,
10681068
file_segmentation const& segmentation)
10691069
{
1070-
auto const num_rowgroup_blobs = segmentation.rowgroups.count();
1071-
auto const num_stripe_blobs = segmentation.num_stripes() * orc_table.num_columns();
1072-
auto const num_file_blobs = orc_table.num_columns();
1073-
auto const num_stat_blobs = num_rowgroup_blobs + num_stripe_blobs + num_file_blobs;
1074-
1070+
auto const num_rowgroup_blobs = segmentation.rowgroups.count();
1071+
auto const num_stripe_blobs = segmentation.num_stripes() * orc_table.num_columns();
1072+
auto const num_file_blobs = orc_table.num_columns();
1073+
auto const num_stat_blobs = num_rowgroup_blobs + num_stripe_blobs + num_file_blobs;
1074+
auto const are_statistics_enabled = stats_freq != statistics_freq::STATISTICS_NONE;
10751075
if (not are_statistics_enabled or num_stat_blobs == 0) { return {}; }
10761076

10771077
hostdevice_vector<stats_column_desc> stat_desc(orc_table.num_columns(), stream);
@@ -1164,17 +1164,27 @@ writer::impl::encoded_statistics writer::impl::gather_statistic_blobs(
11641164

11651165
hostdevice_vector<uint8_t> blobs(
11661166
stat_merge[num_stat_blobs - 1].start_chunk + stat_merge[num_stat_blobs - 1].num_chunks, stream);
1167-
gpu::orc_encode_statistics(
1168-
blobs.device_ptr(), stat_merge.device_ptr(), stat_chunks.data(), num_stat_blobs, stream);
1167+
// Skip rowgroup blobs when encoding, if chosen granularity is coarser than "ROW_GROUP".
1168+
auto const is_granularity_rowgroup = stats_freq == ORC_STATISTICS_ROW_GROUP;
1169+
auto const num_skip = is_granularity_rowgroup ? 0 : num_rowgroup_blobs;
1170+
gpu::orc_encode_statistics(blobs.device_ptr(),
1171+
stat_merge.device_ptr(num_skip),
1172+
stat_chunks.data() + num_skip,
1173+
num_stat_blobs - num_skip,
1174+
stream);
11691175
stat_merge.device_to_host(stream);
11701176
blobs.device_to_host(stream, true);
11711177

1172-
std::vector<ColStatsBlob> rowgroup_blobs(num_rowgroup_blobs);
1173-
for (size_t i = 0; i < num_rowgroup_blobs; i++) {
1174-
auto const stat_begin = blobs.host_ptr(rowgroup_stat_merge[i].start_chunk);
1175-
auto const stat_end = stat_begin + rowgroup_stat_merge[i].num_chunks;
1176-
rowgroup_blobs[i].assign(stat_begin, stat_end);
1177-
}
1178+
auto rowgroup_blobs = [&]() -> std::vector<ColStatsBlob> {
1179+
if (not is_granularity_rowgroup) { return {}; }
1180+
std::vector<ColStatsBlob> rowgroup_blobs(num_rowgroup_blobs);
1181+
for (size_t i = 0; i < num_rowgroup_blobs; i++) {
1182+
auto const stat_begin = blobs.host_ptr(rowgroup_stat_merge[i].start_chunk);
1183+
auto const stat_end = stat_begin + rowgroup_stat_merge[i].num_chunks;
1184+
rowgroup_blobs[i].assign(stat_begin, stat_end);
1185+
}
1186+
return rowgroup_blobs;
1187+
}();
11781188

11791189
std::vector<ColStatsBlob> stripe_blobs(num_stripe_blobs);
11801190
for (size_t i = 0; i < num_stripe_blobs; i++) {
@@ -1351,7 +1361,7 @@ writer::impl::impl(std::unique_ptr<data_sink> sink,
13511361
max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()},
13521362
row_index_stride{options.get_row_index_stride()},
13531363
compression_kind_(to_orc_compression(options.get_compression())),
1354-
enable_statistics_(options.is_enabled_statistics()),
1364+
stats_freq_(options.get_statistics_freq()),
13551365
single_write_mode(mode == SingleWriteMode::YES),
13561366
kv_meta(options.get_key_value_metadata()),
13571367
out_sink_(std::move(sink))
@@ -1372,7 +1382,7 @@ writer::impl::impl(std::unique_ptr<data_sink> sink,
13721382
max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()},
13731383
row_index_stride{options.get_row_index_stride()},
13741384
compression_kind_(to_orc_compression(options.get_compression())),
1375-
enable_statistics_(options.is_enabled_statistics()),
1385+
stats_freq_(options.get_statistics_freq()),
13761386
single_write_mode(mode == SingleWriteMode::YES),
13771387
kv_meta(options.get_key_value_metadata()),
13781388
out_sink_(std::move(sink))
@@ -1954,7 +1964,7 @@ void writer::impl::write(table_view const& table)
19541964

19551965
ProtobufWriter pbw_(&buffer_);
19561966

1957-
auto const statistics = gather_statistic_blobs(enable_statistics_, orc_table, segmentation);
1967+
auto const statistics = gather_statistic_blobs(stats_freq_, orc_table, segmentation);
19581968

19591969
// Write stripes
19601970
std::vector<std::future<void>> write_tasks;

cpp/src/io/orc/writer_impl.hpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -293,13 +293,13 @@ class writer::impl {
293293
/**
294294
* @brief Returns column statistics encoded in ORC protobuf format.
295295
*
296-
* @param are_statistics_enabled True if statistics are to be included in the output file
296+
* @param statistics_freq Frequency of statistics to be included in the output file
297297
* @param orc_table Table information to be written
298298
* @param columns List of columns
299299
* @param segmentation stripe and rowgroup ranges
300300
* @return The statistic blobs
301301
*/
302-
encoded_statistics gather_statistic_blobs(bool are_statistics_enabled,
302+
encoded_statistics gather_statistic_blobs(statistics_freq statistics_freq,
303303
orc_table_view const& orc_table,
304304
file_segmentation const& segmentation);
305305

@@ -365,8 +365,8 @@ class writer::impl {
365365
size_t compression_blocksize_ = DEFAULT_COMPRESSION_BLOCKSIZE;
366366
CompressionKind compression_kind_ = CompressionKind::NONE;
367367

368-
bool enable_dictionary_ = true;
369-
bool enable_statistics_ = true;
368+
bool enable_dictionary_ = true;
369+
statistics_freq stats_freq_ = ORC_STATISTICS_ROW_GROUP;
370370

371371
// Overall file metadata. Filled in during the process and written during write_chunked_end()
372372
cudf::io::orc::FileFooter ff;

java/src/main/native/src/TableJni.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -1733,7 +1733,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin(
17331733
chunked_orc_writer_options opts = chunked_orc_writer_options::builder(sink)
17341734
.metadata(&metadata)
17351735
.compression(static_cast<compression_type>(j_compression))
1736-
.enable_statistics(true)
1736+
.enable_statistics(ORC_STATISTICS_ROW_GROUP)
17371737
.key_value_metadata(kv_metadata)
17381738
.build();
17391739
auto writer_ptr = std::make_unique<cudf::io::orc_chunked_writer>(opts);
@@ -1776,7 +1776,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin(
17761776
chunked_orc_writer_options opts = chunked_orc_writer_options::builder(sink)
17771777
.metadata(&metadata)
17781778
.compression(static_cast<compression_type>(j_compression))
1779-
.enable_statistics(true)
1779+
.enable_statistics(ORC_STATISTICS_ROW_GROUP)
17801780
.key_value_metadata(kv_metadata)
17811781
.build();
17821782
auto writer_ptr = std::make_unique<cudf::io::orc_chunked_writer>(opts);

python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2020, NVIDIA CORPORATION.
1+
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
22

33
import io
44
import sys
@@ -74,7 +74,7 @@ def orc_reader_stripes_test(input_tuple, columns, stripes):
7474
data_handle=OrcWriter,
7575
params={
7676
"compression": [None, "snappy"],
77-
"enable_statistics": [True, False],
77+
"enable_statistics": ["NONE", "STRIPE", "ROWGROUP"],
7878
},
7979
)
8080
def orc_writer_test(pdf, compression, enable_statistics):

0 commit comments

Comments
 (0)