1
1
/*
2
- * Copyright (c) 2020-2021 , NVIDIA CORPORATION.
2
+ * Copyright (c) 2020-2022 , NVIDIA CORPORATION.
3
3
*
4
4
* Licensed under the Apache License, Version 2.0 (the "License");
5
5
* you may not use this file except in compliance with the License.
@@ -434,6 +434,18 @@ table_with_metadata read_orc(
434
434
*/
435
435
class orc_writer_options_builder ;
436
436
437
+ /* *
438
+ * @brief Constants to disambiguate statistics terminology for ORC.
439
+ *
440
+ * ORC refers to its finest granularity of row-grouping as "row group",
441
+ * which corresponds to Parquet "pages".
442
+ * Similarly, ORC's "stripe" corresponds to a Parquet "row group".
443
+ * The following constants disambiguate the terminology for the statistics
444
+ * collected at each level.
445
+ */
446
+ static constexpr statistics_freq ORC_STATISTICS_STRIPE = statistics_freq::STATISTICS_ROWGROUP;
447
+ static constexpr statistics_freq ORC_STATISTICS_ROW_GROUP = statistics_freq::STATISTICS_PAGE;
448
+
437
449
/* *
438
450
* @brief Settings to use for `write_orc()`.
439
451
*/
@@ -442,8 +454,8 @@ class orc_writer_options {
442
454
sink_info _sink;
443
455
// Specify the compression format to use
444
456
compression_type _compression = compression_type::AUTO;
445
- // Enable writing column statistics
446
- bool _enable_statistics = true ;
457
+ // Specify frequency of statistics collection
458
+ statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP ;
447
459
// Maximum size of each stripe (unless smaller than a single row group)
448
460
size_t _stripe_size_bytes = default_stripe_size_bytes;
449
461
// Maximum number of rows in stripe (unless smaller than a single row group)
@@ -501,7 +513,15 @@ class orc_writer_options {
501
513
/* *
502
514
* @brief Whether writing column statistics is enabled/disabled.
503
515
*/
504
- [[nodiscard]] bool is_enabled_statistics () const { return _enable_statistics; }
516
+ [[nodiscard]] bool is_enabled_statistics () const
517
+ {
518
+ return _stats_freq != statistics_freq::STATISTICS_NONE;
519
+ }
520
+
521
+ /* *
522
+ * @brief Returns frequency of statistics collection.
523
+ */
524
+ [[nodiscard]] statistics_freq get_statistics_freq () const { return _stats_freq; }
505
525
506
526
/* *
507
527
* @brief Returns maximum stripe size, in bytes.
@@ -550,11 +570,16 @@ class orc_writer_options {
550
570
void set_compression (compression_type comp) { _compression = comp; }
551
571
552
572
/* *
553
- * @brief Enable/Disable writing column statistics.
573
+ * @brief Choose granularity of statistics collection .
554
574
*
555
- * @param val Boolean value to enable/disable statistics.
575
+ * The granularity can be set to:
576
+ * - cudf::io::STATISTICS_NONE: No statistics are collected.
577
+ * - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.
578
+ * - cudf::io::ORC_STATISTICS_ROW_GROUP: Statistics are collected for each ORC row group.
579
+ *
580
+ * @param val Frequency of statistics collection.
556
581
*/
557
- void enable_statistics (bool val) { _enable_statistics = val; }
582
+ void enable_statistics (statistics_freq val) { _stats_freq = val; }
558
583
559
584
/* *
560
585
* @brief Sets the maximum stripe size, in bytes.
@@ -647,14 +672,19 @@ class orc_writer_options_builder {
647
672
}
648
673
649
674
/* *
650
- * @brief Enable/Disable writing column statistics.
675
+ * @brief Choose granularity of column statistics to be written.
676
+ *
677
+ * The granularity can be set to:
678
+ * - cudf::io::STATISTICS_NONE: No statistics are collected.
679
+ * - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.
680
+ * - cudf::io::ORC_STATISTICS_ROW_GROUP: Statistics are collected for each ORC row group.
651
681
*
652
- * @param val Boolean value to enable/disable .
682
+ * @param val Frequency of statistics collection.
653
683
* @return this for chaining.
654
684
*/
655
- orc_writer_options_builder& enable_statistics (bool val)
685
+ orc_writer_options_builder& enable_statistics (statistics_freq val)
656
686
{
657
- options._enable_statistics = val;
687
+ options._stats_freq = val;
658
688
return *this ;
659
689
}
660
690
@@ -775,8 +805,8 @@ class chunked_orc_writer_options {
775
805
sink_info _sink;
776
806
// Specify the compression format to use
777
807
compression_type _compression = compression_type::AUTO;
778
- // Enable writing column statistics
779
- bool _enable_statistics = true ;
808
+ // Specify granularity of statistics collection
809
+ statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP ;
780
810
// Maximum size of each stripe (unless smaller than a single row group)
781
811
size_t _stripe_size_bytes = default_stripe_size_bytes;
782
812
// Maximum number of rows in stripe (unless smaller than a single row group)
@@ -825,9 +855,9 @@ class chunked_orc_writer_options {
825
855
[[nodiscard]] compression_type get_compression () const { return _compression; }
826
856
827
857
/* *
828
- * @brief Whether writing column statistics is enabled/disabled .
858
+ * @brief Returns granularity of statistics collection .
829
859
*/
830
- [[nodiscard]] bool is_enabled_statistics () const { return _enable_statistics ; }
860
+ [[nodiscard]] statistics_freq get_statistics_freq () const { return _stats_freq ; }
831
861
832
862
/* *
833
863
* @brief Returns maximum stripe size, in bytes.
@@ -871,11 +901,16 @@ class chunked_orc_writer_options {
871
901
void set_compression (compression_type comp) { _compression = comp; }
872
902
873
903
/* *
874
- * @brief Enable/Disable writing column statistics.
904
+ * @brief Choose granularity of statistics collection.
905
+ *
906
+ * The granularity can be set to:
907
+ * - cudf::io::STATISTICS_NONE: No statistics are collected.
908
+ * - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.
909
+ * - cudf::io::ORC_STATISTICS_ROW_GROUP: Statistics are collected for each ORC row group.
875
910
*
876
- * @param val Boolean value to enable/disable .
911
+ * @param val Frequency of statistics collection .
877
912
*/
878
- void enable_statistics (bool val) { _enable_statistics = val; }
913
+ void enable_statistics (statistics_freq val) { _stats_freq = val; }
879
914
880
915
/* *
881
916
* @brief Sets the maximum stripe size, in bytes.
@@ -958,14 +993,19 @@ class chunked_orc_writer_options_builder {
958
993
}
959
994
960
995
/* *
961
- * @brief Enable/Disable writing column statistics.
996
+ * @brief Choose granularity of statistics collection.
997
+ *
998
+ * The granularity can be set to:
999
+ * - cudf::io::STATISTICS_NONE: No statistics are collected.
1000
+ * - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.
1001
+ * - cudf::io::ORC_STATISTICS_ROW_GROUP: Statistics are collected for each ORC row group.
962
1002
*
963
- * @param val Boolean value to enable/disable .
1003
+ * @param val Frequency of statistics collection .
964
1004
* @return this for chaining.
965
1005
*/
966
- chunked_orc_writer_options_builder& enable_statistics (bool val)
1006
+ chunked_orc_writer_options_builder& enable_statistics (statistics_freq val)
967
1007
{
968
- options._enable_statistics = val;
1008
+ options._stats_freq = val;
969
1009
return *this ;
970
1010
}
971
1011
0 commit comments