Skip to content

Commit 4e44d5d

Browse files
authored
Large strings support in cudf::concatenate (#15195)
Enables `cudf::concatenate` to create and return a large strings column (offsets are INT64). This also introduces the `LIBCUDF_LARGE_STRINGS_ENABLED` environment variable and utilities around it. One internal utility checks the value so the calling logic can either throw an overflow exception or build INT64 offsets as appropriate. The `cudf::test::large_strings_enabler` is introduced to set/unset the env var for individual tests as needed. A follow-on PR will attempt to consolidate these kinds of tests with a specialized test fixture using this utility class. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Mike Wilson (https://github.com/hyperbolic2346) URL: #15195
1 parent d7b8fc4 commit 4e44d5d

File tree

6 files changed

+142
-38
lines changed

6 files changed

+142
-38
lines changed

cpp/include/cudf/strings/detail/utilities.hpp

+27
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,24 @@ namespace cudf {
2727
namespace strings {
2828
namespace detail {
2929

30+
/**
31+
* @brief Create an offsets column to be a child of a strings column
32+
*
33+
* This will return the properly typed column to be filled in by the caller
34+
* given the number of bytes to address.
35+
*
36+
* @param chars_bytes Number of bytes for the chars in the strings column
37+
* @param count Number of elements for the offsets column.
38+
* This is the number of rows in the parent strings column +1.
39+
* @param stream CUDA stream used for device memory operations and kernel launches
40+
* @param mr Device memory resource used to allocate the returned column's device memory
41+
* @return The offsets child column for a strings column
42+
*/
43+
std::unique_ptr<column> create_offsets_child_column(int64_t chars_bytes,
44+
size_type count,
45+
rmm::cuda_stream_view stream,
46+
rmm::mr::device_memory_resource* mr);
47+
3048
/**
3149
* @brief Creates a string_view vector from a strings column.
3250
*
@@ -52,6 +70,15 @@ rmm::device_uvector<string_view> create_string_vector_from_column(
5270
*/
5371
int64_t get_offset64_threshold();
5472

73+
/**
74+
 * @brief Checks if large strings support is enabled
75+
*
76+
* This checks the setting in the environment variable LIBCUDF_LARGE_STRINGS_ENABLED.
77+
*
78+
* @return true if large strings are supported
79+
*/
80+
bool is_large_strings_enabled();
81+
5582
/**
5683
* @brief Return a normalized offset value from a strings offsets column
5784
*

cpp/include/cudf_test/column_utilities.hpp

+25
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,29 @@ template <>
210210
std::pair<thrust::host_vector<std::string>, std::vector<bitmask_type>> to_host(column_view c);
211211
//! @endcond
212212

213+
/**
 * @brief For enabling large strings testing in specific tests
 *
 * Declare an instance at the top of a test to turn large strings support on for that test.
 * The destructor calls disable(), so support is switched off again when the object goes
 * out of scope.
 *
 * The object manipulates process-wide state (an environment variable) and is therefore
 * not copyable: a copy going out of scope would disable large strings while the original
 * object is still alive.
 */
struct large_strings_enabler {
  /**
   * @brief Create large strings enable object
   *
   * @param default_enable Set to true to enable large strings support on construction;
   *                       set to false to disable it on construction
   */
  explicit large_strings_enabler(bool default_enable = true);
  ~large_strings_enabler();

  large_strings_enabler(large_strings_enabler const&)            = delete;
  large_strings_enabler& operator=(large_strings_enabler const&) = delete;

  /**
   * @brief Enable large strings support
   */
  void enable();

  /**
   * @brief Disable large strings support
   */
  void disable();
};
235+
213236
} // namespace cudf::test
214237

215238
// Macros for showing line of failure.
@@ -242,3 +265,5 @@ std::pair<thrust::host_vector<std::string>, std::vector<bitmask_type>> to_host(c
242265
SCOPED_TRACE(" <-- line of failure\n"); \
243266
cudf::test::detail::expect_equal_buffers(lhs, rhs, size_bytes); \
244267
} while (0)
268+
269+
// Declares a cudf::test::large_strings_enabler object in the current scope.
// The variable name deliberately avoids consecutive underscores: identifiers containing
// a double underscore are reserved for the C++ implementation.
#define CUDF_TEST_ENABLE_LARGE_STRINGS() cudf::test::large_strings_enabler cudf_test_ls_enabler_

cpp/src/strings/copying/concatenate.cu

+1-5
Original file line numberDiff line numberDiff line change
@@ -220,9 +220,6 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
220220
CUDF_EXPECTS(offsets_count <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
221221
"total number of strings exceeds the column size limit",
222222
std::overflow_error);
223-
CUDF_EXPECTS(total_bytes <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
224-
"total size of strings exceeds the column size limit",
225-
std::overflow_error);
226223

227224
bool const has_nulls =
228225
std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); });
@@ -232,8 +229,7 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
232229
auto d_new_chars = output_chars.data();
233230

234231
// create output offsets column
235-
auto offsets_column = make_numeric_column(
236-
data_type{type_id::INT32}, offsets_count, mask_state::UNALLOCATED, stream, mr);
232+
auto offsets_column = create_offsets_child_column(total_bytes, offsets_count, stream, mr);
237233
auto itr_new_offsets =
238234
cudf::detail::offsetalator_factory::make_output_iterator(offsets_column->mutable_view());
239235

cpp/src/strings/utilities.cu

+33-2
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <cudf/detail/get_value.cuh>
2323
#include <cudf/strings/detail/char_tables.hpp>
2424
#include <cudf/strings/detail/utilities.cuh>
25+
#include <cudf/strings/detail/utilities.hpp>
2526
#include <cudf/utilities/error.hpp>
2627

2728
#include <rmm/cuda_stream_view.hpp>
@@ -31,6 +32,9 @@
3132
#include <thrust/iterator/counting_iterator.h>
3233
#include <thrust/transform.h>
3334

35+
#include <cstdlib>
36+
#include <string>
37+
3438
namespace cudf {
3539
namespace strings {
3640
namespace detail {
@@ -65,6 +69,27 @@ rmm::device_uvector<string_view> create_string_vector_from_column(
6569
return strings_vector;
6670
}
6771

72+
/**
73+
* @copydoc cudf::strings::detail::create_offsets_child_column
74+
*/
75+
std::unique_ptr<column> create_offsets_child_column(int64_t chars_bytes,
76+
size_type count,
77+
rmm::cuda_stream_view stream,
78+
rmm::mr::device_memory_resource* mr)
79+
{
80+
auto const threshold = get_offset64_threshold();
81+
if (!is_large_strings_enabled()) {
82+
CUDF_EXPECTS(
83+
chars_bytes < threshold, "Size of output exceeds the column size limit", std::overflow_error);
84+
}
85+
return make_numeric_column(
86+
chars_bytes < threshold ? data_type{type_id::INT32} : data_type{type_id::INT64},
87+
count,
88+
mask_state::UNALLOCATED,
89+
stream,
90+
mr);
91+
}
92+
6893
namespace {
6994
// The device variables are created here to avoid using a singleton that may cause issues
7095
// with RMM initialize/finalize. See PR #3159 for details on this approach.
@@ -123,13 +148,19 @@ special_case_mapping const* get_special_case_mapping_table()
123148

124149
// Reads the LIBCUDF_LARGE_STRINGS_THRESHOLD environment variable.
// Returns its value when it parses to a number strictly between 0 and the int32_t
// maximum; otherwise (unset, non-numeric, non-positive or too large) returns the
// int32_t maximum.
int64_t get_offset64_threshold()
{
  constexpr int64_t max_threshold = std::numeric_limits<int32_t>::max();

  auto const threshold = std::getenv("LIBCUDF_LARGE_STRINGS_THRESHOLD");
  if (threshold == nullptr) { return max_threshold; }

  // strtoll is used instead of atol: its result is well-defined for out-of-range input
  // (it saturates) and it is at least 64 bits wide on every platform.
  int64_t const rtn = std::strtoll(threshold, nullptr, 10);
  return (rtn > 0 && rtn < max_threshold) ? rtn : max_threshold;
}
132157

158+
// Large strings support is on only when LIBCUDF_LARGE_STRINGS_ENABLED is set to exactly "1".
bool is_large_strings_enabled()
{
  char const* const setting = std::getenv("LIBCUDF_LARGE_STRINGS_ENABLED");
  if (setting == nullptr) { return false; }
  std::string const value{setting};
  return value.compare("1") == 0;
}
163+
133164
int64_t get_offset_value(cudf::column_view const& offsets,
134165
size_type index,
135166
rmm::cuda_stream_view stream)

cpp/tests/copying/concatenate_tests.cpp

+45-31
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
#include <cudf/table/table.hpp>
3333
#include <cudf/utilities/default_stream.hpp>
3434

35+
#include <thrust/iterator/constant_iterator.h>
36+
3537
#include <numeric>
3638
#include <stdexcept>
3739
#include <string>
@@ -164,37 +166,6 @@ TEST_F(StringColumnTest, ConcatenateColumnView)
164166
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
165167
}
166168

167-
TEST_F(StringColumnTest, ConcatenateColumnViewLarge)
168-
{
169-
// Test large concatenate, causes out of bound device memory errors if kernel
170-
// indexing is not int64_t.
171-
// 1.5GB bytes, 5k columns
172-
constexpr size_t num_strings = 10000;
173-
constexpr size_t string_length = 150000;
174-
constexpr size_t strings_per_column = 2;
175-
constexpr size_t num_columns = num_strings / strings_per_column;
176-
177-
std::vector<std::string> strings;
178-
std::vector<char const*> h_strings;
179-
std::vector<cudf::test::strings_column_wrapper> strings_column_wrappers;
180-
std::vector<cudf::column_view> strings_columns;
181-
182-
std::string s(string_length, 'a');
183-
for (size_t i = 0; i < num_strings; ++i)
184-
h_strings.push_back(s.data());
185-
186-
for (size_t i = 0; i < num_columns; ++i)
187-
strings_column_wrappers.push_back(cudf::test::strings_column_wrapper(
188-
h_strings.data() + i * strings_per_column, h_strings.data() + (i + 1) * strings_per_column));
189-
for (auto& wrapper : strings_column_wrappers)
190-
strings_columns.push_back(wrapper);
191-
192-
auto results = cudf::concatenate(strings_columns);
193-
194-
cudf::test::strings_column_wrapper expected(h_strings.begin(), h_strings.end());
195-
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
196-
}
197-
198169
TEST_F(StringColumnTest, ConcatenateManyColumns)
199170
{
200171
std::vector<char const*> h_strings{
@@ -226,6 +197,49 @@ TEST_F(StringColumnTest, ConcatenateTooLarge)
226197
EXPECT_THROW(cudf::concatenate(input_cols), std::overflow_error);
227198
}
228199

200+
TEST_F(StringColumnTest, ConcatenateLargeStrings)
{
  CUDF_TEST_ENABLE_LARGE_STRINGS();

  // 5 million copies of a 50 byte string: 250MB
  auto itr = thrust::constant_iterator<std::string_view>(
    "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY");
  auto input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000);
  auto view  = cudf::column_view(input);

  // 10 copies of the input: 2500MB > 2GB
  int const multiplier = 10;
  std::vector<cudf::column_view> input_cols;
  std::vector<cudf::size_type> splits;
  for (int copy = 1; copy <= multiplier; ++copy) {
    input_cols.push_back(view);
    // split points sit between copies, so none is needed after the last copy
    if (copy < multiplier) { splits.push_back(view.size() * copy); }
  }

  auto result = cudf::concatenate(input_cols);
  auto sv     = cudf::strings_column_view(result->view());
  EXPECT_EQ(sv.size(), view.size() * multiplier);
  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});

  // verify results in sections
  auto sliced = cudf::split(result->view(), splits);
  for (auto const& section : sliced) {
    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(section, input);
  }

  // also test with large strings column as input
  {
    input_cols.clear();
    input_cols.push_back(input);           // regular column
    input_cols.push_back(result->view());  // large column
    result = cudf::concatenate(input_cols);
    sv     = cudf::strings_column_view(result->view());
    EXPECT_EQ(sv.size(), view.size() * (multiplier + 1));
    EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});

    // one more split point for the additional copy of the input
    splits.push_back(view.size() * multiplier);
    sliced = cudf::split(result->view(), splits);
    for (auto const& section : sliced) {
      CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(section, input);
    }
  }
}
242+
229243
struct TableTest : public cudf::test::BaseFixture {};
230244

231245
TEST_F(TableTest, ConcatenateTables)

cpp/tests/utilities/column_utilities.cu

+11
Original file line numberDiff line numberDiff line change
@@ -1011,5 +1011,16 @@ std::pair<thrust::host_vector<std::string>, std::vector<bitmask_type>> to_host(c
10111011
return {std::move(host_data), bitmask_to_host(c)};
10121012
}
10131013

1014+
large_strings_enabler::large_strings_enabler(bool default_enable)
1015+
{
1016+
default_enable ? enable() : disable();
1017+
}
1018+
1019+
large_strings_enabler::~large_strings_enabler() { disable(); }
1020+
1021+
void large_strings_enabler::enable() { setenv("LIBCUDF_LARGE_STRINGS_ENABLED", "1", 1); }
1022+
1023+
void large_strings_enabler::disable() { setenv("LIBCUDF_LARGE_STRINGS_ENABLED", "0", 1); }
1024+
10141025
} // namespace test
10151026
} // namespace cudf

0 commit comments

Comments
 (0)