diff --git a/CHANGELOG.md b/CHANGELOG.md index 2eaf45f23..d41a74b79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,3 +7,5 @@ ## Improvements ## Bug Fixes + +- PR #16 `cuspatial::subset_trajectory_id` test improvements and bug fixes diff --git a/cpp/include/cuspatial/trajectory.hpp b/cpp/include/cuspatial/trajectory.hpp index 27fc953a1..018036ed9 100644 --- a/cpp/include/cuspatial/trajectory.hpp +++ b/cpp/include/cuspatial/trajectory.hpp @@ -16,7 +16,7 @@ #pragma once -typedef struct gdf_column_ gdf_column; // forward declaration +#include namespace cuspatial { @@ -116,17 +116,26 @@ void trajectory_spatial_bounds(const gdf_column& x, const gdf_column& y, * @param[out] out_timestamp: output timestamp ordered by (in_id,in_ts) * * @return number of trajectories returned + * + * @note the output columns are allocated by this function but they must + * be deallocated by the caller. + * + * @note this function is likely to be removed in the future since it is + * redundant to cuDF functionality * - * Note: the API is useful for integrating with cuDF and serial Python APIs, + * @note: the API is useful for integrating with cuDF and serial Python APIs, * e.g., query based on trajectory level information using serial Python APIs or * cuDF APIs and identify a subset of trajectory IDs. These IDs can then be used * to retrieve x/y/len/pos data for futher processing. */ -uint32_t subset_trajectory_id(const gdf_column& id, - const gdf_column& in_x, const gdf_column& in_y, - const gdf_column& in_id, - const gdf_column& in_timestamp, - gdf_column& out_x, gdf_column& out_y, - gdf_column& out_id, gdf_column& out_timestamp); +gdf_size_type subset_trajectory_id(const gdf_column& id, + const gdf_column& in_x, + const gdf_column& in_y, + const gdf_column& in_id, + const gdf_column& in_timestamp, + gdf_column& out_x, + gdf_column& out_y, + gdf_column& out_id, + gdf_column& out_timestamp); } // namespace cuspatial \ No newline at end of file diff --git a/cpp/src/trajectory/subset_trajectories.cu b/cpp/src/trajectory/subset_trajectories.cu index 8414a00bc..90baee485 100644 --- a/cpp/src/trajectory/subset_trajectories.cu +++ b/cpp/src/trajectory/subset_trajectories.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -45,72 +46,66 @@ struct subset_functor { } template () >* = nullptr> - uint32_t operator()(const gdf_column& id, - const gdf_column& in_x, const gdf_column& in_y, - const gdf_column& in_id, const gdf_column& in_timestamp, - gdf_column& out_x, gdf_column& out_y, - gdf_column& out_id, gdf_column& out_timestamp) + gdf_size_type operator()(const gdf_column& id, + const gdf_column& in_x, const gdf_column& in_y, + const gdf_column& in_id, + const gdf_column& in_timestamp, + gdf_column& out_x, gdf_column& out_y, + gdf_column& out_id, gdf_column& out_timestamp) { - T* in_x_ptr = static_cast(in_x.data); - T* in_y_ptr = static_cast(in_y.data); - uint32_t* in_id_ptr = static_cast(in_id.data); - cuspatial::its_timestamp* in_ts_ptr = - static_cast(in_timestamp.data); - uint32_t* id_ptr = static_cast(id.data); - - cudaStream_t stream{0}; - auto exec_policy = rmm::exec_policy(stream)->on(stream); - + gdf_size_type num_hit{0}; gdf_size_type num_id{id.size}; gdf_size_type num_rec{in_id.size}; - rmm::device_vector temp_id(id_ptr, id_ptr + num_id); - thrust::sort(exec_policy, temp_id.begin(), temp_id.end()); - thrust::device_vector hit_vec(num_rec); - thrust::binary_search(exec_policy, temp_id.cbegin(), temp_id.cend(), - in_id_ptr, in_id_ptr + num_rec, hit_vec.begin()); - - - uint32_t num_hit = thrust::count(exec_policy, hit_vec.begin(), - hit_vec.end(), true); - - RMM_TRY( RMM_ALLOC(&out_x.data, num_hit * sizeof(double), 0) ); - RMM_TRY( RMM_ALLOC(&out_y.data, num_hit * sizeof(double), 0) ); - RMM_TRY( RMM_ALLOC(&out_id.data, num_hit * sizeof(uint32_t), 0) ); - RMM_TRY( RMM_ALLOC(&out_timestamp.data, - num_hit * sizeof(cuspatial::its_timestamp), 0) ); - - T* out_x_ptr = static_cast(out_x.data); - T* out_y_ptr = static_cast(out_y.data); - uint32_t* out_id_ptr = static_cast(out_id.data); - cuspatial::its_timestamp* out_ts_ptr = - static_cast(out_timestamp.data); - - auto in_itr = thrust::make_zip_iterator(thrust::make_tuple(in_x_ptr, - in_y_ptr, - in_id_ptr, - in_ts_ptr)); - auto out_itr = thrust::make_zip_iterator(thrust::make_tuple(out_x_ptr, - out_y_ptr, - out_id_ptr, - out_ts_ptr)); - - uint32_t num_keep = thrust::copy_if(exec_policy, in_itr, in_itr+num_rec, - hit_vec.begin(), out_itr, - is_true()) - out_itr; - - CUDF_EXPECTS(num_hit == num_keep, - "count_if and copy_if result mismatch"); + if (num_id > 0 && id.data != nullptr && num_rec > 0) { + int32_t* in_id_ptr = static_cast(in_id.data); + int32_t* id_ptr = static_cast(id.data); + + cudaStream_t stream{0}; + auto exec_policy = rmm::exec_policy(stream)->on(stream); + + rmm::device_vector temp_id(id_ptr, id_ptr + num_id); + thrust::sort(exec_policy, temp_id.begin(), temp_id.end()); + thrust::device_vector hit_vec(num_rec); + thrust::binary_search(exec_policy, temp_id.cbegin(), temp_id.cend(), + in_id_ptr, in_id_ptr + num_rec, hit_vec.begin()); + + num_hit = thrust::count(exec_policy, hit_vec.begin(), + hit_vec.end(), true); + + if (num_hit > 0) { + out_x = cudf::allocate_like(in_x, num_hit); + out_y = cudf::allocate_like(in_y, num_hit); + out_id = cudf::allocate_like(in_id, num_hit); + out_timestamp = cudf::allocate_like(in_timestamp, num_hit); + + auto in_itr = thrust::make_zip_iterator(thrust::make_tuple( + static_cast(in_x.data), static_cast(in_y.data), + static_cast(in_id.data), + static_cast(in_timestamp.data))); + auto out_itr = thrust::make_zip_iterator(thrust::make_tuple( + static_cast(out_x.data), static_cast(out_y.data), + static_cast(out_id.data), + static_cast(out_timestamp.data))); + + auto end = thrust::copy_if(exec_policy, in_itr, in_itr + num_rec, + hit_vec.begin(), out_itr, is_true()); + gdf_size_type num_keep = end - out_itr; + + CUDF_EXPECTS(num_hit == num_keep, + "count_if and copy_if result mismatch"); + } + } return num_hit; } template () >* = nullptr> - uint32_t operator()(const gdf_column& ids, - const gdf_column& in_x, const gdf_column& in_y, - const gdf_column& in_id, const gdf_column& in_ts, - gdf_column& out_x, gdf_column& out_y, - gdf_column& out_id, gdf_column& out_ts) + gdf_size_type operator()(const gdf_column& ids, + const gdf_column& in_x, const gdf_column& in_y, + const gdf_column& in_id, const gdf_column& in_ts, + gdf_column& out_x, gdf_column& out_y, + gdf_column& out_id, gdf_column& out_ts) { CUDF_FAIL("Non-floating point operation is not supported"); } @@ -120,37 +115,40 @@ struct subset_functor { namespace cuspatial { -uint32_t subset_trajectory_id(const gdf_column& id, - const gdf_column& in_x, const gdf_column& in_y, - const gdf_column& in_id, - const gdf_column& in_timestamp, - gdf_column& out_x, gdf_column& out_y, - gdf_column& out_id, gdf_column& out_timestamp) -{ - struct timeval t0,t1; - gettimeofday(&t0, nullptr); - +gdf_size_type subset_trajectory_id(const gdf_column& id, + const gdf_column& in_x, + const gdf_column& in_y, + const gdf_column& in_id, + const gdf_column& in_timestamp, + gdf_column& out_x, + gdf_column& out_y, + gdf_column& out_id, + gdf_column& out_timestamp) +{ CUDF_EXPECTS(in_x.data != nullptr && in_x.data != nullptr && in_id.data != nullptr && in_timestamp.data != nullptr, - "x/y/in_id/in_ts data cannot be null"); + "Null input data"); CUDF_EXPECTS(in_x.size == in_y.size && in_x.size == in_id.size && - in_x.size == in_timestamp.size , - "x/y/in_id/timestamp must have equal size"); - - // future versions might allow x/y/in_id/timestamp to have null_count > 0, - // which might be useful for taking query results as inputs + in_x.size == in_timestamp.size, + "Data size mismatch"); + CUDF_EXPECTS(in_id.dtype == GDF_INT32, + "Invalid trajectory ID datatype"); + CUDF_EXPECTS(id.dtype == in_id.dtype, + "Trajectory ID datatype mismatch"); + CUDF_EXPECTS(in_timestamp.dtype == GDF_TIMESTAMP, + "Invalid timestamp datatype"); CUDF_EXPECTS(in_x.null_count == 0 && in_y.null_count == 0 && in_id.null_count==0 && in_timestamp.null_count==0, "NULL support unimplemented"); - - uint32_t num_hit=cudf::type_dispatcher(in_x.dtype, subset_functor(), id, - in_x, in_y, in_id, in_timestamp, - out_x, out_y, out_id, out_timestamp); - std::cout<<"number of resulting points:"< +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +struct TrajectorySubsetTest : public GdfTest +{ +}; + +template +using wrapper = cudf::test::column_wrapper; + +constexpr gdf_size_type column_size{1000}; + +void test_subset(std::vector ids_to_keep) +{ + std::vector sequence(column_size); + std::iota(sequence.begin(), sequence.end(), 0); + + //three sorted trajectories: one with 2/3 of the points, two with 1/6 + std::vector id_vector(column_size); + std::transform(sequence.cbegin(), sequence.cend(), id_vector.begin(), + [](int32_t i) { return (i < 2 * i / 3) ? 0 : + (i < 5 * i / 6) ? 1 : 2; }); + + // timestamp milliseconds + std::vector ms_vector(sequence.begin(), sequence.end()); + + //randomize sequence + std::seed_seq seed{0}; + std::mt19937 g(seed); + + std::shuffle(sequence.begin(), sequence.end(), g); + + wrapper in_x(column_size, + [&](gdf_index_type i) { return static_cast(sequence[i]); }); + wrapper in_y(column_size, + [&](gdf_index_type i) { return static_cast(sequence[i]); }); + wrapper in_id(column_size, + [&](gdf_index_type i) { return id_vector[sequence[i]]; }); + wrapper in_ts(column_size, + [&](gdf_index_type i) { + return static_cast(ms_vector[sequence[i]]); + }); + wrapper ids{ids_to_keep}; + + gdf_column out_x{}, out_y{}, out_id{}, out_ts{}; + + // sort the ids to keep now that we've copied them unsorted to input column + std::sort(ids_to_keep.begin(), ids_to_keep.end()); + + std::vector expected_sequence(sequence.size()); + auto expected_size = + std::copy_if(sequence.begin(), sequence.end(), expected_sequence.begin(), + [&](int32_t i) { + return std::binary_search(ids_to_keep.begin(), ids_to_keep.end(), + id_vector[i]); + } + ) - expected_sequence.begin(); + + wrapper expected_x(expected_size, + [&](gdf_index_type i) { return static_cast(expected_sequence[i]); }); + wrapper expected_y(expected_size, + [&](gdf_index_type i) { return static_cast(expected_sequence[i]); }); + wrapper expected_id(expected_size, + [&](gdf_index_type i) { return id_vector[expected_sequence[i]]; }); + wrapper expected_ts(expected_size, + [&](gdf_index_type i) { + return static_cast(ms_vector[expected_sequence[i]]); + }); + + gdf_size_type num_hit{0}; + + EXPECT_NO_THROW( + num_hit = cuspatial::subset_trajectory_id(ids, in_x, in_y, in_id, in_ts, + out_x, out_y, out_id, out_ts)); + + EXPECT_EQ(num_hit, expected_size); + EXPECT_TRUE(expected_x == out_x); + EXPECT_TRUE(expected_y == out_y); + EXPECT_TRUE(expected_id == out_id); + EXPECT_TRUE(expected_ts == out_ts); +} + +TEST_F(TrajectorySubsetTest, SelectSome) +{ + std::vector keep_all{0, 1, 2}; + test_subset(keep_all); + std::vector keep_all_unsorted{2, 0, 1}; + test_subset(keep_all_unsorted); + std::vector keep_two{1, 2}; + test_subset(keep_two); + std::vector keep_one{1}; + test_subset(keep_one); + std::vector keep_none{}; + test_subset(keep_none); +} + +TEST_F(TrajectorySubsetTest, BadData) +{ + //constexpr gdf_size_type column_size{1000}; + + gdf_column out_x, out_y, out_id, out_timestamp; + + gdf_column bad_x, bad_y, bad_in_id, bad_timestamp, bad_id; + gdf_column_view(&bad_id, 0, 0, 0, GDF_INT32); + gdf_column_view(&bad_x, 0, 0, 0, GDF_FLOAT64); + gdf_column_view(&bad_y, 0, 0, 0, GDF_FLOAT64); + gdf_column_view(&bad_in_id, 0, 0, 0, GDF_INT32); + gdf_column_view(&bad_timestamp, 0, 0, 0, GDF_TIMESTAMP); + + // null pointers + CUDF_EXPECT_THROW_MESSAGE(cuspatial::subset_trajectory_id(bad_id, + bad_x, bad_y, + bad_in_id, + bad_timestamp, + out_x, out_y, + out_id, + out_timestamp), + "Null input data"); + + // size mismatch + bad_x.data = bad_y.data = bad_in_id.data = bad_timestamp.data = + reinterpret_cast(0x0badf00d); + bad_x.size = 10; + bad_y.size = 12; // mismatch + bad_in_id.size = 10; + bad_timestamp.size = 10; + + CUDF_EXPECT_THROW_MESSAGE(cuspatial::subset_trajectory_id(bad_id, + bad_x, bad_y, + bad_in_id, + bad_timestamp, + out_x, out_y, + out_id, + out_timestamp), + "Data size mismatch"); + + // Invalid ID datatype + bad_y.size = 10; + bad_in_id.dtype = GDF_FLOAT32; + + CUDF_EXPECT_THROW_MESSAGE(cuspatial::subset_trajectory_id(bad_id, + bad_x, bad_y, + bad_in_id, + bad_timestamp, + out_x, out_y, + out_id, + out_timestamp), + "Invalid trajectory ID datatype"); + + bad_in_id.dtype = GDF_INT32; + bad_id.dtype = GDF_INT8; + + CUDF_EXPECT_THROW_MESSAGE(cuspatial::subset_trajectory_id(bad_id, + bad_x, bad_y, + bad_in_id, + bad_timestamp, + out_x, out_y, + out_id, + out_timestamp), + "Trajectory ID datatype mismatch"); + + bad_id.dtype = GDF_INT32; + bad_timestamp.dtype = GDF_DATE32; + + CUDF_EXPECT_THROW_MESSAGE(cuspatial::subset_trajectory_id(bad_id, + bad_x, bad_y, + bad_in_id, + bad_timestamp, + out_x, out_y, + out_id, + out_timestamp), + "Invalid timestamp datatype"); + + bad_timestamp.dtype = GDF_TIMESTAMP; + bad_x.null_count = 5; + CUDF_EXPECT_THROW_MESSAGE(cuspatial::subset_trajectory_id(bad_id, + bad_x, bad_y, + bad_in_id, + bad_timestamp, + out_x, out_y, + out_id, + out_timestamp), + "NULL support unimplemented"); +} diff --git a/cpp/tests/trajectory/trajectory_subset_toy.cu b/cpp/tests/trajectory/trajectory_subset_toy.cu deleted file mode 100644 index d25808bb8..000000000 --- a/cpp/tests/trajectory/trajectory_subset_toy.cu +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -struct TrajectorySubsetToy : public GdfTest -{ -}; - -TEST_F(TrajectorySubsetToy, trajectorysubsettest) -{ - //three sorted trajectories with 5,4,3 points, respectively - std::cout<<"in TrajectorySubsetToy"< point_x_wrapp{rand_x}; - cudf::test::column_wrapper point_y_wrapp{rand_y}; - cudf::test::column_wrapper point_id_wrapp{rand_id}; - cudf::test::column_wrapper point_ts_wrapp{rand_ts}; - cudf::test::column_wrapper ids_wrapp{1,2}; - - gdf_column out_x,out_y,out_id,out_ts; - memset(&out_x,0,sizeof(gdf_column)); - memset(&out_y,0,sizeof(gdf_column)); - memset(&out_id,0,sizeof(gdf_column)); - memset(&out_ts,0,sizeof(gdf_column)); - - std::cout<<"calling cuspatial::subset_trajectory_id"< out_x_ptr=thrust::device_pointer_cast(static_cast(out_x.data)); - thrust::device_ptr out_y_ptr=thrust::device_pointer_cast(static_cast(out_y.data)); - thrust::device_ptr out_id_ptr=thrust::device_pointer_cast(static_cast(out_id.data)); - thrust::device_ptr out_ts_ptr=thrust::device_pointer_cast(static_cast(out_ts.data)); - - std::cout<<"x"<(std::cout, " "));std::cout<(std::cout, " "));std::cout<(std::cout, " "));std::cout<(std::cout, " "));std::cout<malloc(sizeof(gdf_column)) cdef gdf_column* c_len = malloc(sizeof(gdf_column)) cdef gdf_column* c_pos = malloc(sizeof(gdf_column)) - + with nogil: num_trajectories = derive_trajectories(c_x[0], c_y[0], c_object_id[0], @@ -27,7 +27,7 @@ cpdef cpp_derive_trajectories(x, y, object_id, timestamp): traj_id_mask) len = Column.from_mem_views(len_data, len_mask) pos = Column.from_mem_views(pos_data, pos_mask) - + return num_trajectories, trajectory_id, len, pos cpdef cpp_trajectory_distance_and_speed(x, y, timestamp, len, pos): @@ -47,31 +47,59 @@ cpdef cpp_trajectory_distance_and_speed(x, y, timestamp, len, pos): speed_data, speed_mask = gdf_column_to_column_mem(&c_distance_speed.second) dist=Column.from_mem_views(dist_data, dist_mask) speed=Column.from_mem_views(speed_data, speed_mask) - + return dist,speed cpdef cpp_trajectory_spatial_bounds(coor_x,coor_y,len,pos): - cdef gdf_column* c_coor_x = column_view_from_column(coor_x) - cdef gdf_column* c_coor_y = column_view_from_column(coor_y) - cdef gdf_column* c_len = column_view_from_column(len) - cdef gdf_column* c_pos = column_view_from_column(pos) - cdef gdf_column* c_x1 = malloc(sizeof(gdf_column)) - cdef gdf_column* c_x2 = malloc(sizeof(gdf_column)) - cdef gdf_column* c_y1 = malloc(sizeof(gdf_column)) - cdef gdf_column* c_y2 = malloc(sizeof(gdf_column)) - - with nogil: - trajectory_spatial_bounds(c_coor_x[0], c_coor_y[0], - c_len[0], c_pos[0], - c_x1[0], c_y1[0], c_x2[0], c_y2[0]) - - x1_data, x1_mask = gdf_column_to_column_mem(c_x1) - x1=Column.from_mem_views(x1_data,x1_mask) - x2_data, x2_mask = gdf_column_to_column_mem(c_x2) - x2=Column.from_mem_views(x2_data,x2_mask) - y1_data, y1_mask = gdf_column_to_column_mem(c_y1) - y1=Column.from_mem_views(y1_data,y1_mask) - y2_data, y2_mask = gdf_column_to_column_mem(c_y2) - y2=Column.from_mem_views(y2_data,y2_mask) + cdef gdf_column* c_coor_x = column_view_from_column(coor_x) + cdef gdf_column* c_coor_y = column_view_from_column(coor_y) + cdef gdf_column* c_len = column_view_from_column(len) + cdef gdf_column* c_pos = column_view_from_column(pos) + cdef gdf_column* c_x1 = malloc(sizeof(gdf_column)) + cdef gdf_column* c_x2 = malloc(sizeof(gdf_column)) + cdef gdf_column* c_y1 = malloc(sizeof(gdf_column)) + cdef gdf_column* c_y2 = malloc(sizeof(gdf_column)) - return x1,y1,x2,y2 + with nogil: + trajectory_spatial_bounds(c_coor_x[0], c_coor_y[0], + c_len[0], c_pos[0], + c_x1[0], c_y1[0], c_x2[0], c_y2[0]) + + x1_data, x1_mask = gdf_column_to_column_mem(c_x1) + x1 = Column.from_mem_views(x1_data,x1_mask) + x2_data, x2_mask = gdf_column_to_column_mem(c_x2) + x2 = Column.from_mem_views(x2_data,x2_mask) + y1_data, y1_mask = gdf_column_to_column_mem(c_y1) + y1 = Column.from_mem_views(y1_data,y1_mask) + y2_data, y2_mask = gdf_column_to_column_mem(c_y2) + y2 = Column.from_mem_views(y2_data,y2_mask) + + return x1, y1, x2, y2 + +cpdef cpp_subset_trajectory_id(id, in_x, in_y, in_id, in_timestamp): + cdef gdf_column* c_id = column_view_from_column(id) + cdef gdf_column* c_in_x = column_view_from_column(in_x) + cdef gdf_column* c_in_y = column_view_from_column(in_y) + cdef gdf_column* c_in_id = column_view_from_column(in_id) + cdef gdf_column* c_in_timestamp = column_view_from_column(in_timestamp) + + cdef gdf_column* c_out_x = malloc(sizeof(gdf_column)) + cdef gdf_column* c_out_y = malloc(sizeof(gdf_column)) + cdef gdf_column* c_out_id = malloc(sizeof(gdf_column)) + cdef gdf_column* c_out_timestamp = malloc(sizeof(gdf_column)) + + with nogil: + count = subset_trajectory_id(c_id[0], c_in_x[0], c_in_y[0], c_in_id[0], + c_in_timestamp[0], c_out_x[0], c_out_y[0], + c_out_id[0], c_out_timestamp[0]) + + x_data, x_mask = gdf_column_to_column_mem(c_out_x) + x = Column.from_mem_views(x_data,x_mask) + y_data, y_mask = gdf_column_to_column_mem(c_out_y) + y = Column.from_mem_views(y_data,y_mask) + id_data, id_mask = gdf_column_to_column_mem(c_out_id) + id = Column.from_mem_views(id_data,id_mask) + timestamp_data, timestamp_mask = gdf_column_to_column_mem(c_out_timestamp) + timestamp = Column.from_mem_views(timestamp_data, timestamp_mask) + + return x, y, id, timestamp \ No newline at end of file