From da635ce8ab01607d47b0503acd4b898969c1853a Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 30 Mar 2017 17:37:35 -0400 Subject: [PATCH] ARROW-736: Mixed-type object DataFrame columns should not silently coerce to an Arrow type by default --- cpp/src/arrow/ipc/ipc-read-write-benchmark.cc | 4 +- cpp/src/arrow/python/CMakeLists.txt | 2 + cpp/src/arrow/python/builtin-convert-test.cc | 60 +++++++++++++++++++ cpp/src/arrow/python/pandas-test.cc | 4 -- cpp/src/arrow/python/pandas_convert.cc | 43 +++++++++++-- cpp/src/arrow/python/pandas_convert.h | 3 + cpp/src/arrow/util/logging.h | 7 ++- python/pyarrow/tests/test_convert_builtin.py | 5 ++ python/pyarrow/tests/test_convert_pandas.py | 5 ++ 9 files changed, 118 insertions(+), 15 deletions(-) create mode 100644 cpp/src/arrow/python/builtin-convert-test.cc diff --git a/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc b/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc index 1aecdbc633190..b385929d8b10a 100644 --- a/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc +++ b/cpp/src/arrow/ipc/ipc-read-write-benchmark.cc @@ -80,7 +80,7 @@ static void BM_WriteRecordBatch(benchmark::State& state) { // NOLINT non-const int32_t metadata_length; int64_t body_length; if (!ipc::WriteRecordBatch(*record_batch, 0, &stream, &metadata_length, &body_length, - default_memory_pool()) + default_memory_pool()) .ok()) { state.SkipWithError("Failed to write!"); } @@ -101,7 +101,7 @@ static void BM_ReadRecordBatch(benchmark::State& state) { // NOLINT non-const r int32_t metadata_length; int64_t body_length; if (!ipc::WriteRecordBatch(*record_batch, 0, &stream, &metadata_length, &body_length, - default_memory_pool()) + default_memory_pool()) .ok()) { state.SkipWithError("Failed to write!"); } diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index 03f5afc624b34..9f26a502ad926 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -90,4 +90,6 @@ install(FILES if (ARROW_BUILD_TESTS) ADD_ARROW_TEST(pandas-test STATIC_LINK_LIBS "${ARROW_PYTHON_TEST_LINK_LIBS}") + ADD_ARROW_TEST(builtin-convert-test + STATIC_LINK_LIBS "${ARROW_PYTHON_TEST_LINK_LIBS}") endif() diff --git a/cpp/src/arrow/python/builtin-convert-test.cc b/cpp/src/arrow/python/builtin-convert-test.cc new file mode 100644 index 0000000000000..0c6e272ccf711 --- /dev/null +++ b/cpp/src/arrow/python/builtin-convert-test.cc @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gtest/gtest.h" + +#include + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/python/builtin_convert.h" +#include "arrow/table.h" +#include "arrow/test-util.h" + +namespace arrow { +namespace py { + +TEST(BuiltinConversionTest, TestMixedTypeFails) { + PyAcquireGIL lock; + MemoryPool* pool = default_memory_pool(); + std::shared_ptr arr; + + PyObject* list = PyList_New(3); + ASSERT_NE(list, nullptr); + + PyObject* str = PyUnicode_FromString("abc"); + ASSERT_NE(str, nullptr); + + PyObject* integer = PyLong_FromLong(1234L); + ASSERT_NE(integer, nullptr); + + PyObject* doub = PyFloat_FromDouble(123.0234); + ASSERT_NE(doub, nullptr); + + // This steals a reference to each object, so we don't need to decref them later + // just the list + ASSERT_EQ(PyList_SetItem(list, 0, str), 0); + ASSERT_EQ(PyList_SetItem(list, 1, integer), 0); + ASSERT_EQ(PyList_SetItem(list, 2, doub), 0); + + ASSERT_RAISES(UnknownError, ConvertPySequence(list, pool, &arr)); + + Py_DECREF(list); +} + +} // namespace py +} // namespace arrow diff --git a/cpp/src/arrow/python/pandas-test.cc b/cpp/src/arrow/python/pandas-test.cc index a4e640b83718b..58eaf323be55d 100644 --- a/cpp/src/arrow/python/pandas-test.cc +++ b/cpp/src/arrow/python/pandas-test.cc @@ -17,16 +17,12 @@ #include "gtest/gtest.h" -#include #include -#include -#include #include "arrow/array.h" #include "arrow/builder.h" #include "arrow/table.h" #include "arrow/test-util.h" -#include "arrow/type.h" #include "arrow/python/common.h" #include "arrow/python/pandas_convert.h" diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc index db2e90eb8b0ff..d5db524af6346 100644 --- a/cpp/src/arrow/python/pandas_convert.cc +++ b/cpp/src/arrow/python/pandas_convert.cc @@ -167,8 +167,10 @@ Status AppendObjectStrings(StringBuilder& string_builder, PyObject** objects, *have_bytes = true; const int32_t length = static_cast(PyBytes_GET_SIZE(obj)); RETURN_NOT_OK(string_builder.Append(PyBytes_AS_STRING(obj), length)); - } else { + } else if (PyObject_is_null(obj)) { string_builder.AppendNull(); + } else { + return InvalidConversion(obj, "string or bytes"); } } @@ -374,6 +376,28 @@ inline Status PandasConverter::ConvertData(std::shared_ptr* return Status::OK(); } +Status InvalidConversion(PyObject* obj, const std::string& expected_type_name) { + PyObject* type = PyObject_Type(obj); + RETURN_IF_PYERROR(); + + PyObject* type_name = PyObject_GetAttrString(type, "__name__"); + if (PyErr_Occurred()) { Py_DECREF(type); } + RETURN_IF_PYERROR(); + + Py_ssize_t size; + const char* bytes = PyUnicode_AsUTF8AndSize(type_name, &size); + std::string cpp_type_name(bytes, size); + + std::stringstream ss; + ss << "Python object of type " << cpp_type_name << " is not None and is not a " + << expected_type_name << " object"; + auto s = Status::TypeError(ss.str()); + + Py_DECREF(type_name); + Py_DECREF(type); + return s; +} + Status PandasConverter::ConvertDates(std::shared_ptr* out) { PyAcquireGIL lock; @@ -388,8 +412,10 @@ Status PandasConverter::ConvertDates(std::shared_ptr* out) { if (PyDate_CheckExact(obj)) { PyDateTime_Date* pydate = reinterpret_cast(obj); date_builder.Append(PyDate_to_ms(pydate)); - } else { + } else if (PyObject_is_null(obj)) { date_builder.AppendNull(); + } else { + return InvalidConversion(obj, "date"); } } return date_builder.Finish(out); @@ -430,14 +456,18 @@ Status PandasConverter::ConvertBooleans(std::shared_ptr* out) { memset(bitmap, 0, nbytes); int64_t null_count = 0; + PyObject* obj; for (int64_t i = 0; i < length_; ++i) { - if (objects[i] == Py_True) { + obj = objects[i]; + if (obj == Py_True) { BitUtil::SetBit(bitmap, i); BitUtil::SetBit(null_bitmap_data_, i); - } else if (objects[i] != Py_False) { + } else if (obj == Py_False) { + BitUtil::SetBit(null_bitmap_data_, i); + } else if (PyObject_is_null(obj)) { ++null_count; } else { - BitUtil::SetBit(null_bitmap_data_, i); + return InvalidConversion(obj, "bool"); } } @@ -496,7 +526,8 @@ Status PandasConverter::ConvertObjects(std::shared_ptr* out) { } else if (PyDate_CheckExact(objects[i])) { return ConvertDates(out); } else { - return Status::TypeError("unhandled python type"); + return InvalidConversion( + const_cast(objects[i]), "string, bool, or date"); } } } diff --git a/cpp/src/arrow/python/pandas_convert.h b/cpp/src/arrow/python/pandas_convert.h index 12644d98da156..047b5adce1811 100644 --- a/cpp/src/arrow/python/pandas_convert.h +++ b/cpp/src/arrow/python/pandas_convert.h @@ -73,6 +73,9 @@ ARROW_EXPORT Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, const std::shared_ptr& type, std::shared_ptr* out); +ARROW_EXPORT +Status InvalidConversion(PyObject* obj, const std::string& expected_type_name); + } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index b22f07dd6345f..697d47c541003 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -38,9 +38,10 @@ namespace arrow { #define ARROW_LOG_INTERNAL(level) ::arrow::internal::CerrLog(level) #define ARROW_LOG(level) ARROW_LOG_INTERNAL(ARROW_##level) -#define ARROW_CHECK(condition) \ - (condition) ? 0 : ::arrow::internal::FatalLog(ARROW_FATAL) \ - << __FILE__ << __LINE__ << " Check failed: " #condition " " +#define ARROW_CHECK(condition) \ + (condition) ? 0 \ + : ::arrow::internal::FatalLog(ARROW_FATAL) \ + << __FILE__ << __LINE__ << " Check failed: " #condition " " #ifdef NDEBUG #define ARROW_DFATAL ARROW_WARNING diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 7915f9766bf67..6ef17478e88d3 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -144,3 +144,8 @@ def test_list_of_int(self): assert arr.null_count == 1 assert arr.type == pyarrow.list_(pyarrow.int64()) assert arr.to_pylist() == data + + def test_mixed_types_fails(self): + data = ['a', 1, 2.0] + with self.assertRaises(pyarrow.error.ArrowException): + pyarrow.from_pylist(data) diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index ea7a892a6f2a4..be6a5d5927b40 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -381,3 +381,8 @@ def test_category(self): ] for values in arrays: self._check_array_roundtrip(values) + + def test_mixed_types_fails(self): + data = pd.DataFrame({'a': ['a', 1, 2.0]}) + with self.assertRaises(A.error.ArrowException): + A.Table.from_pandas(data)