Skip to content

Commit

Permalink
ARROW-736: Mixed-type object DataFrame columns should not silently co…
Browse files Browse the repository at this point in the history
…erce to an Arrow type by default
  • Loading branch information
cpcloud committed Mar 30, 2017
1 parent 4938d8d commit da635ce
Show file tree
Hide file tree
Showing 9 changed files with 118 additions and 15 deletions.
4 changes: 2 additions & 2 deletions cpp/src/arrow/ipc/ipc-read-write-benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ static void BM_WriteRecordBatch(benchmark::State& state) { // NOLINT non-const
int32_t metadata_length;
int64_t body_length;
if (!ipc::WriteRecordBatch(*record_batch, 0, &stream, &metadata_length, &body_length,
default_memory_pool())
default_memory_pool())
.ok()) {
state.SkipWithError("Failed to write!");
}
Expand All @@ -101,7 +101,7 @@ static void BM_ReadRecordBatch(benchmark::State& state) { // NOLINT non-const r
int32_t metadata_length;
int64_t body_length;
if (!ipc::WriteRecordBatch(*record_batch, 0, &stream, &metadata_length, &body_length,
default_memory_pool())
default_memory_pool())
.ok()) {
state.SkipWithError("Failed to write!");
}
Expand Down
2 changes: 2 additions & 0 deletions cpp/src/arrow/python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -90,4 +90,6 @@ install(FILES
if (ARROW_BUILD_TESTS)
ADD_ARROW_TEST(pandas-test
STATIC_LINK_LIBS "${ARROW_PYTHON_TEST_LINK_LIBS}")
ADD_ARROW_TEST(builtin-convert-test
STATIC_LINK_LIBS "${ARROW_PYTHON_TEST_LINK_LIBS}")
endif()
60 changes: 60 additions & 0 deletions cpp/src/arrow/python/builtin-convert-test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "gtest/gtest.h"

#include <memory>

#include "arrow/array.h"
#include "arrow/builder.h"
#include "arrow/python/builtin_convert.h"
#include "arrow/table.h"
#include "arrow/test-util.h"

namespace arrow {
namespace py {

TEST(BuiltinConversionTest, TestMixedTypeFails) {
PyAcquireGIL lock;
MemoryPool* pool = default_memory_pool();
std::shared_ptr<Array> arr;

PyObject* list = PyList_New(3);
ASSERT_NE(list, nullptr);

PyObject* str = PyUnicode_FromString("abc");
ASSERT_NE(str, nullptr);

PyObject* integer = PyLong_FromLong(1234L);
ASSERT_NE(integer, nullptr);

PyObject* doub = PyFloat_FromDouble(123.0234);
ASSERT_NE(doub, nullptr);

// This steals a reference to each object, so we don't need to decref them later
// just the list
ASSERT_EQ(PyList_SetItem(list, 0, str), 0);
ASSERT_EQ(PyList_SetItem(list, 1, integer), 0);
ASSERT_EQ(PyList_SetItem(list, 2, doub), 0);

ASSERT_RAISES(UnknownError, ConvertPySequence(list, pool, &arr));

Py_DECREF(list);
}

} // namespace py
} // namespace arrow
4 changes: 0 additions & 4 deletions cpp/src/arrow/python/pandas-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,12 @@

#include "gtest/gtest.h"

#include <cstdint>
#include <memory>
#include <string>
#include <vector>

#include "arrow/array.h"
#include "arrow/builder.h"
#include "arrow/table.h"
#include "arrow/test-util.h"
#include "arrow/type.h"

#include "arrow/python/common.h"
#include "arrow/python/pandas_convert.h"
Expand Down
43 changes: 37 additions & 6 deletions cpp/src/arrow/python/pandas_convert.cc
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,10 @@ Status AppendObjectStrings(StringBuilder& string_builder, PyObject** objects,
*have_bytes = true;
const int32_t length = static_cast<int32_t>(PyBytes_GET_SIZE(obj));
RETURN_NOT_OK(string_builder.Append(PyBytes_AS_STRING(obj), length));
} else {
} else if (PyObject_is_null(obj)) {
string_builder.AppendNull();
} else {
return InvalidConversion(obj, "string or bytes");
}
}

Expand Down Expand Up @@ -374,6 +376,28 @@ inline Status PandasConverter::ConvertData<BooleanType>(std::shared_ptr<Buffer>*
return Status::OK();
}

Status InvalidConversion(PyObject* obj, const std::string& expected_type_name) {
PyObject* type = PyObject_Type(obj);
RETURN_IF_PYERROR();

PyObject* type_name = PyObject_GetAttrString(type, "__name__");
if (PyErr_Occurred()) { Py_DECREF(type); }
RETURN_IF_PYERROR();

Py_ssize_t size;
const char* bytes = PyUnicode_AsUTF8AndSize(type_name, &size);
std::string cpp_type_name(bytes, size);

std::stringstream ss;
ss << "Python object of type " << cpp_type_name << " is not None and is not a "
<< expected_type_name << " object";
auto s = Status::TypeError(ss.str());

Py_DECREF(type_name);
Py_DECREF(type);
return s;
}

Status PandasConverter::ConvertDates(std::shared_ptr<Array>* out) {
PyAcquireGIL lock;

Expand All @@ -388,8 +412,10 @@ Status PandasConverter::ConvertDates(std::shared_ptr<Array>* out) {
if (PyDate_CheckExact(obj)) {
PyDateTime_Date* pydate = reinterpret_cast<PyDateTime_Date*>(obj);
date_builder.Append(PyDate_to_ms(pydate));
} else {
} else if (PyObject_is_null(obj)) {
date_builder.AppendNull();
} else {
return InvalidConversion(obj, "date");
}
}
return date_builder.Finish(out);
Expand Down Expand Up @@ -430,14 +456,18 @@ Status PandasConverter::ConvertBooleans(std::shared_ptr<Array>* out) {
memset(bitmap, 0, nbytes);

int64_t null_count = 0;
PyObject* obj;
for (int64_t i = 0; i < length_; ++i) {
if (objects[i] == Py_True) {
obj = objects[i];
if (obj == Py_True) {
BitUtil::SetBit(bitmap, i);
BitUtil::SetBit(null_bitmap_data_, i);
} else if (objects[i] != Py_False) {
} else if (obj == Py_False) {
BitUtil::SetBit(null_bitmap_data_, i);
} else if (PyObject_is_null(obj)) {
++null_count;
} else {
BitUtil::SetBit(null_bitmap_data_, i);
return InvalidConversion(obj, "bool");
}
}

Expand Down Expand Up @@ -496,7 +526,8 @@ Status PandasConverter::ConvertObjects(std::shared_ptr<Array>* out) {
} else if (PyDate_CheckExact(objects[i])) {
return ConvertDates(out);
} else {
return Status::TypeError("unhandled python type");
return InvalidConversion(
const_cast<PyObject*>(objects[i]), "string, bool, or date");
}
}
}
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/arrow/python/pandas_convert.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ ARROW_EXPORT
Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out);

ARROW_EXPORT
Status InvalidConversion(PyObject* obj, const std::string& expected_type_name);

} // namespace py
} // namespace arrow

Expand Down
7 changes: 4 additions & 3 deletions cpp/src/arrow/util/logging.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,10 @@ namespace arrow {
#define ARROW_LOG_INTERNAL(level) ::arrow::internal::CerrLog(level)
#define ARROW_LOG(level) ARROW_LOG_INTERNAL(ARROW_##level)

#define ARROW_CHECK(condition) \
(condition) ? 0 : ::arrow::internal::FatalLog(ARROW_FATAL) \
<< __FILE__ << __LINE__ << " Check failed: " #condition " "
#define ARROW_CHECK(condition) \
(condition) ? 0 \
: ::arrow::internal::FatalLog(ARROW_FATAL) \
<< __FILE__ << __LINE__ << " Check failed: " #condition " "

#ifdef NDEBUG
#define ARROW_DFATAL ARROW_WARNING
Expand Down
5 changes: 5 additions & 0 deletions python/pyarrow/tests/test_convert_builtin.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,3 +144,8 @@ def test_list_of_int(self):
assert arr.null_count == 1
assert arr.type == pyarrow.list_(pyarrow.int64())
assert arr.to_pylist() == data

def test_mixed_types_fails(self):
data = ['a', 1, 2.0]
with self.assertRaises(pyarrow.error.ArrowException):
pyarrow.from_pylist(data)
5 changes: 5 additions & 0 deletions python/pyarrow/tests/test_convert_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,3 +381,8 @@ def test_category(self):
]
for values in arrays:
self._check_array_roundtrip(values)

def test_mixed_types_fails(self):
data = pd.DataFrame({'a': ['a', 1, 2.0]})
with self.assertRaises(A.error.ArrowException):
A.Table.from_pandas(data)

0 comments on commit da635ce

Please sign in to comment.