Skip to content

Commit

Permalink
ARROW-736: [Python] Mixed-type object DataFrame columns should not si…
Browse files Browse the repository at this point in the history
…lently co…

…erce to an Arrow type by default

Author: Phillip Cloud <[email protected]>

Closes apache#465 from cpcloud/ARROW-736 and squashes the following commits:

fd09def [Phillip Cloud] Update cmake
bcf6236 [Phillip Cloud] Rename and move
4a18014 [Phillip Cloud] Move test
e80efe1 [Phillip Cloud] Use OwnedRef instead of horror
b2df3e9 [Phillip Cloud] Fix python error handling and make compatible with python27
84d33b4 [Phillip Cloud] ARROW-736: Mixed-type object DataFrame columns should not silently coerce to an Arrow type by default
  • Loading branch information
cpcloud authored and leifwalsh committed Apr 1, 2017
1 parent 1964ea1 commit 7b55570
Show file tree
Hide file tree
Showing 6 changed files with 91 additions and 12 deletions.
2 changes: 1 addition & 1 deletion cpp/src/arrow/python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,6 @@ install(FILES
# INSTALL_RPATH "\$ORIGIN")

if (ARROW_BUILD_TESTS)
ADD_ARROW_TEST(pandas-test
ADD_ARROW_TEST(python-test
STATIC_LINK_LIBS "${ARROW_PYTHON_TEST_LINK_LIBS}")
endif()
52 changes: 45 additions & 7 deletions cpp/src/arrow/python/pandas_convert.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
#include "arrow/type_fwd.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit-util.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"

namespace arrow {
Expand Down Expand Up @@ -167,8 +168,10 @@ Status AppendObjectStrings(int64_t objects_length, StringBuilder* builder,
*have_bytes = true;
const int32_t length = static_cast<int32_t>(PyBytes_GET_SIZE(obj));
RETURN_NOT_OK(builder->Append(PyBytes_AS_STRING(obj), length));
} else if (PyObject_is_null(obj)) {
RETURN_NOT_OK(builder->AppendNull());
} else {
builder->AppendNull();
return InvalidConversion(obj, "string or bytes");
}
}

Expand Down Expand Up @@ -197,8 +200,10 @@ static Status AppendObjectFixedWidthBytes(int64_t objects_length, int byte_width
RETURN_NOT_OK(CheckPythonBytesAreFixedLength(obj, byte_width));
RETURN_NOT_OK(
builder->Append(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj))));
} else if (PyObject_is_null(obj)) {
RETURN_NOT_OK(builder->AppendNull());
} else {
builder->AppendNull();
return InvalidConversion(obj, "string or bytes");
}
}

Expand Down Expand Up @@ -413,6 +418,32 @@ inline Status PandasConverter::ConvertData<BooleanType>(std::shared_ptr<Buffer>*
return Status::OK();
}

Status InvalidConversion(PyObject* obj, const std::string& expected_type_name) {
OwnedRef type(PyObject_Type(obj));
RETURN_IF_PYERROR();
DCHECK_NE(type.obj(), nullptr);

OwnedRef type_name(PyObject_GetAttrString(type.obj(), "__name__"));
RETURN_IF_PYERROR();
DCHECK_NE(type_name.obj(), nullptr);

OwnedRef bytes_obj(PyUnicode_AsUTF8String(type_name.obj()));
RETURN_IF_PYERROR();
DCHECK_NE(bytes_obj.obj(), nullptr);

Py_ssize_t size = PyBytes_GET_SIZE(bytes_obj.obj());
const char* bytes = PyBytes_AS_STRING(bytes_obj.obj());

DCHECK_NE(bytes, nullptr) << "bytes from type(...).__name__ were null";

std::string cpp_type_name(bytes, size);

std::stringstream ss;
ss << "Python object of type " << cpp_type_name << " is not None and is not a "
<< expected_type_name << " object";
return Status::TypeError(ss.str());
}

Status PandasConverter::ConvertDates(std::shared_ptr<Array>* out) {
PyAcquireGIL lock;

Expand All @@ -427,8 +458,10 @@ Status PandasConverter::ConvertDates(std::shared_ptr<Array>* out) {
if (PyDate_CheckExact(obj)) {
PyDateTime_Date* pydate = reinterpret_cast<PyDateTime_Date*>(obj);
date_builder.Append(PyDate_to_ms(pydate));
} else {
} else if (PyObject_is_null(obj)) {
date_builder.AppendNull();
} else {
return InvalidConversion(obj, "date");
}
}
return date_builder.Finish(out);
Expand Down Expand Up @@ -483,14 +516,18 @@ Status PandasConverter::ConvertBooleans(std::shared_ptr<Array>* out) {
memset(bitmap, 0, nbytes);

int64_t null_count = 0;
PyObject* obj;
for (int64_t i = 0; i < length_; ++i) {
if (objects[i] == Py_True) {
obj = objects[i];
if (obj == Py_True) {
BitUtil::SetBit(bitmap, i);
BitUtil::SetBit(null_bitmap_data_, i);
} else if (objects[i] != Py_False) {
} else if (obj == Py_False) {
BitUtil::SetBit(null_bitmap_data_, i);
} else if (PyObject_is_null(obj)) {
++null_count;
} else {
BitUtil::SetBit(null_bitmap_data_, i);
return InvalidConversion(obj, "bool");
}
}

Expand Down Expand Up @@ -551,7 +588,8 @@ Status PandasConverter::ConvertObjects(std::shared_ptr<Array>* out) {
} else if (PyDate_CheckExact(objects[i])) {
return ConvertDates(out);
} else {
return Status::TypeError("unhandled python type");
return InvalidConversion(
const_cast<PyObject*>(objects[i]), "string, bool, or date");
}
}
}
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/arrow/python/pandas_convert.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <Python.h>

#include <memory>
#include <string>

#include "arrow/util/visibility.h"

Expand Down Expand Up @@ -73,6 +74,9 @@ ARROW_EXPORT
Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out);

ARROW_EXPORT
Status InvalidConversion(PyObject* obj, const std::string& expected_type_name);

} // namespace py
} // namespace arrow

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,18 @@

#include "gtest/gtest.h"

#include <cstdint>
#include <memory>
#include <string>
#include <vector>

#include <Python.h>

#include "arrow/array.h"
#include "arrow/builder.h"
#include "arrow/table.h"
#include "arrow/test-util.h"
#include "arrow/type.h"

#include "arrow/python/common.h"
#include "arrow/python/pandas_convert.h"
#include "arrow/python/builtin_convert.h"

namespace arrow {
namespace py {
Expand Down Expand Up @@ -65,5 +64,33 @@ TEST(PandasConversionTest, TestObjectBlockWriteFails) {
Py_END_ALLOW_THREADS;
}

TEST(BuiltinConversionTest, TestMixedTypeFails) {
PyAcquireGIL lock;
MemoryPool* pool = default_memory_pool();
std::shared_ptr<Array> arr;

OwnedRef list_ref(PyList_New(3));
PyObject* list = list_ref.obj();

ASSERT_NE(list, nullptr);

PyObject* str = PyUnicode_FromString("abc");
ASSERT_NE(str, nullptr);

PyObject* integer = PyLong_FromLong(1234L);
ASSERT_NE(integer, nullptr);

PyObject* doub = PyFloat_FromDouble(123.0234);
ASSERT_NE(doub, nullptr);

// This steals a reference to each object, so we don't need to decref them later
// just the list
ASSERT_EQ(PyList_SetItem(list, 0, str), 0);
ASSERT_EQ(PyList_SetItem(list, 1, integer), 0);
ASSERT_EQ(PyList_SetItem(list, 2, doub), 0);

ASSERT_RAISES(UnknownError, ConvertPySequence(list, pool, &arr));
}

} // namespace py
} // namespace arrow
5 changes: 5 additions & 0 deletions python/pyarrow/tests/test_convert_builtin.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,3 +157,8 @@ def test_list_of_int(self):
assert arr.null_count == 1
assert arr.type == pyarrow.list_(pyarrow.int64())
assert arr.to_pylist() == data

def test_mixed_types_fails(self):
data = ['a', 1, 2.0]
with self.assertRaises(pyarrow.error.ArrowException):
pyarrow.from_pylist(data)
5 changes: 5 additions & 0 deletions python/pyarrow/tests/test_convert_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,3 +398,8 @@ def test_category(self):
]
for values in arrays:
self._check_array_roundtrip(values)

def test_mixed_types_fails(self):
data = pd.DataFrame({'a': ['a', 1, 2.0]})
with self.assertRaises(A.error.ArrowException):
A.Table.from_pandas(data)

0 comments on commit 7b55570

Please sign in to comment.