Skip to content

Commit

Permalink
fix: use utf8 over c data interface
Browse files Browse the repository at this point in the history
  • Loading branch information
wjones127 committed Feb 28, 2023
1 parent c8d0c52 commit d4aff85
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 18 deletions.
31 changes: 18 additions & 13 deletions csharp/src/Apache.Arrow/C/CArrowSchema.cs
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,8 @@ namespace Apache.Arrow.C
[StructLayout(LayoutKind.Sequential)]
unsafe public struct CArrowSchema
{
[MarshalAs(UnmanagedType.LPStr)]
public string format;
[MarshalAs(UnmanagedType.LPStr)]
public string name;
public IntPtr format;
public IntPtr name;
public IntPtr metadata;
public long flags;
public long n_children;
Expand Down Expand Up @@ -125,8 +123,8 @@ private static IntPtr ConstructDictionary(IArrowType datatype)
/// <param name="datatype">The Arrow type to export.</param>
public CArrowSchema(IArrowType datatype)
{
format = GetFormat(datatype);
name = null;
format = StringUtil.ToCStringUtf8(GetFormat(datatype));
name = IntPtr.Zero;
metadata = IntPtr.Zero;
flags = GetFlags(datatype);

Expand All @@ -138,6 +136,11 @@ public CArrowSchema(IArrowType datatype)
release = (IntPtr self) =>
{
var schema = Marshal.PtrToStructure<CArrowSchema>(self);

Marshal.FreeHGlobal(schema.format);
Marshal.FreeHGlobal(schema.name);
Marshal.FreeHGlobal(schema.metadata);

if (schema.n_children > 0)
{
for (int i = 0; i < schema.n_children; i++)
Expand All @@ -163,7 +166,7 @@ public CArrowSchema(IArrowType datatype)
/// <param name="field">Field to export.</param>
public CArrowSchema(Field field) : this(field.DataType)
{
name = field.Name;
name = StringUtil.ToCStringUtf8(field.Name);
// TODO: field metadata
metadata = IntPtr.Zero;
flags = GetFlags(field.DataType, field.IsNullable);
Expand Down Expand Up @@ -423,9 +426,10 @@ public void Dispose()

public ArrowType GetAsType()
{
var format = StringUtil.PtrToStringUtf8(_data.format);
if (_data.dictionary != IntPtr.Zero)
{
ArrowType indicesType = _data.format switch
ArrowType indicesType = format switch
{
"c" => new Int8Type(),
"C" => new UInt8Type(),
Expand All @@ -435,7 +439,7 @@ public ArrowType GetAsType()
"I" => new UInt32Type(),
"l" => new Int64Type(),
"L" => new UInt64Type(),
_ => throw new InvalidDataException($"Indices must be an integer, but got format string {_data.format}"),
_ => throw new InvalidDataException($"Indices must be an integer, but got format string {format}"),
};

var dictionarySchema = new ImportedArrowSchema(_data.dictionary, /*is_root*/ false);
Expand All @@ -447,7 +451,7 @@ public ArrowType GetAsType()
}

// Special handling for nested types
if (_data.format == "+l")
if (format == "+l")
{
if (_data.n_children != 1)
{
Expand All @@ -467,7 +471,7 @@ public ArrowType GetAsType()

return new ListType(childField);
}
else if (_data.format == "+s")
else if (format == "+s")
{
var child_schemas = new ImportedArrowSchema[_data.n_children];
unsafe
Expand All @@ -489,7 +493,7 @@ public ArrowType GetAsType()
}
// TODO: Map type and large list type

return _data.format switch
return format switch
{
// Primitives
"n" => new NullType(),
Expand Down Expand Up @@ -530,7 +534,8 @@ public ArrowType GetAsType()

public Field GetAsField()
{
string fieldName = string.IsNullOrEmpty(_data.name) ? "" : _data.name;
string name = StringUtil.PtrToStringUtf8(_data.name);
string fieldName = string.IsNullOrEmpty(name) ? "" : name;

bool nullable = (_data.flags & CArrowSchema.ArrowFlagNullable) == CArrowSchema.ArrowFlagNullable;

Expand Down
15 changes: 10 additions & 5 deletions csharp/test/Apache.Arrow.Tests/CDataInterfaceSchemaTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ private static Schema GetTestSchema()
.Field(f => f.Name("dict_string_ordered").DataType(new DictionaryType(Int32Type.Default, StringType.Default, true)).Nullable(false))
.Field(f => f.Name("list_dict_string").DataType(new ListType(new DictionaryType(Int32Type.Default, StringType.Default, false))).Nullable(false))

// Checking wider characters.
.Field(f => f.Name("hello 你好 😄").DataType(BooleanType.Default).Nullable(true))

.Build();
return schema;
}
Expand Down Expand Up @@ -115,6 +118,8 @@ private static IEnumerable<dynamic> GetPythonFields()
yield return pa.field("dict_string", pa.dictionary(pa.int32(), pa.utf8(), false), false);
yield return pa.field("dict_string_ordered", pa.dictionary(pa.int32(), pa.utf8(), true), false);
yield return pa.field("list_dict_string", pa.list_(pa.dictionary(pa.int32(), pa.utf8(), false)), false);

yield return pa.field("hello 你好 😄", pa.bool_(), true);
}
}

Expand Down Expand Up @@ -226,10 +231,10 @@ public void ExportType()
Assert.True(exportedPyType == expectedPyType);
}

// Python should have called release once `exported_py_type` went out-of-scope.
// Python should have called release once `exportedPyType` went out-of-scope.
var cSchema = Marshal.PtrToStructure<CArrowSchema>(exportedPtr);
Assert.Null(cSchema.release);
Assert.Null(cSchema.format);
Assert.Equal(IntPtr.Zero, cSchema.format);
Assert.Equal(0, cSchema.flags);
Assert.Equal(0, cSchema.n_children);
Assert.Equal(IntPtr.Zero, cSchema.dictionary);
Expand Down Expand Up @@ -260,11 +265,11 @@ public void ExportField()
Assert.True(exportedPyField == pyField);
}

// Python should have called release once `exported_py_type` went out-of-scope.
// Python should have called release once `exportedPyField` went out-of-scope.
var ffiSchema = Marshal.PtrToStructure<CArrowSchema>(exportedPtr);
Assert.Null(ffiSchema.name);
Assert.Equal(IntPtr.Zero, ffiSchema.name);
Assert.Null(ffiSchema.release);
Assert.Null(ffiSchema.format);
Assert.Equal(IntPtr.Zero, ffiSchema.format);

// Since we allocated, we are responsible for freeing the pointer.
CArrowSchema.FreePtr(exportedPtr);
Expand Down

0 comments on commit d4aff85

Please sign in to comment.