Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-33856: [C#] Implement C Data Interface for C# #35496

Merged
merged 14 commits into from
May 22, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions csharp/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ public static IArrowArray BuildArray(ArrayData data)
{
switch (data.DataType.TypeId)
{
case ArrowTypeId.Null:
return new NullArray(data);
case ArrowTypeId.Boolean:
return new BooleanArray(data);
case ArrowTypeId.UInt8:
Expand Down
42 changes: 42 additions & 0 deletions csharp/src/Apache.Arrow/Arrays/NullArray.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

using Apache.Arrow.Types;

namespace Apache.Arrow
{
/// <summary>
/// An array of the Arrow null type. It carries no buffers, and every slot is reported as null.
/// </summary>
public class NullArray : IArrowArray
{
    /// <summary>
    /// The underlying array data this instance was constructed from.
    /// </summary>
    public ArrayData Data { get; }

    /// <summary>
    /// Wrap <paramref name="data"/> as a null array.
    /// </summary>
    /// <param name="data">Array data that must have the null type id and zero buffers.</param>
    public NullArray(ArrayData data)
    {
        data.EnsureDataType(ArrowTypeId.Null);
        data.EnsureBufferCount(0);

        // Without this assignment the get-only Data property stays null and
        // Length, Offset and NullCount all fail with a NullReferenceException.
        Data = data;
    }

    public int Length => Data.Length;

    public int Offset => Data.Offset;

    public int NullCount => Data.NullCount;

    /// <summary>
    /// Nothing to release: this class holds no resources of its own.
    /// </summary>
    public void Dispose() { }

    /// <summary>Always <c>true</c>, regardless of <paramref name="index"/>.</summary>
    public bool IsNull(int index) => true;

    /// <summary>Always <c>false</c>, regardless of <paramref name="index"/>.</summary>
    public bool IsValid(int index) => false;

    /// <summary>
    /// Visitor dispatch is not implemented for null arrays yet.
    /// </summary>
    /// <exception cref="System.NotImplementedException">Always thrown.</exception>
    public void Accept(IArrowArrayVisitor visitor) => throw new System.NotImplementedException();
}
}
19 changes: 19 additions & 0 deletions csharp/src/Apache.Arrow/ArrowBuffer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -72,5 +72,24 @@ public void Dispose()
{
_memoryOwner?.Dispose();
}

/// <summary>
/// Attempt to hand this buffer's memory over to <paramref name="newOwner"/> so it can be exposed
/// through a native pointer.
/// </summary>
/// <param name="newOwner">The owner that receives the acquired allocation.</param>
/// <param name="ptr">
/// On success, the address of the buffer's first byte (or <see cref="IntPtr.Zero"/> for an empty,
/// owner-less buffer). On failure, <see cref="IntPtr.Zero"/>.
/// </param>
/// <returns><c>true</c> if the buffer could be exported; otherwise <c>false</c>.</returns>
internal bool TryExport(ExportedAllocationOwner newOwner, out IntPtr ptr)
{
    // An empty buffer without an owner has no memory to transfer.
    if (_memoryOwner == null && IsEmpty)
    {
        ptr = IntPtr.Zero;
        return true;
    }

    IOwnableAllocation ownable = _memoryOwner as IOwnableAllocation;
    bool acquired = false;
    IntPtr allocationStart = IntPtr.Zero;
    int offset = 0;
    int length = 0;
    if (ownable != null)
    {
        acquired = ownable.TryAcquire(out allocationStart, out offset, out length);
    }

    if (!acquired)
    {
        ptr = IntPtr.Zero;
        return false;
    }

    // The new owner tracks the whole allocation; callers get the address of the data itself.
    newOwner.Acquire(allocationStart, offset, length);
    ptr = allocationStart + offset;
    return true;
}
}
}
84 changes: 84 additions & 0 deletions csharp/src/Apache.Arrow/C/CArrowArray.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

using System;
using System.Runtime.InteropServices;

namespace Apache.Arrow.C
{
/// <summary>
/// An Arrow C Data Interface array, which represents the data in an exported array or record batch.
/// </summary>
/// <remarks>
/// This is used to export <see cref="RecordBatch"/> or <see cref="IArrowArray"/> to other languages. It matches
/// the layout of the ArrowArray struct described in https://github.com/apache/arrow/blob/main/cpp/src/arrow/c/abi.h.
/// </remarks>
[StructLayout(LayoutKind.Sequential)]
public unsafe struct CArrowArray
{
    public long length;
    public long null_count;
    public long offset;
    public long n_buffers;
    public long n_children;
    public byte** buffers;
    public CArrowArray** children;
    public CArrowArray* dictionary;
    public delegate* unmanaged[Stdcall]<CArrowArray*, void> release;
    public void* private_data;

    /// <summary>
    /// Allocate and zero-initialize an unmanaged pointer of this type.
    /// </summary>
    /// <remarks>
    /// This pointer must later be freed by <see cref="Free"/>.
    /// </remarks>
    /// <returns>A pointer to a structure whose fields are all zero/null.</returns>
    public static CArrowArray* Create()
    {
        var ptr = (CArrowArray*)Marshal.AllocHGlobal(sizeof(CArrowArray));

        // AllocHGlobal returns uninitialized memory; clear every field in one assignment.
        *ptr = default;

        return ptr;
    }

    /// <summary>
    /// Free a pointer that was allocated in <see cref="Create"/>.
    /// </summary>
    /// <remarks>
    /// Do not call this on a pointer that was allocated elsewhere. If the structure still has a
    /// release callback, that callback is invoked before the memory is freed. Passing a null
    /// pointer is a no-op.
    /// </remarks>
    /// <param name="schema">The array structure to free (the name is kept for source compatibility).</param>
    public static void Free(CArrowArray* schema)
    {
        if (schema == null)
        {
            // Nothing was allocated, so there is nothing to release or free.
            return;
        }

        if (schema->release != null)
        {
            // Call release if not already called.
            schema->release(schema);
        }
        Marshal.FreeHGlobal((IntPtr)schema);
    }
}
}
210 changes: 210 additions & 0 deletions csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


using System;
using System.Runtime.InteropServices;
using Apache.Arrow.Memory;

namespace Apache.Arrow.C
{
public static class CArrowArrayExporter
{
private unsafe delegate void ReleaseArrowArray(CArrowArray* cArray);

/// <summary>
/// Export an <see cref="IArrowArray"/> to a <see cref="CArrowArray"/>. Whether or not the
/// export succeeds, the original array becomes invalid. Clone an array to continue using it
/// after a copy has been exported.
/// </summary>
/// <param name="array">The array to export</param>
/// <param name="cArray">An allocated but uninitialized CArrowArray pointer.</param>
/// <example>
/// <code>
/// CArrowArray* exportPtr = CArrowArray.Create();
/// CArrowArrayExporter.ExportArray(array, exportPtr);
/// foreign_import_function(exportPtr);
/// </code>
/// </example>
public static unsafe void ExportArray(IArrowArray array, CArrowArray* cArray)
{
if (array == null)
{
throw new ArgumentNullException(nameof(array));
}
if (cArray == null)
{
throw new ArgumentNullException(nameof(cArray));
}
if (cArray->release != null)
{
throw new ArgumentException("Cannot export array to a struct that is already initialized.", nameof(cArray));
}
Comment on lines +52 to +55
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wouldn't mandate this, since the user can call this with a local uninitialized struct ArrowArray variable (if called from raw C rather than, say, Python).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was following the existing pattern in CArrowSchemaExporter. But I also think that the documentation, in saying that "A released structure is indicated by setting its release callback to NULL. Before reading and interpreting a structure’s data, consumers SHOULD check for a NULL release callback and treat it accordingly (probably by erroring out)." suggests that this check is appropriate.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then CArrowSchemaExporter should probably be modified as well.

This is producer code, not consumer code. At this point, the structure is still uninitialized. "Uninitialized" in the C (or C++) sense, that is "may contain any arbitrary bytes", not "zero-initialized".

Producer code therefore shouldn't care about what is already in the structure.

cc @paleolimbot

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is part of why C is awful ;).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed as part of #35996.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm sorry I missed this and I see that it's been solved. In nanoarrow we definitely assume that pointer output arguments point to uninitialized memory (and strive to not touch that memory until failure is impossible). There are a few places where we do something like

struct ArrowArray tmp;
tmp.release = NULL;
// stuff with tmp that might fail
if (had_error) {
  if (tmp.release != NULL) {
    tmp.release(&tmp);
  }

  return;
}

ArrowArrayMove(&tmp, out);
return;

...to simplify (to the extent that anything in C is simple) the error handling.


ExportedAllocationOwner allocationOwner = new ExportedAllocationOwner();
try
{
ConvertArray(allocationOwner, array.Data, cArray);
cArray->release = (delegate* unmanaged[Stdcall]<CArrowArray*, void>)Marshal.GetFunctionPointerForDelegate<ReleaseArrowArray>(ReleaseArray);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can do this another way. If you add [UnmanagedCallersOnly] to the private unsafe static void ReleaseArray(CArrowArray* cArray) method, then this line should just be:

Suggested change
cArray->release = (delegate* unmanaged[Stdcall]<CArrowArray*, void>)Marshal.GetFunctionPointerForDelegate<ReleaseArrowArray>(ReleaseArray);
cArray->release = (delegate* unmanaged[Stdcall]<CArrowArray*, void>)&ReleaseArray;

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This applies for all the function pointers we need to set on these structs.

Copy link
Contributor Author

@CurtHagenlocher CurtHagenlocher May 10, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would still work correctly if an array were exported via the C API to another bit of managed code which consumed it (via the C API)?

Edited: the bigger problem is that I would still like to support netstandard20, where UnmanagedCallersOnly isn't available. Is it worth using conditional compilation to optimize for .NET 5+?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would still work correctly if an array were exported via the C API to another bit of managed code which consumed it (via the C API)?

Yes, because the release pointer is an unmanaged function pointer, and not a managed delegate. It is a bit hard to explain. But even though the caller is "managed" (i.e. .NET code), the way the method is invoked is through an unmanaged function pointer. So it is still an "UnmanagedCaller".

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it worth using conditional compilation to optimize for .NET 5+?

Maybe try writing a benchmark test in https://github.com/apache/arrow/tree/main/csharp/test/Apache.Arrow.Benchmarks to compare the difference? If we see major differences, it may make sense to split the compilation.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, looking at this deeper, the current code may have problems since the managed Delegate might be GC'd.

See https://learn.microsoft.com/en-us/dotnet/api/system.runtime.interopservices.marshal.getfunctionpointerfordelegate?view=net-8.0

You must manually keep the delegate from being collected by the garbage collector from managed code. The garbage collector does not track references to unmanaged code.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, you'd think I'd have remembered this given that I pointed it out in a private email last month :/.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I've taken care of the lifetime issues and would probably file a work item to test for optimization.

cArray->private_data = FromDisposable(allocationOwner);
allocationOwner = null;
}
finally
{
allocationOwner?.Dispose();
}
}

// Marshal.GetFunctionPointerForDelegate does not keep its delegate alive. Holding the delegate in a
// static field guarantees that the function pointer stored in an exported record batch stays valid
// for as long as a consumer may call it.
private static unsafe readonly ReleaseArrowArray s_releaseRecordBatch = ReleaseArray;

/// <summary>
/// Export a <see cref="RecordBatch"/> to a <see cref="CArrowArray"/>. Whether or not the
/// export succeeds, the original record batch becomes invalid. Clone the batch to continue using it
/// after a copy has been exported.
/// </summary>
/// <param name="batch">The record batch to export</param>
/// <param name="cArray">An allocated CArrowArray pointer whose release callback is null.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="batch"/> or <paramref name="cArray"/> is null.
/// </exception>
/// <exception cref="ArgumentException">
/// <paramref name="cArray"/> already has a release callback set.
/// </exception>
/// <example>
/// <code>
/// CArrowArray* exportPtr = CArrowArray.Create();
/// CArrowArrayExporter.ExportRecordBatch(batch, exportPtr);
/// foreign_import_function(exportPtr);
/// </code>
/// </example>
public static unsafe void ExportRecordBatch(RecordBatch batch, CArrowArray* cArray)
{
    if (batch == null)
    {
        throw new ArgumentNullException(nameof(batch));
    }
    if (cArray == null)
    {
        throw new ArgumentNullException(nameof(cArray));
    }
    if (cArray->release != null)
    {
        throw new ArgumentException("Cannot export array to a struct that is already initialized.", nameof(cArray));
    }

    ExportedAllocationOwner allocationOwner = new ExportedAllocationOwner();
    try
    {
        ConvertRecordBatch(allocationOwner, batch, cArray);

        // Use the rooted delegate so the native thunk cannot be reclaimed by the garbage collector.
        cArray->release = (delegate* unmanaged[Stdcall]<CArrowArray*, void>)Marshal.GetFunctionPointerForDelegate<ReleaseArrowArray>(s_releaseRecordBatch);
        cArray->private_data = FromDisposable(allocationOwner);

        // Ownership now belongs to the exported structure; skip the cleanup in the finally block.
        allocationOwner = null;
    }
    finally
    {
        allocationOwner?.Dispose();
    }
}

// Marshal.GetFunctionPointerForDelegate does not keep its delegate alive. Holding the delegate in a
// static field guarantees that every release pointer written below stays valid for as long as a
// consumer may call it.
private static unsafe readonly ReleaseArrowArray s_releaseArray = ReleaseArray;

/// <summary>
/// Allocate an array of <paramref name="count"/> native pointers and set every entry to null, so a
/// partially populated array can always be walked safely by <see cref="ReleaseArray"/>.
/// </summary>
private unsafe static void** AllocatePointerArray(int count)
{
    void** pointers = (void**)Marshal.AllocCoTaskMem(checked(count * IntPtr.Size));
    for (int i = 0; i < count; i++)
    {
        pointers[i] = null;
    }
    return pointers;
}

/// <summary>
/// Fill <paramref name="cArray"/> from <paramref name="array"/>, recursively converting children and
/// the dictionary. Buffer memory is handed to <paramref name="sharedOwner"/>.
/// </summary>
/// <remarks>
/// If the conversion fails, everything allocated for <paramref name="cArray"/> so far is freed and the
/// structure is left in the released state (null release callback) before the exception propagates.
/// </remarks>
/// <exception cref="NotSupportedException">A buffer could not be exported.</exception>
private unsafe static void ConvertArray(ExportedAllocationOwner sharedOwner, ArrayData array, CArrowArray* cArray)
{
    // Start from an all-zero structure so that, at any point of failure, every pointer field is
    // either null or valid and ReleaseArray can clean up safely.
    *cArray = default;
    cArray->release = (delegate* unmanaged[Stdcall]<CArrowArray*, void>)Marshal.GetFunctionPointerForDelegate<ReleaseArrowArray>(s_releaseArray);

    try
    {
        cArray->length = array.Length;
        cArray->offset = array.Offset;
        cArray->null_count = array.NullCount;

        int bufferCount = array.Buffers?.Length ?? 0;
        if (bufferCount > 0)
        {
            cArray->buffers = (byte**)AllocatePointerArray(bufferCount);
            cArray->n_buffers = bufferCount;
            for (int i = 0; i < bufferCount; i++)
            {
                ArrowBuffer buffer = array.Buffers[i];
                if (!buffer.TryExport(sharedOwner, out IntPtr ptr))
                {
                    throw new NotSupportedException("Unable to export buffer: its memory could not be acquired for export.");
                }
                cArray->buffers[i] = (byte*)ptr;
            }
        }

        int childCount = array.Children?.Length ?? 0;
        if (childCount > 0)
        {
            cArray->children = (CArrowArray**)AllocatePointerArray(childCount);
            cArray->n_children = childCount;
            for (int i = 0; i < childCount; i++)
            {
                cArray->children[i] = CArrowArray.Create();
                ConvertArray(sharedOwner, array.Children[i], cArray->children[i]);
            }
        }

        if (array.Dictionary != null)
        {
            cArray->dictionary = CArrowArray.Create();
            ConvertArray(sharedOwner, array.Dictionary, cArray->dictionary);
        }
    }
    catch
    {
        // Do not leak the pointer arrays or child structures allocated before the failure.
        ReleaseArray(cArray);
        throw;
    }
}

/// <summary>
/// Fill <paramref name="cArray"/> from <paramref name="batch"/>: the batch becomes a parent structure
/// with one child per column and a single buffer slot that is set to null.
/// </summary>
/// <remarks>
/// If the conversion fails, everything allocated for <paramref name="cArray"/> so far is freed and the
/// structure is left in the released state (null release callback) before the exception propagates.
/// </remarks>
private unsafe static void ConvertRecordBatch(ExportedAllocationOwner sharedOwner, RecordBatch batch, CArrowArray* cArray)
{
    // Zeroing also covers offset, null_count, private_data and dictionary, which stay 0/null here.
    *cArray = default;
    cArray->release = (delegate* unmanaged[Stdcall]<CArrowArray*, void>)Marshal.GetFunctionPointerForDelegate<ReleaseArrowArray>(s_releaseArray);

    try
    {
        cArray->length = batch.Length;

        // The single buffer slot must be an explicit null pointer; leaving it uninitialized would
        // hand a garbage address to the consumer.
        cArray->buffers = (byte**)AllocatePointerArray(1);
        cArray->n_buffers = 1;

        int columnCount = batch.ColumnCount;
        if (columnCount > 0)
        {
            cArray->children = (CArrowArray**)AllocatePointerArray(columnCount);
            cArray->n_children = columnCount;
            int i = 0;
            foreach (IArrowArray child in batch.Arrays)
            {
                cArray->children[i] = CArrowArray.Create();
                ConvertArray(sharedOwner, child.Data, cArray->children[i]);
                i++;
            }
        }
    }
    catch
    {
        // Do not leak the pointer arrays or child structures allocated before the failure.
        ReleaseArray(cArray);
        throw;
    }
}

/// <summary>
/// Release callback stored in exported structures. Frees the child and dictionary structures, the
/// pointer arrays allocated during conversion, and the private data, then marks the structure as
/// released by clearing its release callback.
/// </summary>
private unsafe static void ReleaseArray(CArrowArray* cArray)
{
    if (cArray->children != null)
    {
        for (long i = 0; i < cArray->n_children; i++)
        {
            if (cArray->children[i] != null)
            {
                // CArrowArray.Free invokes the child's release callback if it is still set and
                // then frees the structure that CArrowArray.Create allocated.
                CArrowArray.Free(cArray->children[i]);
                cArray->children[i] = null;
            }
        }
        Marshal.FreeCoTaskMem((IntPtr)cArray->children);
        cArray->children = null;
    }
    cArray->n_children = 0;

    if (cArray->dictionary != null)
    {
        CArrowArray.Free(cArray->dictionary);
        cArray->dictionary = null;
    }

    if (cArray->buffers != null)
    {
        // Only the array of pointers is freed here; the buffer memory itself belongs to the
        // allocation owner held in private_data.
        Marshal.FreeCoTaskMem((IntPtr)cArray->buffers);
        cArray->buffers = null;
    }
    cArray->n_buffers = 0;

    if (cArray->private_data != null)
    {
        Dispose(&cArray->private_data);
    }
    cArray->private_data = null;
    cArray->release = null;
}

/// <summary>
/// Allocate a <see cref="GCHandle"/> for <paramref name="disposable"/> and return it as an opaque
/// pointer that can be stored in a native structure.
/// </summary>
/// <param name="disposable">The object to reference from native memory.</param>
/// <returns>The handle's pointer representation; pass it to <see cref="Dispose"/> to free it.</returns>
private unsafe static void* FromDisposable(IDisposable disposable)
{
    IntPtr handleAddress = GCHandle.ToIntPtr(GCHandle.Alloc(disposable));
    return (void*)handleAddress;
}

/// <summary>
/// Dispose the object referenced by the <see cref="GCHandle"/> stored at <paramref name="ptr"/>,
/// free the handle, and clear the stored pointer.
/// </summary>
/// <param name="ptr">Address of a pointer previously produced by <see cref="FromDisposable"/>.</param>
private unsafe static void Dispose(void** ptr)
{
    IntPtr handleAddress = (IntPtr)(*ptr);
    GCHandle handle = GCHandle.FromIntPtr(handleAddress);

    IDisposable target = (IDisposable)handle.Target;
    target.Dispose();

    handle.Free();
    *ptr = null;
}
}
}
Loading