CurtHagenlocher commented on code in PR #35496: URL: https://github.com/apache/arrow/pull/35496#discussion_r1222401359
########## csharp/src/Apache.Arrow/C/CArrowArrayStreamImporter.cs: ########## @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +using System; +using System.Threading; +using System.Threading.Tasks; +using Apache.Arrow.Ipc; + +namespace Apache.Arrow.C +{ + public static class CArrowArrayStreamImporter + { + /// <summary> + /// Import C pointer as an <see cref="IArrowArrayStream"/>. + /// </summary> + /// <remarks> + /// This will call the release callback on the passed struct if the function fails. + /// Otherwise, the release callback is called when the IArrowArrayStream is disposed. + /// </remarks> + /// <examples> + /// Typically, you will allocate an uninitialized CArrowArrayStream pointer, + /// pass that to external function, and then use this method to import + /// the result. 
+ /// + /// <code> + /// CArrowArrayStream* importedPtr = CArrowArrayStream.Create(); + /// foreign_export_function(importedPtr); + /// IArrowArrayStream importedStream = CArrowArrayStreamImporter.ImportStream(importedPtr); + /// </code> + /// </examples> + public static unsafe IArrowArrayStream ImportArrayStream(CArrowArrayStream* ptr) + { + return new ImportedArrowArrayStream(ptr); + } + + private sealed unsafe class ImportedArrowArrayStream : IArrowArrayStream + { + private readonly CArrowArrayStream* _cArrayStream; + private readonly Schema _schema; + private bool _disposed; + + public ImportedArrowArrayStream(CArrowArrayStream* cArrayStream) + { + if (cArrayStream == null) + { + throw new ArgumentNullException(nameof(cArrayStream)); + } + _cArrayStream = cArrayStream; + if (_cArrayStream->release == null) + { + throw new ArgumentException("Tried to import an array stream that has already been released.", nameof(cArrayStream)); + } + + CArrowSchema* cSchema = CArrowSchema.Create(); + try + { + int errno = _cArrayStream->get_schema(_cArrayStream, cSchema); + if (errno != 0) + { + throw new Exception($"Unexpected error recieved from external stream. Errno: {errno}"); + } + _schema = CArrowSchemaImporter.ImportSchema(cSchema); + } + finally + { + if (_schema == null) + { + CArrowSchema.Free(cSchema); + } + } + } + + ~ImportedArrowArrayStream() + { + Dispose(); + } + + public Schema Schema => _schema; + + public ValueTask<RecordBatch> ReadNextRecordBatchAsync(CancellationToken cancellationToken = default) + { + if (_disposed) + { + throw new ObjectDisposedException(typeof(ImportedArrowArrayStream).Name); + } + + RecordBatch result = null; + CArrowArray* cArray = CArrowArray.Create(); Review Comment: Okay, my pattern matching was a bit too eager for the first problem. Yes, that looks like a leak and I think your suggestion is right; that on this code path the ImportedArrowArray needs to remember that it owns the allocation. 
The second problem feels different, because in the first case we always know that we were the ones who allocated the CArrowArray, but I'm not entirely sure we know that about the CArrowArrayStream. It's true for the stream we got from Python in the test case, but that was using a pyarrow-specific API. The flavor of the C API as a whole does suggest that the caller will usually have to allocate the CArrowArrayStream, without (I think) quite making that explicit. On the whole, I suspect that both importers should take a flag which says whether or not to deallocate the structure afterwards. I'm not convinced about the right default for the flag, given the relative risks of leaking memory vs. deallocating it inappropriately. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
