eerhardt commented on a change in pull request #10527: URL: https://github.com/apache/arrow/pull/10527#discussion_r666462335
########## File path: csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs ########## @@ -30,6 +30,13 @@ internal abstract class ArrowReaderImplementation : IDisposable public Schema Schema { get; protected set; } protected bool HasReadSchema => Schema != null; + private protected DictionaryMemo _dictionaryMemo; + private protected DictionaryMemo DictionaryMemo => _dictionaryMemo ??= new DictionaryMemo(); + + public ArrowReaderImplementation() + { + } + Review comment: This is unnecessary. It can be removed ```suggestion ``` ########## File path: csharp/src/Apache.Arrow/Ipc/ArrowStreamReaderImplementation.cs ########## @@ -43,6 +45,42 @@ protected override void Dispose(bool disposing) } } + private void ReadInitialDictionaries() + { + if (HasReadInitialDictionary) + { + return; + } + + if (HasCreatedDictionaryMemo) + { + int fieldCount = DictionaryMemo.GetFieldCount(); + for (int i = 0; i < fieldCount; ++i) + { + ReadArrowObject(); + } + } + HasReadInitialDictionary = true; + } + + private async ValueTask ReadInitialDictionariesAsync(CancellationToken cancellationToken) Review comment: (nit) the other methods in this file have the `Async` method first, and then the synchronous method after it. Can this go above `ReadInitialDictionaries()`? ########## File path: csharp/src/Apache.Arrow/Ipc/ArrowStreamReaderImplementation.cs ########## @@ -43,6 +45,42 @@ protected override void Dispose(bool disposing) } } + private void ReadInitialDictionaries() + { + if (HasReadInitialDictionary) + { + return; + } + + if (HasCreatedDictionaryMemo) + { + int fieldCount = DictionaryMemo.GetFieldCount(); + for (int i = 0; i < fieldCount; ++i) + { + ReadArrowObject(); + } + } + HasReadInitialDictionary = true; + } + + private async ValueTask ReadInitialDictionariesAsync(CancellationToken cancellationToken) + { + if (HasReadInitialDictionary) + { + return; + } + + if (HasCreatedDictionaryMemo) + { + int fieldCount = DictionaryMemo.GetFieldCount(); + for (int i = 0; i < fieldCount; ++i) Review comment: ```suggestion for (int i = 0; i < fieldCount; i++) ``` ########## File path: csharp/src/Apache.Arrow/Ipc/ArrowStreamReaderImplementation.cs ########## @@ -43,6 +45,42 @@ protected override void Dispose(bool disposing) } } + private void ReadInitialDictionaries() + { + if (HasReadInitialDictionary) + { + return; + } + + if (HasCreatedDictionaryMemo) + { + int fieldCount = DictionaryMemo.GetFieldCount(); + for (int i = 0; i < fieldCount; ++i) Review comment: (nit) - the coding style is to use `i++` everywhere. Can we be consistent here and below? ```suggestion for (int i = 0; i < fieldCount; i++) ``` ########## File path: csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs ########## @@ -73,13 +73,27 @@ internal static Schema GetSchema(Flatbuf.Schema schema) return new Schema(fields, metadata, copyCollections: false); } - private static Field FieldFromFlatbuffer(Flatbuf.Field flatbufField) + private static Field FieldFromFlatbuffer(Flatbuf.Field flatbufField, ref DictionaryMemo dictionaryMemo) { Field[] childFields = flatbufField.ChildrenLength > 0 ? new Field[flatbufField.ChildrenLength] : null; for (int i = 0; i < flatbufField.ChildrenLength; i++) { Flatbuf.Field? childFlatbufField = flatbufField.Children(i); - childFields[i] = FieldFromFlatbuffer(childFlatbufField.Value); + childFields[i] = FieldFromFlatbuffer(childFlatbufField.Value, ref dictionaryMemo); + } + + Flatbuf.DictionaryEncoding? dictionaryEncoding = flatbufField.Dictionary; + IArrowType type = GetFieldArrowType(flatbufField, childFields); + + if (dictionaryEncoding.HasValue) + { + Flatbuf.Int? indexTypeAsInt = dictionaryEncoding.Value.IndexType; + if (!indexTypeAsInt.HasValue) Review comment: The format doc says > If this field is null, the indices must be signed int32. https://github.com/apache/arrow/blob/0219e9a198b201df852b4219816752b36f116825/format/Schema.fbs#L328-L333 ########## File path: csharp/src/Apache.Arrow/Ipc/ArrowStreamReaderImplementation.cs ########## @@ -59,6 +97,75 @@ protected async ValueTask<RecordBatch> ReadRecordBatchAsync(CancellationToken ca { await ReadSchemaAsync().ConfigureAwait(false); + await ReadInitialDictionariesAsync(cancellationToken).ConfigureAwait(false); + + return await ReadArrowObjectAsync(cancellationToken).ConfigureAwait(false); + } + + + protected RecordBatch ReadRecordBatch() + { + ReadSchema(); + + ReadInitialDictionaries(); + + return ReadArrowObject(); Review comment: The [spec](https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format) says > DictionaryBatch and RecordBatch messages may be interleaved This appears to only support DictionaryBatches above RecordBatches. If we read all the "initial" dictionaries, and then read a RecordBatch, the next time `ReadRecordBatch()` is called, if there is a DictionaryBatch, `null` will be returned. Is that intentional? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org