eerhardt commented on a change in pull request #10527:
URL: https://github.com/apache/arrow/pull/10527#discussion_r666462335
##########
File path: csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs
##########
@@ -30,6 +30,13 @@ internal abstract class ArrowReaderImplementation :
IDisposable
public Schema Schema { get; protected set; }
protected bool HasReadSchema => Schema != null;
+ private protected DictionaryMemo _dictionaryMemo;
+ private protected DictionaryMemo DictionaryMemo => _dictionaryMemo ??=
new DictionaryMemo();
+
+ public ArrowReaderImplementation()
+ {
+ }
+
Review comment:
This is unnecessary. It can be removed
```suggestion
```
##########
File path: csharp/src/Apache.Arrow/Ipc/ArrowStreamReaderImplementation.cs
##########
@@ -43,6 +45,42 @@ protected override void Dispose(bool disposing)
}
}
+ private void ReadInitialDictionaries()
+ {
+ if (HasReadInitialDictionary)
+ {
+ return;
+ }
+
+ if (HasCreatedDictionaryMemo)
+ {
+ int fieldCount = DictionaryMemo.GetFieldCount();
+ for (int i = 0; i < fieldCount; ++i)
+ {
+ ReadArrowObject();
+ }
+ }
+ HasReadInitialDictionary = true;
+ }
+
+ private async ValueTask ReadInitialDictionariesAsync(CancellationToken
cancellationToken)
Review comment:
(nit) the other methods in this file have the `Async` method first, and
then the synchronous method after it. Can this go above
`ReadInitialDictionaries()`?
##########
File path: csharp/src/Apache.Arrow/Ipc/ArrowStreamReaderImplementation.cs
##########
@@ -43,6 +45,42 @@ protected override void Dispose(bool disposing)
}
}
+ private void ReadInitialDictionaries()
+ {
+ if (HasReadInitialDictionary)
+ {
+ return;
+ }
+
+ if (HasCreatedDictionaryMemo)
+ {
+ int fieldCount = DictionaryMemo.GetFieldCount();
+ for (int i = 0; i < fieldCount; ++i)
+ {
+ ReadArrowObject();
+ }
+ }
+ HasReadInitialDictionary = true;
+ }
+
+ private async ValueTask ReadInitialDictionariesAsync(CancellationToken
cancellationToken)
+ {
+ if (HasReadInitialDictionary)
+ {
+ return;
+ }
+
+ if (HasCreatedDictionaryMemo)
+ {
+ int fieldCount = DictionaryMemo.GetFieldCount();
+ for (int i = 0; i < fieldCount; ++i)
Review comment:
```suggestion
for (int i = 0; i < fieldCount; i++)
```
##########
File path: csharp/src/Apache.Arrow/Ipc/ArrowStreamReaderImplementation.cs
##########
@@ -43,6 +45,42 @@ protected override void Dispose(bool disposing)
}
}
+ private void ReadInitialDictionaries()
+ {
+ if (HasReadInitialDictionary)
+ {
+ return;
+ }
+
+ if (HasCreatedDictionaryMemo)
+ {
+ int fieldCount = DictionaryMemo.GetFieldCount();
+ for (int i = 0; i < fieldCount; ++i)
Review comment:
(nit) - the coding style is to use `i++` everywhere. Can we be
consistent here and below?
```suggestion
for (int i = 0; i < fieldCount; i++)
```
##########
File path: csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs
##########
@@ -73,13 +73,27 @@ internal static Schema GetSchema(Flatbuf.Schema schema)
return new Schema(fields, metadata, copyCollections: false);
}
- private static Field FieldFromFlatbuffer(Flatbuf.Field flatbufField)
+ private static Field FieldFromFlatbuffer(Flatbuf.Field flatbufField,
ref DictionaryMemo dictionaryMemo)
{
Field[] childFields = flatbufField.ChildrenLength > 0 ? new
Field[flatbufField.ChildrenLength] : null;
for (int i = 0; i < flatbufField.ChildrenLength; i++)
{
Flatbuf.Field? childFlatbufField = flatbufField.Children(i);
- childFields[i] = FieldFromFlatbuffer(childFlatbufField.Value);
+ childFields[i] = FieldFromFlatbuffer(childFlatbufField.Value,
ref dictionaryMemo);
+ }
+
+ Flatbuf.DictionaryEncoding? dictionaryEncoding =
flatbufField.Dictionary;
+ IArrowType type = GetFieldArrowType(flatbufField, childFields);
+
+ if (dictionaryEncoding.HasValue)
+ {
+ Flatbuf.Int? indexTypeAsInt =
dictionaryEncoding.Value.IndexType;
+ if (!indexTypeAsInt.HasValue)
Review comment:
The format doc says
> If this field is null, the indices must be signed int32.
https://github.com/apache/arrow/blob/0219e9a198b201df852b4219816752b36f116825/format/Schema.fbs#L328-L333
##########
File path: csharp/src/Apache.Arrow/Ipc/ArrowStreamReaderImplementation.cs
##########
@@ -59,6 +97,75 @@ protected async ValueTask<RecordBatch>
ReadRecordBatchAsync(CancellationToken ca
{
await ReadSchemaAsync().ConfigureAwait(false);
+ await
ReadInitialDictionariesAsync(cancellationToken).ConfigureAwait(false);
+
+ return await
ReadArrowObjectAsync(cancellationToken).ConfigureAwait(false);
+ }
+
+
+ protected RecordBatch ReadRecordBatch()
+ {
+ ReadSchema();
+
+ ReadInitialDictionaries();
+
+ return ReadArrowObject();
Review comment:
The
[spec](https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format)
says
> DictionaryBatch and RecordBatch messages may be interleaved
This appears to only support DictionaryBatches above RecordBatches. If we
read all the "initial" dictionaries, and then read a RecordBatch, the next time
`ReadRecordBatch()` is called, if there is a DictionaryBatch, `null` will be
returned. Is that intentional?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]