eerhardt commented on a change in pull request #10527:
URL: https://github.com/apache/arrow/pull/10527#discussion_r662454808
##########
File path: csharp/src/Apache.Arrow/RecordBatch.cs
##########
@@ -28,6 +28,8 @@ public partial class RecordBatch : IDisposable
public IEnumerable<IArrowArray> Arrays => _arrays;
public int Length { get; }
+ internal IReadOnlyList<IArrowArray> _Arrays =>
(IReadOnlyList<IArrowArray>)_arrays;
+
private readonly IMemoryOwner<byte> _memoryOwner;
private readonly IList<IArrowArray> _arrays;
Review comment:
```suggestion
internal IReadOnlyList<IArrowArray> ArrayList => _arrays;
private readonly IMemoryOwner<byte> _memoryOwner;
private readonly List<IArrowArray> _arrays;
```
1. Using the coding style from
https://github.com/dotnet/runtime/blob/main/docs/coding-guidelines/coding-style.md,
only fields should have `_` prefix.
2. We shouldn't need to cast here. Just have the private field as a
`List<IArrowArray>`.
##########
File path: csharp/src/Apache.Arrow/Ipc/ArrowStreamReaderImplementation.cs
##########
@@ -27,8 +27,10 @@ internal class ArrowStreamReaderImplementation :
ArrowReaderImplementation
public Stream BaseStream { get; }
private readonly bool _leaveOpen;
private readonly MemoryAllocator _allocator;
+ private protected bool HasReadInitialDictionary { get; set; }
- public ArrowStreamReaderImplementation(Stream stream, MemoryAllocator
allocator, bool leaveOpen)
+
+ public ArrowStreamReaderImplementation(Stream stream, MemoryAllocator
allocator, bool leaveOpen) : base()
Review comment:
```suggestion
public ArrowStreamReaderImplementation(Stream stream,
MemoryAllocator allocator, bool leaveOpen)
```
1. Needless extra line.
2. No need for the explicit call to base.
##########
File path: csharp/src/Apache.Arrow/Ipc/ArrowStreamReaderImplementation.cs
##########
@@ -43,13 +45,48 @@ protected override void Dispose(bool disposing)
}
}
+ protected void ReadInitialDictionaries()
Review comment:
```suggestion
private void ReadInitialDictionaries()
```
##########
File path: csharp/test/Apache.Arrow.Tests/TestData.cs
##########
@@ -21,12 +21,13 @@ namespace Apache.Arrow.Tests
{
public static class TestData
{
- public static RecordBatch CreateSampleRecordBatch(int length)
+ //TODO: Remove the createDictionaryArray argument after all
writer/reader supports DictionaryType serialization
Review comment:
What's the plan for addressing this TODO?
##########
File path: csharp/test/Apache.Arrow.Tests/ArrowStreamReaderTests.cs
##########
@@ -68,7 +68,7 @@ public async Task Ctor_MemoryPool_AllocatesFromPool(bool
shouldLeaveOpen)
ArrowStreamReader reader = new ArrowStreamReader(stream,
memoryPool, shouldLeaveOpen);
reader.ReadNextRecordBatch();
- Assert.Equal(1, memoryPool.Statistics.Allocations);
+ Assert.Equal(2, memoryPool.Statistics.Allocations);
Review comment:
Why change this test at all? If you want to add a new test for
dictionaries - we should add a new one, and leave the existing test.
##########
File path: csharp/src/Apache.Arrow/Arrays/DictionaryArray.cs
##########
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using System;
+using System.IO;
+using Apache.Arrow.Types;
+
+namespace Apache.Arrow
+{
+ public class DictionaryArray : Array
+ {
+ public IArrowArray Dictionary { get; }
+ public IArrowArray Indices { get; }
+ public ArrowBuffer IndicesBuffer => Data.Buffers[1];
+
+ public DictionaryArray(IArrowType dataType, int length,
+ ArrowBuffer valueOffsetsBuffer, IArrowArray value,
+ ArrowBuffer nullBitmapBuffer, int nullCount = 0, int offset = 0)
+ : this(new ArrayData(dataType, length, nullCount, offset,
+ new[] { nullBitmapBuffer, valueOffsetsBuffer }, new[] {
value.Data }, value.Data.Dictionary))
+ {
+ }
+
+ public DictionaryArray(ArrayData data) : base(data)
+ {
+ data.EnsureBufferCount(2);
+ data.EnsureDataType(ArrowTypeId.Dictionary);
+
+ var dicType = data.DataType as DictionaryType;
+ data.Dictionary.EnsureDataType(dicType.ValueType.TypeId);
Review comment:
I don't see a null check added here.
##########
File path: csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs
##########
@@ -73,13 +73,27 @@ internal static Schema GetSchema(Flatbuf.Schema schema)
return new Schema(fields, metadata, copyCollections: false);
}
- private static Field FieldFromFlatbuffer(Flatbuf.Field flatbufField)
+ private static Field FieldFromFlatbuffer(Flatbuf.Field flatbufField,
LazyCreator<DictionaryMemo> lazyDictionaryMemo)
{
Field[] childFields = flatbufField.ChildrenLength > 0 ? new
Field[flatbufField.ChildrenLength] : null;
for (int i = 0; i < flatbufField.ChildrenLength; i++)
{
Flatbuf.Field? childFlatbufField = flatbufField.Children(i);
- childFields[i] = FieldFromFlatbuffer(childFlatbufField.Value);
+ childFields[i] = FieldFromFlatbuffer(childFlatbufField.Value,
lazyDictionaryMemo);
+ }
+
+ Flatbuf.DictionaryEncoding? de = flatbufField.Dictionary;
Review comment:
(nit) Can you spell out `de`? It doesn't make a great reading experience
to figure out what `de` means.
```suggestion
Flatbuf.DictionaryEncoding? dictionaryEncoding =
flatbufField.Dictionary;
```
##########
File path: csharp/test/Apache.Arrow.Tests/TestData.cs
##########
@@ -21,12 +21,13 @@ namespace Apache.Arrow.Tests
{
public static class TestData
{
- public static RecordBatch CreateSampleRecordBatch(int length)
+ //TODO: Remove the createDictionaryArray argument after all
writer/reader supports DictionaryType serialization
+ public static RecordBatch CreateSampleRecordBatch(int length, bool
createDictionaryArray = false)
Review comment:
A general comment about `createDictionaryArray` - I think the existing
tests should be left alone. That way we are still testing as much as we can
without any dictionaries. Then we create new tests for dictionary support.
An easy way to do this is to change a bunch of tests from `[Fact]` to
`[Theory]` and pass in `true` and `false` into the test for `bool
createDictoinaryArray`. That way the test runs twice, once with a dictionary,
and once without. That should give us the best test coverage.
##########
File path: csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs
##########
@@ -52,14 +53,13 @@ public static Types.NumberType GetNumberType(int bitWidth,
bool signed)
$"{(signed ? "signed " : "unsigned")}
integer.");
}
- internal static Schema GetSchema(Flatbuf.Schema schema)
+ internal static Schema GetSchema(Flatbuf.Schema schema,
LazyCreator<DictionaryMemo> lazyDictionaryMemo)
Review comment:
Along with the comment about removing `LazyCreator`, this could be
changed to:
```suggestion
internal static Schema GetSchema(Flatbuf.Schema schema, ref
DictionaryMemo dictionaryMemo)
```
And then if this code needs to use the `dictionaryMemo` and it is null, it
gets created when it is needed. And the `ref` gets set.
##########
File path: csharp/src/Apache.Arrow/Ipc/ArrowStreamReaderImplementation.cs
##########
@@ -43,13 +45,48 @@ protected override void Dispose(bool disposing)
}
}
+ protected void ReadInitialDictionaries()
+ {
+ if (HasReadInitialDictionary)
+ {
+ return;
+ }
+
+ if (_lazyDictionaryMemo.IsCreated) {
+ int fieldCount = _lazyDictionaryMemo.Instance.GetFieldCount();
+ for (int i = 0; i < fieldCount; ++i)
+ {
+ ReadArrowObject();
+ }
+ }
+ HasReadInitialDictionary = true;
+ }
+
+ protected async ValueTask
ReadInitialDictionariesAsync(CancellationToken cancellationToken = default)
Review comment:
```suggestion
private async ValueTask
ReadInitialDictionariesAsync(CancellationToken cancellationToken)
```
##########
File path: csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs
##########
@@ -73,13 +73,27 @@ internal static Schema GetSchema(Flatbuf.Schema schema)
return new Schema(fields, metadata, copyCollections: false);
}
- private static Field FieldFromFlatbuffer(Flatbuf.Field flatbufField)
+ private static Field FieldFromFlatbuffer(Flatbuf.Field flatbufField,
LazyCreator<DictionaryMemo> lazyDictionaryMemo)
{
Field[] childFields = flatbufField.ChildrenLength > 0 ? new
Field[flatbufField.ChildrenLength] : null;
for (int i = 0; i < flatbufField.ChildrenLength; i++)
{
Flatbuf.Field? childFlatbufField = flatbufField.Children(i);
- childFields[i] = FieldFromFlatbuffer(childFlatbufField.Value);
+ childFields[i] = FieldFromFlatbuffer(childFlatbufField.Value,
lazyDictionaryMemo);
+ }
+
+ Flatbuf.DictionaryEncoding? de = flatbufField.Dictionary;
+ IArrowType type = GetFieldArrowType(flatbufField, childFields);
+
+ if (de.HasValue)
+ {
+ Flatbuf.Int? indexTypeAsInt = de.Value.IndexType;
+ if (!indexTypeAsInt.HasValue)
+ {
+ throw new InvalidDataException("Dictionary type not
defined");
Review comment:
Would this be a more correct message?
```suggestion
throw new InvalidDataException("Dictionary IndexType not
defined");
```
##########
File path: csharp/src/Apache.Arrow/Ipc/ArrowStreamReaderImplementation.cs
##########
@@ -59,6 +96,66 @@ protected async ValueTask<RecordBatch>
ReadRecordBatchAsync(CancellationToken ca
{
await ReadSchemaAsync().ConfigureAwait(false);
+ await ReadInitialDictionariesAsync().ConfigureAwait(false);
+
+ return await ReadArrowObjectAsync().ConfigureAwait(false);
Review comment:
```suggestion
await
ReadInitialDictionariesAsync(cancellationToken).ConfigureAwait(false);
return await ReadArrowObjectAsync(cancellationToken
).ConfigureAwait(false);
```
##########
File path: csharp/test/Apache.Arrow.Tests/ArrowStreamWriterTests.cs
##########
@@ -32,7 +32,7 @@ public class ArrowStreamWriterTests
[Fact]
public void Ctor_LeaveOpenDefault_StreamClosedOnDispose()
{
- RecordBatch originalBatch =
TestData.CreateSampleRecordBatch(length: 100);
+ RecordBatch originalBatch =
TestData.CreateSampleRecordBatch(length: 100, createDictionaryArray: true);
Review comment:
I'd prefer if these were left as-is. There is no reason to create
dictionary arrays for these dispose tests.
##########
File path: csharp/src/Apache.Arrow/Ipc/DictionaryMemo.cs
##########
@@ -0,0 +1,113 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using System;
+using System.Collections.Generic;
+using Apache.Arrow.Types;
+
+namespace Apache.Arrow.Ipc
+{
+ class DictionaryMemo
+ {
+ private Dictionary<long, IArrowArray> _idToDictionary;
+ private Dictionary<long, IArrowType> _idToValueType;
+ private Dictionary<Field, long> _fieldToId;
+
+ public DictionaryMemo()
+ {
+ _idToDictionary = new Dictionary<long, IArrowArray>();
+ _idToValueType = new Dictionary<long, IArrowType>();
+ _fieldToId = new Dictionary<Field, long>();
+ }
+
+ public IArrowType GetDictionaryType(long id)
+ {
+ if (!_idToValueType.TryGetValue(id, out IArrowType type))
+ {
+ throw new ArgumentException($"Dictionary with id {id} not
found");
+ }
+ return type;
+ }
+
+ public IArrowArray GetDictionary(long id)
+ {
+ if (!_idToDictionary.TryGetValue(id, out IArrowArray dictionary))
+ {
+ throw new ArgumentException($"Dictionary with id {id} not
found");
+ }
+ return dictionary;
+ }
+
+ public void AddField(long id, Field field)
+ {
+ if (_fieldToId.ContainsKey(field))
+ {
+ throw new ArgumentException($"Field {field.Name} is already in
Memo");
+ }
+
+ if (field.DataType.TypeId != ArrowTypeId.Dictionary)
+ {
+ throw new ArgumentException($"Field type is not
DictionaryType: Name={field.Name}, {field.DataType.Name}");
+ }
+
+ IArrowType valueType = ((DictionaryType)field.DataType).ValueType;
+
+ if (_idToValueType.TryGetValue(id, out IArrowType valueTypeInDic))
+ {
+ if (valueType != valueTypeInDic)
+ {
+ throw new ArgumentException($"Field type
{field.DataType.Name} does not match the existing type {valueTypeInDic})");
+ }
+ }
+
+ _fieldToId.Add(field, id);
+ _idToValueType.Add(id, valueType);
+ }
+
+ public long GetId(Field field)
+ {
+ if (!_fieldToId.TryGetValue(field, out long id))
+ {
+ throw new ArgumentException($"Field with name {field.Name} not
found");
+ }
+ return id;
+ }
+
+ public long GetOrAssignId(Field field)
+ {
+ if (!_fieldToId.TryGetValue(field, out long id))
+ {
+ id = _fieldToId.Count + 1;
+ AddField(id, field);
+ }
+ return id;
+ }
+
+ public void AddOrReplaceDictionary(long id, IArrowArray dictionary)
+ {
+ _idToDictionary[id] = dictionary;
+ }
+
+ public void AddDictionaryDelta(long id, IArrowArray dictionary)
Review comment:
This hasn't been deleted (yet)
##########
File path: csharp/src/Apache.Arrow/Ipc/ArrowStreamReaderImplementation.cs
##########
@@ -43,13 +45,48 @@ protected override void Dispose(bool disposing)
}
}
+ protected void ReadInitialDictionaries()
+ {
+ if (HasReadInitialDictionary)
+ {
+ return;
+ }
+
+ if (_lazyDictionaryMemo.IsCreated) {
+ int fieldCount = _lazyDictionaryMemo.Instance.GetFieldCount();
+ for (int i = 0; i < fieldCount; ++i)
+ {
+ ReadArrowObject();
+ }
+ }
+ HasReadInitialDictionary = true;
+ }
+
+ protected async ValueTask
ReadInitialDictionariesAsync(CancellationToken cancellationToken = default)
+ {
+ if (HasReadInitialDictionary)
+ {
+ return;
+ }
+
+ int fieldCount = _lazyDictionaryMemo.Instance.GetFieldCount();
+ for (int i = 0; i < fieldCount; ++i)
+ {
+ await
ReadArrowObjectAsync(cancellationToken).ConfigureAwait(false);
+ }
+
+ HasReadInitialDictionary = true;
+ }
+
public override async ValueTask<RecordBatch>
ReadNextRecordBatchAsync(CancellationToken cancellationToken)
{
// TODO: Loop until a record batch is read.
cancellationToken.ThrowIfCancellationRequested();
return await
ReadRecordBatchAsync(cancellationToken).ConfigureAwait(false);
}
+
Review comment:
These empty lines are unnecessary
##########
File path: csharp/src/Apache.Arrow/Ipc/ArrowStreamReaderImplementation.cs
##########
@@ -27,8 +27,10 @@ internal class ArrowStreamReaderImplementation :
ArrowReaderImplementation
public Stream BaseStream { get; }
private readonly bool _leaveOpen;
private readonly MemoryAllocator _allocator;
+ private protected bool HasReadInitialDictionary { get; set; }
Review comment:
```suggestion
private bool HasReadInitialDictionary { get; set; }
```
Nothing outside this class uses this property.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]