This is an automated email from the ASF dual-hosted git repository.
eerhardt pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 3e71ea0 ARROW-10719: [C#] ArrowStreamWriter doesn't write schema
metadata
3e71ea0 is described below
commit 3e71ea0e3f20ae3b883882d61f394fdca9577eb1
Author: Eric Erhardt <[email protected]>
AuthorDate: Tue Dec 15 21:42:42 2020 -0600
ARROW-10719: [C#] ArrowStreamWriter doesn't write schema metadata
Add support for reading and writing Schema and Field metadata
Also, reducing some allocations in a few spots.
Tagging people who have contributed to the C# library in the past year for
review. (Feel free to ignore if you don't have time.)
@pgovind @suhsteve @chutchinson @Ulimo @mr-smidge @zgramana @HashidaTKS
@nhustler
cc @olgacarpenter
Closes #8902 from eerhardt/Metadata
Authored-by: Eric Erhardt <[email protected]>
Signed-off-by: Eric Erhardt <[email protected]>
---
csharp/src/Apache.Arrow/Field.Builder.cs | 5 +-
csharp/src/Apache.Arrow/Field.cs | 18 ++++++-
csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs | 60 ++++++++++++++++++----
csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs | 39 +++++++++-----
csharp/src/Apache.Arrow/Schema.Builder.cs | 8 +--
csharp/src/Apache.Arrow/Schema.cs | 15 ++++++
.../Apache.Arrow.Tests/ArrowStreamWriterTests.cs | 36 ++++++++++++-
csharp/test/Apache.Arrow.Tests/FieldComparer.cs | 2 +-
csharp/test/Apache.Arrow.Tests/SchemaComparer.cs | 2 +-
csharp/test/Apache.Arrow.Tests/TestData.cs | 25 ++++-----
10 files changed, 159 insertions(+), 51 deletions(-)
diff --git a/csharp/src/Apache.Arrow/Field.Builder.cs
b/csharp/src/Apache.Arrow/Field.Builder.cs
index 6d820b6..1e7aa19 100644
--- a/csharp/src/Apache.Arrow/Field.Builder.cs
+++ b/csharp/src/Apache.Arrow/Field.Builder.cs
@@ -23,14 +23,13 @@ namespace Apache.Arrow
{
public class Builder
{
- private readonly Dictionary<string, string> _metadata;
+ private Dictionary<string, string> _metadata;
private string _name;
private IArrowType _type;
private bool _nullable;
public Builder()
{
- _metadata = new Dictionary<string, string>();
_name = string.Empty;
_type = NullType.Default;
_nullable = true;
@@ -66,6 +65,8 @@ namespace Apache.Arrow
throw new ArgumentNullException(nameof(key));
}
+ _metadata ??= new Dictionary<string, string>();
+
_metadata[key] = value;
return this;
}
diff --git a/csharp/src/Apache.Arrow/Field.cs b/csharp/src/Apache.Arrow/Field.cs
index 33aefb7..6e507b6 100644
--- a/csharp/src/Apache.Arrow/Field.cs
+++ b/csharp/src/Apache.Arrow/Field.cs
@@ -15,6 +15,7 @@
using System;
using System.Collections.Generic;
+using System.Diagnostics;
using System.Linq;
using Apache.Arrow.Types;
@@ -34,6 +35,22 @@ namespace Apache.Arrow
public Field(string name, IArrowType dataType, bool nullable,
IEnumerable<KeyValuePair<string, string>> metadata = default)
+ : this(name, dataType, nullable)
+ {
+ Metadata = metadata?.ToDictionary(kv => kv.Key, kv => kv.Value);
+
+ }
+
+ internal Field(string name, IArrowType dataType, bool nullable,
+ IReadOnlyDictionary<string, string> metadata, bool copyCollections)
+ : this(name, dataType, nullable)
+ {
+ Debug.Assert(copyCollections == false, "This internal constructor
is to not copy the collections.");
+
+ Metadata = metadata;
+ }
+
+ private Field(string name, IArrowType dataType, bool nullable)
{
if (string.IsNullOrWhiteSpace(name))
{
@@ -43,7 +60,6 @@ namespace Apache.Arrow
Name = name;
DataType = dataType ?? NullType.Default;
IsNullable = nullable;
- Metadata = metadata?.ToDictionary(kv => kv.Key, kv => kv.Value);
}
}
}
diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
index b42724c..3ad6ed2 100644
--- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
+++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
@@ -17,6 +17,7 @@ using System;
using System.Buffers;
using System.Buffers.Binary;
using System.Collections.Generic;
+using System.Diagnostics;
using System.IO;
using System.Threading;
using System.Threading.Tasks;
@@ -442,23 +443,28 @@ namespace Apache.Arrow.Ipc
private protected Offset<Flatbuf.Schema> SerializeSchema(Schema schema)
{
- // TODO: Serialize schema metadata
+ // Build metadata
+ VectorOffset metadataVectorOffset = default;
+ if (schema.HasMetadata)
+ {
+ Offset<Flatbuf.KeyValue>[] metadataOffsets =
GetMetadataOffsets(schema.Metadata);
+ metadataVectorOffset =
Flatbuf.Schema.CreateCustomMetadataVector(Builder, metadataOffsets);
+ }
// Build fields
-
var fieldOffsets = new Offset<Flatbuf.Field>[schema.Fields.Count];
-
for (int i = 0; i < fieldOffsets.Length; i++)
{
Field field = schema.GetFieldByIndex(i);
StringOffset fieldNameOffset =
Builder.CreateString(field.Name);
ArrowTypeFlatbufferBuilder.FieldType fieldType =
_fieldTypeBuilder.BuildFieldType(field);
- VectorOffset fieldChildrenVectorOffset =
Builder.CreateVectorOfTables(GetChildrenFieldOffsets(field));
+ VectorOffset fieldChildrenVectorOffset =
GetChildrenFieldOffset(field);
+ VectorOffset fieldMetadataVectorOffset =
GetFieldMetadataOffset(field);
fieldOffsets[i] = Flatbuf.Field.CreateField(Builder,
fieldNameOffset, field.IsNullable, fieldType.Type,
fieldType.Offset,
- default, fieldChildrenVectorOffset, default);
+ default, fieldChildrenVectorOffset,
fieldMetadataVectorOffset);
}
VectorOffset fieldsVectorOffset =
Flatbuf.Schema.CreateFieldsVector(Builder, fieldOffsets);
@@ -468,14 +474,14 @@ namespace Apache.Arrow.Ipc
Flatbuf.Endianness endianness = BitConverter.IsLittleEndian ?
Flatbuf.Endianness.Little : Flatbuf.Endianness.Big;
return Flatbuf.Schema.CreateSchema(
- Builder, endianness, fieldsVectorOffset);
+ Builder, endianness, fieldsVectorOffset, metadataVectorOffset);
}
- private protected Offset<Flatbuf.Field>[]
GetChildrenFieldOffsets(Field field)
+ private VectorOffset GetChildrenFieldOffset(Field field)
{
if (!(field.DataType is NestedType type))
{
- return System.Array.Empty<Offset<Flatbuf.Field>>();
+ return default;
}
int childrenCount = type.Fields.Count;
@@ -486,13 +492,45 @@ namespace Apache.Arrow.Ipc
Field childField = type.Fields[i];
StringOffset childFieldNameOffset =
Builder.CreateString(childField.Name);
ArrowTypeFlatbufferBuilder.FieldType childFieldType =
_fieldTypeBuilder.BuildFieldType(childField);
- VectorOffset childFieldChildrenVectorOffset =
Builder.CreateVectorOfTables(GetChildrenFieldOffsets(childField));
+
+ VectorOffset childFieldChildrenVectorOffset =
GetChildrenFieldOffset(childField);
+ VectorOffset childFieldMetadataVectorOffset =
GetFieldMetadataOffset(childField);
children[i] = Flatbuf.Field.CreateField(Builder,
childFieldNameOffset, childField.IsNullable,
childFieldType.Type, childFieldType.Offset,
- default, childFieldChildrenVectorOffset, default);
+ default, childFieldChildrenVectorOffset,
childFieldMetadataVectorOffset);
+ }
+
+ return Builder.CreateVectorOfTables(children);
+ }
+
+ private VectorOffset GetFieldMetadataOffset(Field field)
+ {
+ if (!field.HasMetadata)
+ {
+ return default;
+ }
+
+ Offset<Flatbuf.KeyValue>[] metadataOffsets =
GetMetadataOffsets(field.Metadata);
+ return Flatbuf.Field.CreateCustomMetadataVector(Builder,
metadataOffsets);
+ }
+
+ private Offset<Flatbuf.KeyValue>[]
GetMetadataOffsets(IReadOnlyDictionary<string, string> metadata)
+ {
+ Debug.Assert(metadata != null);
+ Debug.Assert(metadata.Count > 0);
+
+ Offset<Flatbuf.KeyValue>[] metadataOffsets = new
Offset<Flatbuf.KeyValue>[metadata.Count];
+ int index = 0;
+ foreach (KeyValuePair<string, string> metadatum in metadata)
+ {
+ StringOffset keyOffset = Builder.CreateString(metadatum.Key);
+ StringOffset valueOffset =
Builder.CreateString(metadatum.Value);
+
+ metadataOffsets[index++] =
Flatbuf.KeyValue.CreateKeyValue(Builder, keyOffset, valueOffset);
}
- return children;
+
+ return metadataOffsets;
}
private Offset<Flatbuf.Schema> WriteSchema(Schema schema)
diff --git a/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs
b/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs
index e86336a..b39e749 100644
--- a/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs
+++ b/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs
@@ -14,6 +14,7 @@
// limitations under the License.
using System;
+using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
@@ -53,31 +54,43 @@ namespace Apache.Arrow.Ipc
internal static Schema GetSchema(Flatbuf.Schema schema)
{
- var schemaBuilder = new Schema.Builder();
-
+ List<Field> fields = new List<Field>();
for (int i = 0; i < schema.FieldsLength; i++)
{
Flatbuf.Field field = schema.Fields(i).GetValueOrDefault();
- schemaBuilder.Field(FieldFromFlatbuffer(field));
+ fields.Add(FieldFromFlatbuffer(field));
+ }
+
+ Dictionary<string, string> metadata = schema.CustomMetadataLength
> 0 ? new Dictionary<string, string>() : null;
+ for (int i = 0; i < schema.CustomMetadataLength; i++)
+ {
+ Flatbuf.KeyValue keyValue =
schema.CustomMetadata(i).GetValueOrDefault();
+
+ metadata[keyValue.Key] = keyValue.Value;
}
- return schemaBuilder.Build();
+ return new Schema(fields, metadata, copyCollections: false);
}
private static Field FieldFromFlatbuffer(Flatbuf.Field flatbufField)
{
- Field[] childFields = null;
- if (flatbufField.ChildrenLength > 0)
+ Field[] childFields = flatbufField.ChildrenLength > 0 ? new
Field[flatbufField.ChildrenLength] : null;
+ for (int i = 0; i < flatbufField.ChildrenLength; i++)
{
- childFields = new Field[flatbufField.ChildrenLength];
- for (int i = 0; i < flatbufField.ChildrenLength; i++)
- {
- Flatbuf.Field? childFlatbufField =
flatbufField.Children(i);
- childFields[i] =
FieldFromFlatbuffer(childFlatbufField.Value);
- }
+ Flatbuf.Field? childFlatbufField = flatbufField.Children(i);
+ childFields[i] = FieldFromFlatbuffer(childFlatbufField.Value);
}
- return new Field(flatbufField.Name,
GetFieldArrowType(flatbufField, childFields), flatbufField.Nullable);
+
+ Dictionary<string, string> metadata =
flatbufField.CustomMetadataLength > 0 ? new Dictionary<string, string>() : null;
+ for (int i = 0; i < flatbufField.CustomMetadataLength; i++)
+ {
+ Flatbuf.KeyValue keyValue =
flatbufField.CustomMetadata(i).GetValueOrDefault();
+
+ metadata[keyValue.Key] = keyValue.Value;
+ }
+
+ return new Field(flatbufField.Name,
GetFieldArrowType(flatbufField, childFields), flatbufField.Nullable, metadata,
copyCollections: false);
}
private static Types.IArrowType GetFieldArrowType(Flatbuf.Field field,
Field[] childFields = null)
diff --git a/csharp/src/Apache.Arrow/Schema.Builder.cs
b/csharp/src/Apache.Arrow/Schema.Builder.cs
index 277728b..89e9e3c 100644
--- a/csharp/src/Apache.Arrow/Schema.Builder.cs
+++ b/csharp/src/Apache.Arrow/Schema.Builder.cs
@@ -20,22 +20,20 @@ namespace Apache.Arrow
{
public partial class Schema
{
-
public class Builder
{
private readonly List<Field> _fields;
- private readonly Dictionary<string, string> _metadata;
+ private Dictionary<string, string> _metadata;
public Builder()
{
_fields = new List<Field>();
- _metadata = new Dictionary<string, string>();
}
public Builder Clear()
{
_fields.Clear();
- _metadata.Clear();
+ _metadata?.Clear();
return this;
}
@@ -66,6 +64,8 @@ namespace Apache.Arrow
throw new ArgumentNullException(nameof(key));
}
+ _metadata ??= new Dictionary<string, string>();
+
_metadata[key] = value;
return this;
}
diff --git a/csharp/src/Apache.Arrow/Schema.cs
b/csharp/src/Apache.Arrow/Schema.cs
index f330fbf..59d1921 100644
--- a/csharp/src/Apache.Arrow/Schema.cs
+++ b/csharp/src/Apache.Arrow/Schema.cs
@@ -15,6 +15,7 @@
using System;
using System.Collections.Generic;
+using System.Diagnostics;
using System.Linq;
namespace Apache.Arrow
@@ -53,6 +54,20 @@ namespace Apache.Arrow
Metadata = metadata?.ToDictionary(kv => kv.Key, kv => kv.Value);
}
+ internal Schema(List<Field> fields, IReadOnlyDictionary<string,
string> metadata, bool copyCollections)
+ {
+ Debug.Assert(fields != null);
+ Debug.Assert(copyCollections == false, "This internal constructor
is to not copy the collections.");
+
+ _fields = fields;
+
+ _fieldsDictionary = fields.ToDictionary(
+ field => field.Name, field => field,
+ StringComparer.OrdinalIgnoreCase);
+
+ Metadata = metadata;
+ }
+
public Field GetFieldByIndex(int i)
{
return _fields[i];
diff --git a/csharp/test/Apache.Arrow.Tests/ArrowStreamWriterTests.cs
b/csharp/test/Apache.Arrow.Tests/ArrowStreamWriterTests.cs
index b65d735..44546f1 100644
--- a/csharp/test/Apache.Arrow.Tests/ArrowStreamWriterTests.cs
+++ b/csharp/test/Apache.Arrow.Tests/ArrowStreamWriterTests.cs
@@ -13,15 +13,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-using Apache.Arrow.Ipc;
-using Apache.Arrow.Types;
using System;
using System.Buffers.Binary;
+using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Sockets;
using System.Threading.Tasks;
+using Apache.Arrow.Ipc;
+using Apache.Arrow.Types;
using Xunit;
namespace Apache.Arrow.Tests
@@ -462,5 +463,36 @@ namespace Apache.Arrow.Tests
Assert.Equal(0, endOfBuffer2);
}
}
+
+ [Fact]
+ public void WritesMetadataCorrectly()
+ {
+ Schema.Builder schemaBuilder = new Schema.Builder()
+ .Metadata("index", "1, 2, 3, 4, 5")
+ .Metadata("reverseIndex", "5, 4, 3, 2, 1")
+ .Field(f => f
+ .Name("IntCol")
+ .DataType(UInt32Type.Default)
+ .Metadata("custom1", "false")
+ .Metadata("custom2", "true"))
+ .Field(f => f
+ .Name("StringCol")
+ .DataType(StringType.Default)
+ .Metadata("custom2", "false")
+ .Metadata("custom3", "4"))
+ .Field(f => f
+ .Name("StructCol")
+ .DataType(new StructType(new[] {
+ new Field("Inner1", FloatType.Default, nullable:
false),
+ new Field("Inner2", DoubleType.Default, nullable:
true, new Dictionary<string, string>() { { "customInner", "1" }, {
"customInner2", "3" } })
+ }))
+ .Metadata("custom4", "6.4")
+ .Metadata("custom1", "true"));
+
+ var schema = schemaBuilder.Build();
+ RecordBatch originalBatch =
TestData.CreateSampleRecordBatch(schema, length: 10);
+
+ TestRoundTripRecordBatch(originalBatch);
+ }
}
}
diff --git a/csharp/test/Apache.Arrow.Tests/FieldComparer.cs
b/csharp/test/Apache.Arrow.Tests/FieldComparer.cs
index fac5e05..d7dcc39 100644
--- a/csharp/test/Apache.Arrow.Tests/FieldComparer.cs
+++ b/csharp/test/Apache.Arrow.Tests/FieldComparer.cs
@@ -18,7 +18,7 @@ using Xunit;
namespace Apache.Arrow.Tests
{
- public class FieldComparer
+ public static class FieldComparer
{
public static void Compare(Field expected, Field actual)
{
diff --git a/csharp/test/Apache.Arrow.Tests/SchemaComparer.cs
b/csharp/test/Apache.Arrow.Tests/SchemaComparer.cs
index d1fc636..3546d5e 100644
--- a/csharp/test/Apache.Arrow.Tests/SchemaComparer.cs
+++ b/csharp/test/Apache.Arrow.Tests/SchemaComparer.cs
@@ -18,7 +18,7 @@ using Xunit;
namespace Apache.Arrow.Tests
{
- public class SchemaComparer
+ public static class SchemaComparer
{
public static void Compare(Schema expected, Schema actual)
{
diff --git a/csharp/test/Apache.Arrow.Tests/TestData.cs
b/csharp/test/Apache.Arrow.Tests/TestData.cs
index 0066f49..5688c62 100644
--- a/csharp/test/Apache.Arrow.Tests/TestData.cs
+++ b/csharp/test/Apache.Arrow.Tests/TestData.cs
@@ -58,6 +58,11 @@ namespace Apache.Arrow.Tests
Schema schema = builder.Build();
+ return CreateSampleRecordBatch(schema, length);
+ }
+
+ public static RecordBatch CreateSampleRecordBatch(Schema schema, int
length)
+ {
IEnumerable<IArrowArray> arrays = CreateArrays(schema, length);
return new RecordBatch(schema, arrays, length);
@@ -200,27 +205,15 @@ namespace Apache.Arrow.Tests
valueBuilder.Append(0);
Array = builder.Build();
-
}
public void Visit(StructType type)
{
- StringArray.Builder stringBuilder = new StringArray.Builder();
- for (int i = 0; i < Length; i++)
+ IArrowArray[] childArrays = new IArrowArray[type.Fields.Count];
+ for (int i = 0; i < childArrays.Length; i++)
{
- stringBuilder.Append(i.ToString());
+ childArrays[i] = CreateArray(type.Fields[i], Length);
}
- StringArray stringArray = stringBuilder.Build();
- Int32Array.Builder intBuilder = new Int32Array.Builder();
- for (int i = 0; i < Length; i++)
- {
- intBuilder.Append(i);
- }
- Int32Array intArray = intBuilder.Build();
-
- List<Array> arrays = new List<Array>();
- arrays.Add(stringArray);
- arrays.Add(intArray);
ArrowBuffer.BitmapBuilder nullBitmap = new
ArrowBuffer.BitmapBuilder();
for (int i = 0; i < Length; i++)
@@ -228,7 +221,7 @@ namespace Apache.Arrow.Tests
nullBitmap.Append(true);
}
- Array = new StructArray(type, Length, arrays,
nullBitmap.Build());
+ Array = new StructArray(type, Length, childArrays,
nullBitmap.Build());
}
private void GenerateArray<T, TArray,
TArrayBuilder>(IArrowArrayBuilder<T, TArray, TArrayBuilder> builder, Func<int,
T> generator)