This is an automated email from the ASF dual-hosted git repository.
curth pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new c87faf82a4 GH-36588: [C#] Support blank column names and enable more
integration tests. (#39167)
c87faf82a4 is described below
commit c87faf82a410a71751fdf85391c0dca20088b7d6
Author: Curt Hagenlocher <[email protected]>
AuthorDate: Sun Dec 10 18:01:35 2023 -0800
GH-36588: [C#] Support blank column names and enable more integration
tests. (#39167)
### What changes are included in this PR?
Allows field names to be blank (but not null).
Enables schema metadata to be read from JSON integration tests.
Enables integration tests for cases that are now working.
### Are these changes tested?
Yes.
* Closes: #36588
Authored-by: Curt Hagenlocher <[email protected]>
Signed-off-by: Curt Hagenlocher <[email protected]>
---
csharp/src/Apache.Arrow/Field.Builder.cs | 3 +-
csharp/src/Apache.Arrow/Field.cs | 11 +++--
csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs | 9 ++--
csharp/src/Apache.Arrow/Types/MapType.cs | 48 ++++++++++++++++++++--
.../test/Apache.Arrow.IntegrationTest/JsonFile.cs | 6 +++
.../CDataInterfacePythonTests.cs | 44 ++++++++++++++++++++
dev/archery/archery/integration/datagen.py | 6 +--
7 files changed, 105 insertions(+), 22 deletions(-)
diff --git a/csharp/src/Apache.Arrow/Field.Builder.cs
b/csharp/src/Apache.Arrow/Field.Builder.cs
index 1e7aa192e0..f7f33383e2 100644
--- a/csharp/src/Apache.Arrow/Field.Builder.cs
+++ b/csharp/src/Apache.Arrow/Field.Builder.cs
@@ -30,14 +30,13 @@ namespace Apache.Arrow
public Builder()
{
- _name = string.Empty;
_type = NullType.Default;
_nullable = true;
}
public Builder Name(string value)
{
- if (string.IsNullOrWhiteSpace(value))
+ if (value == null)
{
throw new ArgumentNullException(nameof(value));
}
diff --git a/csharp/src/Apache.Arrow/Field.cs b/csharp/src/Apache.Arrow/Field.cs
index 4fddd1bc4e..ac3cafac93 100644
--- a/csharp/src/Apache.Arrow/Field.cs
+++ b/csharp/src/Apache.Arrow/Field.cs
@@ -35,24 +35,23 @@ namespace Apache.Arrow
public Field(string name, IArrowType dataType, bool nullable,
IEnumerable<KeyValuePair<string, string>> metadata = default)
- : this(name, dataType, nullable, false)
+ : this(name, dataType, nullable)
{
Metadata = metadata?.ToDictionary(kv => kv.Key, kv => kv.Value);
-
}
internal Field(string name, IArrowType dataType, bool nullable,
- IReadOnlyDictionary<string, string> metadata, bool
copyCollections, bool allowBlankName)
- : this(name, dataType, nullable, allowBlankName)
+ IReadOnlyDictionary<string, string> metadata, bool copyCollections)
+ : this(name, dataType, nullable)
{
Debug.Assert(copyCollections == false, "This internal constructor
is to not copy the collections.");
Metadata = metadata;
}
- private Field(string name, IArrowType dataType, bool nullable, bool
allowBlankName)
+ private Field(string name, IArrowType dataType, bool nullable)
{
- if (name == null || (!allowBlankName &&
string.IsNullOrWhiteSpace(name)))
+ if (name == null)
{
throw new ArgumentNullException(nameof(name));
}
diff --git a/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs
b/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs
index 3f504cf3b9..633554fc53 100644
--- a/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs
+++ b/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs
@@ -59,7 +59,7 @@ namespace Apache.Arrow.Ipc
for (int i = 0; i < schema.FieldsLength; i++)
{
Flatbuf.Field field = schema.Fields(i).GetValueOrDefault();
- fields.Add(FieldFromFlatbuffer(field, ref dictionaryMemo,
allowBlankName: false));
+ fields.Add(FieldFromFlatbuffer(field, ref dictionaryMemo));
}
Dictionary<string, string> metadata = schema.CustomMetadataLength
> 0 ? new Dictionary<string, string>() : null;
@@ -73,14 +73,13 @@ namespace Apache.Arrow.Ipc
return new Schema(fields, metadata, copyCollections: false);
}
- private static Field FieldFromFlatbuffer(Flatbuf.Field flatbufField,
ref DictionaryMemo dictionaryMemo, bool allowBlankName)
+ private static Field FieldFromFlatbuffer(Flatbuf.Field flatbufField,
ref DictionaryMemo dictionaryMemo)
{
- bool allowBlankNameChild = flatbufField.ChildrenLength == 1 &&
flatbufField.TypeType == Flatbuf.Type.FixedSizeList;
Field[] childFields = flatbufField.ChildrenLength > 0 ? new
Field[flatbufField.ChildrenLength] : null;
for (int i = 0; i < flatbufField.ChildrenLength; i++)
{
Flatbuf.Field? childFlatbufField = flatbufField.Children(i);
- childFields[i] = FieldFromFlatbuffer(childFlatbufField.Value,
ref dictionaryMemo, allowBlankNameChild);
+ childFields[i] = FieldFromFlatbuffer(childFlatbufField.Value,
ref dictionaryMemo);
}
Flatbuf.DictionaryEncoding? dictionaryEncoding =
flatbufField.Dictionary;
@@ -104,7 +103,7 @@ namespace Apache.Arrow.Ipc
metadata[keyValue.Key] = keyValue.Value;
}
- var arrowField = new Field(flatbufField.Name, type,
flatbufField.Nullable, metadata, copyCollections: false, allowBlankName);
+ var arrowField = new Field(flatbufField.Name, type,
flatbufField.Nullable, metadata, copyCollections: false);
if (dictionaryEncoding.HasValue)
{
diff --git a/csharp/src/Apache.Arrow/Types/MapType.cs
b/csharp/src/Apache.Arrow/Types/MapType.cs
index 73112c815b..47e0be48f9 100644
--- a/csharp/src/Apache.Arrow/Types/MapType.cs
+++ b/csharp/src/Apache.Arrow/Types/MapType.cs
@@ -13,12 +13,17 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+using System;
using System.Collections.Generic;
namespace Apache.Arrow.Types
{
public sealed class MapType : NestedType // MapType =
ListType(StructType("key", "value"))
{
+ private const string EntriesKey = "entries";
+ private const string KeyKey = "key";
+ private const string ValueKey = "value";
+
public override ArrowTypeId TypeId => ArrowTypeId.Map;
public override string Name => "map";
public readonly bool KeySorted;
@@ -28,20 +33,23 @@ namespace Apache.Arrow.Types
public Field ValueField => KeyValueType.Fields[1];
public MapType(IArrowType key, IArrowType value, bool nullable = true,
bool keySorted = false)
- : this(new Field("key", key, false), new Field("value", value,
nullable), keySorted)
+ : base(Entries(key, value, nullable))
{
+ KeySorted = keySorted;
}
public MapType(Field key, Field value, bool keySorted = false)
- : this(new StructType(new List<Field>() { key, value }), keySorted)
+ : base(Entries(key, value))
{
+ KeySorted = keySorted;
}
- public MapType(StructType entries, bool keySorted = false) : this(new
Field("entries", entries, false), keySorted)
+ public MapType(StructType entries, bool keySorted = false) :
base(Entries(entries))
{
+ KeySorted = keySorted;
}
- public MapType(Field entries, bool keySorted = false) : base(entries)
+ public MapType(Field entries, bool keySorted = false) :
base(Entries(entries))
{
KeySorted = keySorted;
}
@@ -54,5 +62,37 @@ namespace Apache.Arrow.Types
return new MapType(Fields[0], keySorted: false);
}
+
+ private static Field Entries(IArrowType key, IArrowType value, bool
nullable) =>
+ new Field(EntriesKey, NewStruct(new Field(KeyKey, key, false), new
Field(ValueKey, value, nullable)), false);
+
+ private static Field Entries(Field key, Field value) =>
+ new Field(EntriesKey, NewStruct(NamedField(KeyKey, key),
NamedField(ValueKey, value)), false);
+
+ private static Field Entries(StructType entries)
+ {
+ return new Field(EntriesKey, Struct(entries), false);
+ }
+
+ private static StructType NewStruct(Field key, Field value) => new
StructType(new[] { key, value });
+
+ private static StructType Struct(StructType entries)
+ {
+ Field key = NamedField(KeyKey, entries.Fields[0]);
+ Field value = NamedField(ValueKey, entries.Fields[1]);
+ return object.ReferenceEquals(key, entries.Fields[0]) &&
object.ReferenceEquals(value, entries.Fields[1]) ? entries : NewStruct(key,
value);
+ }
+
+ private static Field Entries(Field entries)
+ {
+ StructType structType = (StructType)entries.DataType;
+ StructType adjustedStruct = Struct(structType);
+ return StringComparer.Ordinal.Equals(entries.Name, EntriesKey) &&
object.ReferenceEquals(structType, adjustedStruct) ? entries : new
Field(EntriesKey, adjustedStruct, false);
+ }
+
+ private static Field NamedField(string name, Field field)
+ {
+ return StringComparer.Ordinal.Equals(name, field.Name) ? field :
new Field(name, field.DataType, field.IsNullable, field.Metadata);
+ }
}
}
diff --git a/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs
b/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs
index 51bcf6dd75..f3fe73588a 100644
--- a/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs
+++ b/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs
@@ -120,6 +120,12 @@ namespace Apache.Arrow.IntegrationTest
{
builder.Field(f => CreateField(f, jsonSchema.Fields[i],
dictionaryIndexes));
}
+
+ if (jsonSchema.Metadata != null)
+ {
+ builder.Metadata(jsonSchema.Metadata);
+ }
+
return builder.Build();
}
diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs
b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs
index a3b53a40db..83902d8d93 100644
--- a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs
+++ b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs
@@ -829,6 +829,50 @@ namespace Apache.Arrow.Tests
CArrowArrayStream.Free(cArrayStream);
}
+ [SkippableFact]
+ public unsafe void ImportRecordBatchFromBuffer()
+ {
+ using (Py.GIL())
+ {
+ dynamic pa = Py.Import("pyarrow");
+ dynamic batch = pa.record_batch(
+ new PyList(new PyObject[]
+ {
+ pa.array(new long?[] { 1, 2, 3, null, 5 }),
+ pa.array(new[] { "hello", "world", null, "foo", "bar"
}),
+ pa.array(new[] { 0.0, 1.4, 2.5, 3.6, 4.7 }),
+ pa.array(new bool?[] { true, false, null, false, true
}),
+ }),
+ new[] { null, "", "column", "column" });
+
+ dynamic sink = pa.BufferOutputStream();
+ dynamic writer = pa.ipc.new_stream(sink, batch.schema);
+ writer.write_batch(batch);
+ ((IDisposable)writer).Dispose();
+
+ dynamic buf = sink.getvalue();
+ // buf.address, buf.size
+
+ IntPtr address = (IntPtr)(long)buf.address;
+ int size = buf.size;
+ byte[] buffer = new byte[size];
+ byte* ptr = (byte*)address.ToPointer();
+ for (int i = 0; i < size; i++)
+ {
+ buffer[i] = ptr[i];
+ }
+
+ using (ArrowStreamReader reader = new ArrowStreamReader(new
ReadOnlyMemory<byte>(buffer)))
+ {
+ RecordBatch batch2 = reader.ReadNextRecordBatch();
+ Assert.Equal("None", batch2.Schema.FieldsList[0].Name);
+ Assert.Equal("", batch2.Schema.FieldsList[1].Name);
+ Assert.Equal("column", batch2.Schema.FieldsList[2].Name);
+ Assert.Equal("column", batch2.Schema.FieldsList[3].Name);
+ }
+ }
+ }
+
private static PyObject List(params int?[] values)
{
return new PyList(values.Select(i => i == null ? PyObject.None :
new PyInt(i.Value)).ToArray());
diff --git a/dev/archery/archery/integration/datagen.py
b/dev/archery/archery/integration/datagen.py
index eca1effff6..29b203ae13 100644
--- a/dev/archery/archery/integration/datagen.py
+++ b/dev/archery/archery/integration/datagen.py
@@ -1812,7 +1812,6 @@ def get_generated_json_files(tempdir=None):
generate_map_case(),
generate_non_canonical_map_case()
- .skip_tester('C#')
.skip_tester('Java') # TODO(ARROW-8715)
# Canonical map names are restored on import, so the schemas are
unequal
.skip_format(SKIP_C_SCHEMA, 'C++'),
@@ -1827,11 +1826,9 @@ def get_generated_json_files(tempdir=None):
generate_unions_case(),
- generate_custom_metadata_case()
- .skip_tester('C#'),
+ generate_custom_metadata_case(),
generate_duplicate_fieldnames_case()
- .skip_tester('C#')
.skip_tester('JS'),
generate_dictionary_case(),
@@ -1856,7 +1853,6 @@ def get_generated_json_files(tempdir=None):
.skip_tester('Rust'),
generate_extension_case()
- .skip_tester('C#')
# TODO: ensure the extension is registered in the C++ entrypoint
.skip_format(SKIP_C_SCHEMA, 'C++')
.skip_format(SKIP_C_ARRAY, 'C++'),