This is an automated email from the ASF dual-hosted git repository.
bkietz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 3e3712a ARROW-8450: [Integration][C++] Implement large offsets types
3e3712a is described below
commit 3e3712a14a3242d70145fb9d3d6f0f4b8c374e68
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Apr 15 12:47:54 2020 -0400
ARROW-8450: [Integration][C++] Implement large offsets types
Implement integration tests for LargeList, LargeBinary and LargeString
types.
Enable them for C++ (only).
Also add tests for recursive nested types.
Closes #6934 from pitrou/ARROW-8450-integ-large-offset-types
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Benjamin Kietzman <[email protected]>
---
cpp/src/arrow/ipc/json_integration_test.cc | 22 +--
cpp/src/arrow/ipc/json_internal.cc | 62 +++++--
cpp/src/arrow/ipc/json_test.cc | 16 +-
dev/archery/archery/cli.py | 8 +-
dev/archery/archery/integration/datagen.py | 283 ++++++++++++++++++++---------
dev/archery/archery/integration/util.py | 25 ++-
6 files changed, 272 insertions(+), 144 deletions(-)
diff --git a/cpp/src/arrow/ipc/json_integration_test.cc
b/cpp/src/arrow/ipc/json_integration_test.cc
index f2eea29..4185cb3 100644
--- a/cpp/src/arrow/ipc/json_integration_test.cc
+++ b/cpp/src/arrow/ipc/json_integration_test.cc
@@ -250,24 +250,12 @@ static const char* JSON_EXAMPLE = R"example(
{
"name": "foo",
"type": {"name": "int", "isSigned": true, "bitWidth": 32},
- "nullable": true, "children": [],
- "typeLayout": {
- "vectors": [
- {"type": "VALIDITY", "typeBitWidth": 1},
- {"type": "DATA", "typeBitWidth": 32}
- ]
- }
+ "nullable": true, "children": []
},
{
"name": "bar",
"type": {"name": "floatingpoint", "precision": "DOUBLE"},
- "nullable": true, "children": [],
- "typeLayout": {
- "vectors": [
- {"type": "VALIDITY", "typeBitWidth": 1},
- {"type": "DATA", "typeBitWidth": 64}
- ]
- }
+ "nullable": true, "children": []
}
]
},
@@ -318,12 +306,6 @@ static const char* JSON_EXAMPLE2 = R"example(
"name": "foo",
"type": {"name": "int", "isSigned": true, "bitWidth": 32},
"nullable": true, "children": [],
- "typeLayout": {
- "vectors": [
- {"type": "VALIDITY", "typeBitWidth": 1},
- {"type": "DATA", "typeBitWidth": 32}
- ]
- },
"metadata": [
{"key": "converted_from_time32", "value": "true"}
]
diff --git a/cpp/src/arrow/ipc/json_internal.cc
b/cpp/src/arrow/ipc/json_internal.cc
index 133681c..3dfae2a 100644
--- a/cpp/src/arrow/ipc/json_internal.cc
+++ b/cpp/src/arrow/ipc/json_internal.cc
@@ -38,8 +38,10 @@
#include "arrow/util/bit_util.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
+#include "arrow/util/formatting.h"
#include "arrow/util/key_value_metadata.h"
#include "arrow/util/logging.h"
+#include "arrow/util/parsing.h"
#include "arrow/util/string.h"
#include "arrow/visitor_inline.h"
@@ -335,10 +337,8 @@ class SchemaWriter {
Status Visit(const TimeType& type) { return WritePrimitive("time", type); }
Status Visit(const StringType& type) { return WriteVarBytes("utf8", type); }
Status Visit(const BinaryType& type) { return WriteVarBytes("binary", type);
}
- Status Visit(const LargeStringType& type) { return
WriteVarBytes("large_utf8", type); }
- Status Visit(const LargeBinaryType& type) {
- return WriteVarBytes("large_binary", type);
- }
+ Status Visit(const LargeStringType& type) { return
WriteVarBytes("largeutf8", type); }
+ Status Visit(const LargeBinaryType& type) { return
WriteVarBytes("largebinary", type); }
Status Visit(const FixedSizeBinaryType& type) {
return WritePrimitive("fixedsizebinary", type);
}
@@ -358,7 +358,7 @@ class SchemaWriter {
}
Status Visit(const LargeListType& type) {
- WriteName("large_list", type);
+ WriteName("largelist", type);
return Status::OK();
}
@@ -525,8 +525,21 @@ class ArrayWriter {
void WriteIntegerField(const char* name, const T* values, int64_t length) {
writer_->Key(name);
writer_->StartArray();
- for (int i = 0; i < length; ++i) {
- writer_->Int64(values[i]);
+ if (sizeof(T) < sizeof(int64_t)) {
+ for (int i = 0; i < length; ++i) {
+ writer_->Int64(values[i]);
+ }
+ } else {
+ // Represent 64-bit integers as strings, as JSON numbers cannot represent
+ // them exactly.
+ ::arrow::internal::StringFormatter<typename CTypeTraits<T>::ArrowType>
formatter;
+ auto append = [this](util::string_view v) {
+ writer_->String(v.data(), static_cast<rj::SizeType>(v.size()));
+ return Status::OK();
+ };
+ for (int i = 0; i < length; ++i) {
+ DCHECK_OK(formatter(values[i], append));
+ }
}
writer_->EndArray();
}
@@ -932,9 +945,9 @@ static Status GetType(const RjObject& json_type,
*type = utf8();
} else if (type_name == "binary") {
*type = binary();
- } else if (type_name == "large_utf8") {
+ } else if (type_name == "largeutf8") {
*type = large_utf8();
- } else if (type_name == "large_binary") {
+ } else if (type_name == "largebinary") {
*type = large_binary();
} else if (type_name == "fixedsizebinary") {
return GetFixedSizeBinary(json_type, type);
@@ -957,7 +970,7 @@ static Status GetType(const RjObject& json_type,
return Status::Invalid("List must have exactly one child");
}
*type = list(children[0]);
- } else if (type_name == "large_list") {
+ } else if (type_name == "largelist") {
if (children.size() != 1) {
return Status::Invalid("Large list must have exactly one child");
}
@@ -1299,13 +1312,28 @@ class ArrayReader {
ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(length * sizeof(T),
pool_));
T* values = reinterpret_cast<T*>(buffer->mutable_data());
- for (int i = 0; i < length; ++i) {
- const rj::Value& val = json_array[i];
- DCHECK(val.IsInt() || val.IsInt64());
- if (val.IsInt()) {
- values[i] = static_cast<T>(val.GetInt());
- } else {
- values[i] = static_cast<T>(val.GetInt64());
+ if (sizeof(T) < sizeof(int64_t)) {
+ for (int i = 0; i < length; ++i) {
+ const rj::Value& val = json_array[i];
+ DCHECK(val.IsInt() || val.IsInt64());
+ if (val.IsInt()) {
+ values[i] = static_cast<T>(val.GetInt());
+ } else {
+ values[i] = static_cast<T>(val.GetInt64());
+ }
+ }
+ } else {
+ // Read 64-bit integers as strings, as JSON numbers cannot represent
+ // them exactly.
+ ::arrow::internal::StringConverter<typename CTypeTraits<T>::ArrowType>
converter;
+ for (int i = 0; i < length; ++i) {
+ const rj::Value& val = json_array[i];
+ DCHECK(val.IsString());
+ if (!converter(val.GetString(), val.GetStringLength(), &values[i])) {
+ return Status::Invalid("Failed to parse integer: '",
+ std::string(val.GetString(),
val.GetStringLength()),
+ "'");
+ }
}
}
diff --git a/cpp/src/arrow/ipc/json_test.cc b/cpp/src/arrow/ipc/json_test.cc
index bfc2fab..21a695a 100644
--- a/cpp/src/arrow/ipc/json_test.cc
+++ b/cpp/src/arrow/ipc/json_test.cc
@@ -337,24 +337,12 @@ TEST(TestJsonFileReadWrite, MinimalFormatExample) {
{
"name": "foo",
"type": {"name": "int", "isSigned": true, "bitWidth": 32},
- "nullable": true, "children": [],
- "typeLayout": {
- "vectors": [
- {"type": "VALIDITY", "typeBitWidth": 1},
- {"type": "DATA", "typeBitWidth": 32}
- ]
- }
+ "nullable": true, "children": []
},
{
"name": "bar",
"type": {"name": "floatingpoint", "precision": "DOUBLE"},
- "nullable": true, "children": [],
- "typeLayout": {
- "vectors": [
- {"type": "VALIDITY", "typeBitWidth": 1},
- {"type": "DATA", "typeBitWidth": 64}
- ]
- }
+ "nullable": true, "children": []
}
]
},
diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index 6c07e05..c8af1b2 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -346,7 +346,7 @@ def benchmark_list(ctx, rev_or_path, src, preserve, output,
cmake_extras,
logger.debug(f"Running benchmark {rev_or_path}")
conf = CppBenchmarkRunner.default_configuration(
- cmake_extras=cmake_extras, **kwargs)
+ cmake_extras=cmake_extras, **kwargs)
runner_base = BenchmarkRunner.from_rev_or_path(
src, root, rev_or_path, conf)
@@ -399,7 +399,7 @@ def benchmark_run(ctx, rev_or_path, src, preserve, output,
cmake_extras,
logger.debug(f"Running benchmark {rev_or_path}")
conf = CppBenchmarkRunner.default_configuration(
- cmake_extras=cmake_extras, **kwargs)
+ cmake_extras=cmake_extras, **kwargs)
runner_base = BenchmarkRunner.from_rev_or_path(
src, root, rev_or_path, conf,
@@ -497,7 +497,7 @@ def benchmark_diff(ctx, src, preserve, output, cmake_extras,
f"{baseline} (baseline)")
conf = CppBenchmarkRunner.default_configuration(
- cmake_extras=cmake_extras, **kwargs)
+ cmake_extras=cmake_extras, **kwargs)
runner_cont = BenchmarkRunner.from_rev_or_path(
src, root, contender, conf,
@@ -551,7 +551,7 @@ def _set_default(opt, default):
@click.option('stop_on_error', '-x', '--stop-on-error',
is_flag=True, default=False,
help='Stop on first error')
-@click.option('--gold_dirs', multiple=True,
+@click.option('--gold-dirs', multiple=True,
help="gold integration test file paths")
@click.option('-k', '--match',
help=("Substring for test names to include in run, "
diff --git a/dev/archery/archery/integration/datagen.py
b/dev/archery/archery/integration/datagen.py
index 0f81acc..3c15abc 100644
--- a/dev/archery/archery/integration/datagen.py
+++ b/dev/archery/archery/integration/datagen.py
@@ -24,12 +24,13 @@ import tempfile
import numpy as np
-from .util import (frombytes, rands, tobytes, SKIP_ARROW, SKIP_FLIGHT)
+from .util import (frombytes, tobytes, random_bytes, random_utf8,
+ SKIP_ARROW, SKIP_FLIGHT)
class Field(object):
- def __init__(self, name, nullable=True, metadata=[]):
+ def __init__(self, name, *, nullable=True, metadata=[]):
self.name = name
self.nullable = nullable
self.metadata = metadata
@@ -132,7 +133,7 @@ TEST_INT_MIN = ~TEST_INT_MAX
class IntegerField(PrimitiveField):
- def __init__(self, name, is_signed, bit_width, nullable=True,
+ def __init__(self, name, is_signed, bit_width, *, nullable=True,
metadata=[],
min_value=TEST_INT_MIN,
max_value=TEST_INT_MAX):
@@ -188,7 +189,7 @@ class DateField(IntegerField):
MILLISECOND: [-62135596800000, 253402214400000]
}
- def __init__(self, name, unit, nullable=True, metadata=[]):
+ def __init__(self, name, unit, *, nullable=True, metadata=[]):
bit_width = 32 if unit == self.DAY else 64
min_value, max_value = self._ranges[unit]
@@ -230,7 +231,7 @@ class TimeField(IntegerField):
'ns': [0, 86400000000000]
}
- def __init__(self, name, unit='s', nullable=True,
+ def __init__(self, name, unit='s', *, nullable=True,
metadata=[]):
min_val, max_val = self._ranges[unit]
super(TimeField, self).__init__(name, True, self.BIT_WIDTHS[unit],
@@ -258,7 +259,7 @@ class TimestampField(IntegerField):
'ns': [np.iinfo('int64').min, np.iinfo('int64').max]
}
- def __init__(self, name, unit='s', tz=None, nullable=True,
+ def __init__(self, name, unit='s', tz=None, *, nullable=True,
metadata=[]):
min_val, max_val = self._ranges[unit]
super(TimestampField, self).__init__(name, True, 64,
@@ -283,13 +284,13 @@ class TimestampField(IntegerField):
class DurationIntervalField(IntegerField):
- def __init__(self, name, unit='s', nullable=True,
+ def __init__(self, name, unit='s', *, nullable=True,
metadata=[]):
min_val, max_val = np.iinfo('int64').min, np.iinfo('int64').max,
super(DurationIntervalField, self).__init__(
- name, True, 64,
- nullable=nullable, metadata=metadata,
- min_value=min_val, max_value=max_val)
+ name, True, 64,
+ nullable=nullable, metadata=metadata,
+ min_value=min_val, max_value=max_val)
self.unit = unit
def _get_type(self):
@@ -302,12 +303,12 @@ class DurationIntervalField(IntegerField):
class YearMonthIntervalField(IntegerField):
- def __init__(self, name, nullable=True, metadata=[]):
+ def __init__(self, name, *, nullable=True, metadata=[]):
min_val, max_val = [-10000*12, 10000*12] # +/- 10000 years.
super(YearMonthIntervalField, self).__init__(
- name, True, 32,
- nullable=nullable, metadata=metadata,
- min_value=min_val, max_value=max_val)
+ name, True, 32,
+ nullable=nullable, metadata=metadata,
+ min_value=min_val, max_value=max_val)
def _get_type(self):
fields = [
@@ -319,7 +320,7 @@ class YearMonthIntervalField(IntegerField):
class DayTimeIntervalField(PrimitiveField):
- def __init__(self, name, nullable=True, metadata=[]):
+ def __init__(self, name, *, nullable=True, metadata=[]):
super(DayTimeIntervalField, self).__init__(name,
nullable=True,
metadata=metadata)
@@ -349,7 +350,7 @@ class DayTimeIntervalField(PrimitiveField):
class FloatingPointField(PrimitiveField):
- def __init__(self, name, bit_width, nullable=True,
+ def __init__(self, name, bit_width, *, nullable=True,
metadata=[]):
super(FloatingPointField, self).__init__(name,
nullable=nullable,
@@ -401,8 +402,8 @@ def decimal_range_from_precision(precision):
class DecimalField(PrimitiveField):
- def __init__(self, name, precision, scale, bit_width=128, nullable=True,
- metadata=[]):
+ def __init__(self, name, precision, scale, bit_width=128, *,
+ nullable=True, metadata=[]):
super(DecimalField, self).__init__(name, nullable=True,
metadata=metadata)
self.precision = precision
@@ -458,7 +459,13 @@ class BooleanField(PrimitiveField):
return PrimitiveColumn(name, size, is_valid, values)
-class BinaryField(PrimitiveField):
+class FixedSizeBinaryField(PrimitiveField):
+
+ def __init__(self, name, byte_width, *, nullable=True,
+ metadata=[]):
+ super(FixedSizeBinaryField, self).__init__(name, nullable=nullable,
+ metadata=metadata)
+ self.byte_width = byte_width
@property
def numpy_type(self):
@@ -466,37 +473,25 @@ class BinaryField(PrimitiveField):
@property
def column_class(self):
- return BinaryColumn
+ return FixedSizeBinaryColumn
def _get_type(self):
- return OrderedDict([('name', 'binary')])
+ return OrderedDict([('name', 'fixedsizebinary'),
+ ('byteWidth', self.byte_width)])
def generate_column(self, size, name=None):
- K = 7
is_valid = self._make_is_valid(size)
values = []
for i in range(size):
- if is_valid[i]:
- draw = (np.random.randint(0, 255, size=K)
- .astype(np.uint8)
- .tostring())
- values.append(draw)
- else:
- values.append(b"")
+ values.append(random_bytes(self.byte_width))
if name is None:
name = self.name
return self.column_class(name, size, is_valid, values)
-class FixedSizeBinaryField(PrimitiveField):
-
- def __init__(self, name, byte_width, nullable=True,
- metadata=[]):
- super(FixedSizeBinaryField, self).__init__(name, nullable=nullable,
- metadata=metadata)
- self.byte_width = byte_width
+class BinaryField(PrimitiveField):
@property
def numpy_type(self):
@@ -504,29 +499,25 @@ class FixedSizeBinaryField(PrimitiveField):
@property
def column_class(self):
- return FixedSizeBinaryColumn
+ return BinaryColumn
def _get_type(self):
- return OrderedDict([('name', 'fixedsizebinary'),
- ('byteWidth', self.byte_width)])
+ return OrderedDict([('name', 'binary')])
- def _get_type_layout(self):
- return OrderedDict([
- ('vectors',
- [OrderedDict([('type', 'VALIDITY'),
- ('typeBitWidth', 1)]),
- OrderedDict([('type', 'DATA'),
- ('typeBitWidth', self.byte_width)])])])
+ def _random_sizes(self, size):
+ return np.random.exponential(scale=4, size=size).astype(np.int32)
def generate_column(self, size, name=None):
is_valid = self._make_is_valid(size)
values = []
- for i in range(size):
- draw = (np.random.randint(0, 255, size=self.byte_width)
- .astype(np.uint8)
- .tostring())
- values.append(draw)
+ sizes = self._random_sizes(size)
+
+ for i, nbytes in enumerate(sizes):
+ if is_valid[i]:
+ values.append(random_bytes(nbytes))
+ else:
+ values.append(b"")
if name is None:
name = self.name
@@ -549,7 +540,7 @@ class StringField(BinaryField):
for i in range(size):
if is_valid[i]:
- values.append(tobytes(rands(K)))
+ values.append(tobytes(random_utf8(K)))
else:
values.append(b"")
@@ -558,6 +549,26 @@ class StringField(BinaryField):
return self.column_class(name, size, is_valid, values)
+class LargeBinaryField(BinaryField):
+
+ @property
+ def column_class(self):
+ return LargeBinaryColumn
+
+ def _get_type(self):
+ return OrderedDict([('name', 'largebinary')])
+
+
+class LargeStringField(StringField):
+
+ @property
+ def column_class(self):
+ return LargeStringColumn
+
+ def _get_type(self):
+ return OrderedDict([('name', 'largeutf8')])
+
+
class Schema(object):
def __init__(self, fields, metadata=None):
@@ -575,7 +586,21 @@ class Schema(object):
return OrderedDict(entries)
-class BinaryColumn(PrimitiveColumn):
+class _NarrowOffsetsMixin:
+
+ def _encode_offsets(self, offsets):
+ return list(map(int, offsets))
+
+
+class _LargeOffsetsMixin:
+
+ def _encode_offsets(self, offsets):
+ # 64-bit offsets have to be represented as strings to roundtrip
+ # through JSON.
+ return list(map(str, offsets))
+
+
+class _BaseBinaryColumn(PrimitiveColumn):
def _encode_value(self, x):
return frombytes(binascii.hexlify(x).upper())
@@ -596,15 +621,37 @@ class BinaryColumn(PrimitiveColumn):
return [
('VALIDITY', [int(x) for x in self.is_valid]),
- ('OFFSET', offsets),
+ ('OFFSET', self._encode_offsets(offsets)),
('DATA', data)
]
+class _BaseStringColumn(_BaseBinaryColumn):
+
+ def _encode_value(self, x):
+ return frombytes(x)
+
+
+class BinaryColumn(_BaseBinaryColumn, _NarrowOffsetsMixin):
+ pass
+
+
+class StringColumn(_BaseStringColumn, _NarrowOffsetsMixin):
+ pass
+
+
+class LargeBinaryColumn(_BaseBinaryColumn, _LargeOffsetsMixin):
+ pass
+
+
+class LargeStringColumn(_BaseStringColumn, _LargeOffsetsMixin):
+ pass
+
+
class FixedSizeBinaryColumn(PrimitiveColumn):
def _encode_value(self, x):
- return ''.join('{:02x}'.format(c).upper() for c in x)
+ return frombytes(binascii.hexlify(x).upper())
def _get_buffers(self):
data = []
@@ -617,20 +664,18 @@ class FixedSizeBinaryColumn(PrimitiveColumn):
]
-class StringColumn(BinaryColumn):
-
- def _encode_value(self, x):
- return frombytes(x)
-
-
class ListField(Field):
- def __init__(self, name, value_field, nullable=True,
+ def __init__(self, name, value_field, *, nullable=True,
metadata=[]):
super(ListField, self).__init__(name, nullable=nullable,
metadata=metadata)
self.value_field = value_field
+ @property
+ def column_class(self):
+ return ListColumn
+
def _get_type(self):
return OrderedDict([
('name', 'list')
@@ -657,13 +702,25 @@ class ListField(Field):
if name is None:
name = self.name
- return ListColumn(name, size, is_valid, offsets, values)
+ return self.column_class(name, size, is_valid, offsets, values)
+
+class LargeListField(ListField):
-class ListColumn(Column):
+ @property
+ def column_class(self):
+ return LargeListColumn
+
+ def _get_type(self):
+ return OrderedDict([
+ ('name', 'largelist')
+ ])
+
+
+class _BaseListColumn(Column):
def __init__(self, name, count, is_valid, offsets, values):
- super(ListColumn, self).__init__(name, count)
+ super().__init__(name, count)
self.is_valid = is_valid
self.offsets = offsets
self.values = values
@@ -671,31 +728,39 @@ class ListColumn(Column):
def _get_buffers(self):
return [
('VALIDITY', [int(v) for v in self.is_valid]),
- ('OFFSET', list(self.offsets))
+ ('OFFSET', self._encode_offsets(self.offsets))
]
def _get_children(self):
return [self.values.get_json()]
+class ListColumn(_BaseListColumn, _NarrowOffsetsMixin):
+ pass
+
+
+class LargeListColumn(_BaseListColumn, _LargeOffsetsMixin):
+ pass
+
+
class MapField(Field):
- def __init__(self, name, key_field, item_field, nullable=True,
- metadata=[], keysSorted=False):
+ def __init__(self, name, key_field, item_field, *, nullable=True,
+ metadata=[], keys_sorted=False):
super(MapField, self).__init__(name, nullable=nullable,
metadata=metadata)
assert not key_field.nullable
self.key_field = key_field
self.item_field = item_field
- self.pair_field = StructField(
- 'entries', [key_field, item_field], False)
- self.keysSorted = keysSorted
+ self.pair_field = StructField('entries', [key_field, item_field],
+ nullable=False)
+ self.keys_sorted = keys_sorted
def _get_type(self):
return OrderedDict([
('name', 'map'),
- ('keysSorted', self.keysSorted)
+ ('keysSorted', self.keys_sorted)
])
def _get_children(self):
@@ -742,7 +807,7 @@ class MapColumn(Column):
class FixedSizeListField(Field):
- def __init__(self, name, value_field, list_size, nullable=True,
+ def __init__(self, name, value_field, list_size, *, nullable=True,
metadata=[]):
super(FixedSizeListField, self).__init__(name, nullable=nullable,
metadata=metadata)
@@ -785,7 +850,7 @@ class FixedSizeListColumn(Column):
class StructField(Field):
- def __init__(self, name, fields, nullable=True,
+ def __init__(self, name, fields, *, nullable=True,
metadata=[]):
super(StructField, self).__init__(name, nullable=nullable,
metadata=metadata)
@@ -829,7 +894,7 @@ class Dictionary(object):
class DictionaryField(Field):
- def __init__(self, name, index_field, dictionary, nullable=True,
+ def __init__(self, name, index_field, dictionary, *, nullable=True,
metadata=[]):
super(DictionaryField, self).__init__(name, nullable=nullable,
metadata=metadata)
@@ -935,6 +1000,10 @@ def get_field(name, type_, **kwargs):
return BinaryField(name, **kwargs)
elif type_ == 'utf8':
return StringField(name, **kwargs)
+ elif type_ == 'largebinary':
+ return LargeBinaryField(name, **kwargs)
+ elif type_ == 'largeutf8':
+ return LargeStringField(name, **kwargs)
elif type_.startswith('fixedsizebinary_'):
byte_width = int(type_.split('_')[1])
return FixedSizeBinaryField(name, byte_width=byte_width, **kwargs)
@@ -1020,6 +1089,18 @@ def generate_primitive_case(batch_sizes,
name='primitive'):
return _generate_file(name, fields, batch_sizes)
+def generate_primitive_large_offsets_case(batch_sizes):
+ types = ['largebinary', 'largeutf8']
+
+ fields = []
+
+ for type_ in types:
+ fields.append(get_field(type_ + "_nullable", type_, nullable=True))
+ fields.append(get_field(type_ + "_nonnullable", type_, nullable=False))
+
+ return _generate_file('primitive_large_offsets', fields, batch_sizes)
+
+
def generate_null_case(batch_sizes):
# Interleave null with non-null types to ensure the appropriate number of
# buffers (0) is read and written
@@ -1106,15 +1187,42 @@ def generate_nested_case():
get_field('item', 'int32'), 4),
StructField('struct_nullable', [get_field('f1', 'int32'),
get_field('f2', 'utf8')]),
-
- # TODO(wesm): this causes segfault
- # ListField('list_nonnullable', get_field('item', 'int32'), False),
+ # Fails on Go (ARROW-8452)
+ # ListField('list_nonnullable', get_field('item', 'int32'),
+ # nullable=False),
]
batch_sizes = [7, 10]
return _generate_file("nested", fields, batch_sizes)
+def generate_recursive_nested_case():
+ fields = [
+ ListField('lists_list',
+ ListField('inner_list', get_field('item', 'int16'))),
+ ListField('structs_list',
+ StructField('inner_struct',
+ [get_field('f1', 'int32'),
+ get_field('f2', 'utf8')])),
+ ]
+
+ batch_sizes = [7, 10]
+ return _generate_file("recursive_nested", fields, batch_sizes)
+
+
+def generate_nested_large_offsets_case():
+ fields = [
+ LargeListField('large_list_nullable', get_field('item', 'int32')),
+ LargeListField('large_list_nonnullable',
+ get_field('item', 'int32'), nullable=False),
+ LargeListField('large_list_nested',
+ ListField('inner_list', get_field('item', 'int16'))),
+ ]
+
+ batch_sizes = [0, 13]
+ return _generate_file("nested_large_offsets", fields, batch_sizes)
+
+
def generate_dictionary_case():
dict0 = Dictionary(0, StringField('dictionary1'), size=10, name='DICT0')
dict1 = Dictionary(1, StringField('dictionary1'), size=5, name='DICT1')
@@ -1140,9 +1248,9 @@ def generate_nested_dictionary_case():
dict1 = Dictionary(1, list_of_dict, size=30, name='DICT1')
struct_of_dict = StructField('struct', [
- DictionaryField('str_dict_a', get_field('', 'int8'), dict0),
- DictionaryField('str_dict_b', get_field('', 'int8'), dict0)
- ])
+ DictionaryField('str_dict_a', get_field('', 'int8'), dict0),
+ DictionaryField('str_dict_b', get_field('', 'int8'), dict0)
+ ])
dict2 = Dictionary(2, struct_of_dict, size=30, name='DICT2')
fields = [
@@ -1166,6 +1274,11 @@ def get_generated_json_files(tempdir=None, flight=False):
generate_primitive_case([17, 20], name='primitive'),
generate_primitive_case([0, 0, 0], name='primitive_zerolength'),
+ generate_primitive_large_offsets_case([17, 20])
+ .skip_category('Go')
+ .skip_category('Java') # TODO(ARROW-6110)
+ .skip_category('JS'),
+
generate_null_case([10, 0])
.skip_category('JS') # TODO(ARROW-7900)
.skip_category('Go'), # TODO(ARROW-7901)
@@ -1187,6 +1300,14 @@ def get_generated_json_files(tempdir=None, flight=False):
generate_nested_case(),
+ # TODO(ARROW-8453)
+ generate_recursive_nested_case().skip_category('Go'),
+
+ generate_nested_large_offsets_case()
+ .skip_category('Go')
+ .skip_category('Java') # TODO(ARROW-6111)
+ .skip_category('JS'),
+
generate_custom_metadata_case().skip_category('Go')
.skip_category('Java')
.skip_category('JS'),
diff --git a/dev/archery/archery/integration/util.py
b/dev/archery/archery/integration/util.py
index e3f2542..a4c4982 100644
--- a/dev/archery/archery/integration/util.py
+++ b/dev/archery/archery/integration/util.py
@@ -18,8 +18,8 @@
import contextlib
import io
import os
+import random
import socket
-import string
import subprocess
import sys
import threading
@@ -32,9 +32,6 @@ def guid():
return uuid.uuid4().hex
-RANDS_CHARS = np.array(list(string.ascii_letters + string.digits),
- dtype=(np.str_, 1))
-
# SKIP categories
SKIP_ARROW = 'arrow'
SKIP_FLIGHT = 'flight'
@@ -100,14 +97,26 @@ printer = _Printer()
log = printer.print
-def rands(nchars):
+_RAND_CHARS = np.array(list("abcdefghijklmnop123456Ârrôwµ£°€矢"), dtype="U")
+
+
+def random_utf8(nchars):
"""
- Generate one random byte string.
+ Generate one random UTF8 string.
+ """
+ return ''.join(np.random.choice(_RAND_CHARS, nchars))
- See `rands_array` if you want to create an array of random strings.
+def random_bytes(nbytes):
+ """
+ Generate one random binary string.
"""
- return ''.join(np.random.choice(RANDS_CHARS, nchars))
+ # NOTE getrandbits(0) fails
+ if nbytes > 0:
+ return random.getrandbits(nbytes * 8).to_bytes(nbytes,
+ byteorder='little')
+ else:
+ return b""
def tobytes(o):