[arrow] branch master updated: ARROW-8450: [Integration][C++] Implement large offsets types

bkietz Wed, 15 Apr 2020 09:48:27 -0700

This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/master by this push:
     new 3e3712a  ARROW-8450: [Integration][C++] Implement large offsets types
3e3712a is described below

commit 3e3712a14a3242d70145fb9d3d6f0f4b8c374e68
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Apr 15 12:47:54 2020 -0400

    ARROW-8450: [Integration][C++] Implement large offsets types
    
    Implement integration tests for LargeList, LargeBinary and LargeString types.
    Enable them for C++ (only).
    
    Also add tests for recursive nested types.
    
    Closes #6934 from pitrou/ARROW-8450-integ-large-offset-types
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Benjamin Kietzman <[email protected]>
---
 cpp/src/arrow/ipc/json_integration_test.cc |  22 +--
 cpp/src/arrow/ipc/json_internal.cc         |  62 +++++--
 cpp/src/arrow/ipc/json_test.cc             |  16 +-
 dev/archery/archery/cli.py                 |   8 +-
 dev/archery/archery/integration/datagen.py | 283 ++++++++++++++++++++---------
 dev/archery/archery/integration/util.py    |  25 ++-
 6 files changed, 272 insertions(+), 144 deletions(-)

diff --git a/cpp/src/arrow/ipc/json_integration_test.cc b/cpp/src/arrow/ipc/json_integration_test.cc
index f2eea29..4185cb3 100644
--- a/cpp/src/arrow/ipc/json_integration_test.cc
+++ b/cpp/src/arrow/ipc/json_integration_test.cc
@@ -250,24 +250,12 @@ static const char* JSON_EXAMPLE = R"example(
       {
         "name": "foo",
         "type": {"name": "int", "isSigned": true, "bitWidth": 32},
-        "nullable": true, "children": [],
-        "typeLayout": {
-          "vectors": [
-            {"type": "VALIDITY", "typeBitWidth": 1},
-            {"type": "DATA", "typeBitWidth": 32}
-          ]
-        }
+        "nullable": true, "children": []
       },
       {
         "name": "bar",
         "type": {"name": "floatingpoint", "precision": "DOUBLE"},
-        "nullable": true, "children": [],
-        "typeLayout": {
-          "vectors": [
-            {"type": "VALIDITY", "typeBitWidth": 1},
-            {"type": "DATA", "typeBitWidth": 64}
-          ]
-        }
+        "nullable": true, "children": []
       }
     ]
   },
@@ -318,12 +306,6 @@ static const char* JSON_EXAMPLE2 = R"example(
         "name": "foo",
         "type": {"name": "int", "isSigned": true, "bitWidth": 32},
         "nullable": true, "children": [],
-        "typeLayout": {
-          "vectors": [
-            {"type": "VALIDITY", "typeBitWidth": 1},
-            {"type": "DATA", "typeBitWidth": 32}
-          ]
-        },
         "metadata": [
           {"key": "converted_from_time32", "value": "true"}
         ]
diff --git a/cpp/src/arrow/ipc/json_internal.cc b/cpp/src/arrow/ipc/json_internal.cc
index 133681c..3dfae2a 100644
--- a/cpp/src/arrow/ipc/json_internal.cc
+++ b/cpp/src/arrow/ipc/json_internal.cc
@@ -38,8 +38,10 @@
 #include "arrow/util/bit_util.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/decimal.h"
+#include "arrow/util/formatting.h"
 #include "arrow/util/key_value_metadata.h"
 #include "arrow/util/logging.h"
+#include "arrow/util/parsing.h"
 #include "arrow/util/string.h"
 #include "arrow/visitor_inline.h"
 
@@ -335,10 +337,8 @@ class SchemaWriter {
   Status Visit(const TimeType& type) { return WritePrimitive("time", type); }
   Status Visit(const StringType& type) { return WriteVarBytes("utf8", type); }
   Status Visit(const BinaryType& type) { return WriteVarBytes("binary", type); }
-  Status Visit(const LargeStringType& type) { return WriteVarBytes("large_utf8", type); }
-  Status Visit(const LargeBinaryType& type) {
-    return WriteVarBytes("large_binary", type);
-  }
+  Status Visit(const LargeStringType& type) { return WriteVarBytes("largeutf8", type); }
+  Status Visit(const LargeBinaryType& type) { return WriteVarBytes("largebinary", type); }
   Status Visit(const FixedSizeBinaryType& type) {
     return WritePrimitive("fixedsizebinary", type);
   }
@@ -358,7 +358,7 @@ class SchemaWriter {
   }
 
   Status Visit(const LargeListType& type) {
-    WriteName("large_list", type);
+    WriteName("largelist", type);
     return Status::OK();
   }
 
@@ -525,8 +525,21 @@ class ArrayWriter {
   void WriteIntegerField(const char* name, const T* values, int64_t length) {
     writer_->Key(name);
     writer_->StartArray();
-    for (int i = 0; i < length; ++i) {
-      writer_->Int64(values[i]);
+    if (sizeof(T) < sizeof(int64_t)) {
+      for (int i = 0; i < length; ++i) {
+        writer_->Int64(values[i]);
+      }
+    } else {
+      // Represent 64-bit integers as strings, as JSON numbers cannot represent
+      // them exactly.
+      ::arrow::internal::StringFormatter<typename CTypeTraits<T>::ArrowType> formatter;
+      auto append = [this](util::string_view v) {
+        writer_->String(v.data(), static_cast<rj::SizeType>(v.size()));
+        return Status::OK();
+      };
+      for (int i = 0; i < length; ++i) {
+        DCHECK_OK(formatter(values[i], append));
+      }
     }
     writer_->EndArray();
   }
@@ -932,9 +945,9 @@ static Status GetType(const RjObject& json_type,
     *type = utf8();
   } else if (type_name == "binary") {
     *type = binary();
-  } else if (type_name == "large_utf8") {
+  } else if (type_name == "largeutf8") {
     *type = large_utf8();
-  } else if (type_name == "large_binary") {
+  } else if (type_name == "largebinary") {
     *type = large_binary();
   } else if (type_name == "fixedsizebinary") {
     return GetFixedSizeBinary(json_type, type);
@@ -957,7 +970,7 @@ static Status GetType(const RjObject& json_type,
       return Status::Invalid("List must have exactly one child");
     }
     *type = list(children[0]);
-  } else if (type_name == "large_list") {
+  } else if (type_name == "largelist") {
     if (children.size() != 1) {
       return Status::Invalid("Large list must have exactly one child");
     }
@@ -1299,13 +1312,28 @@ class ArrayReader {
     ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(length * sizeof(T), pool_));
 
     T* values = reinterpret_cast<T*>(buffer->mutable_data());
-    for (int i = 0; i < length; ++i) {
-      const rj::Value& val = json_array[i];
-      DCHECK(val.IsInt() || val.IsInt64());
-      if (val.IsInt()) {
-        values[i] = static_cast<T>(val.GetInt());
-      } else {
-        values[i] = static_cast<T>(val.GetInt64());
+    if (sizeof(T) < sizeof(int64_t)) {
+      for (int i = 0; i < length; ++i) {
+        const rj::Value& val = json_array[i];
+        DCHECK(val.IsInt() || val.IsInt64());
+        if (val.IsInt()) {
+          values[i] = static_cast<T>(val.GetInt());
+        } else {
+          values[i] = static_cast<T>(val.GetInt64());
+        }
+      }
+    } else {
+      // Read 64-bit integers as strings, as JSON numbers cannot represent
+      // them exactly.
+      ::arrow::internal::StringConverter<typename CTypeTraits<T>::ArrowType> converter;
+      for (int i = 0; i < length; ++i) {
+        const rj::Value& val = json_array[i];
+        DCHECK(val.IsString());
+        if (!converter(val.GetString(), val.GetStringLength(), &values[i])) {
+          return Status::Invalid("Failed to parse integer: '",
+                                 std::string(val.GetString(), val.GetStringLength()),
+                                 "'");
+        }
       }
     }
 
diff --git a/cpp/src/arrow/ipc/json_test.cc b/cpp/src/arrow/ipc/json_test.cc
index bfc2fab..21a695a 100644
--- a/cpp/src/arrow/ipc/json_test.cc
+++ b/cpp/src/arrow/ipc/json_test.cc
@@ -337,24 +337,12 @@ TEST(TestJsonFileReadWrite, MinimalFormatExample) {
       {
         "name": "foo",
         "type": {"name": "int", "isSigned": true, "bitWidth": 32},
-        "nullable": true, "children": [],
-        "typeLayout": {
-          "vectors": [
-            {"type": "VALIDITY", "typeBitWidth": 1},
-            {"type": "DATA", "typeBitWidth": 32}
-          ]
-        }
+        "nullable": true, "children": []
       },
       {
         "name": "bar",
         "type": {"name": "floatingpoint", "precision": "DOUBLE"},
-        "nullable": true, "children": [],
-        "typeLayout": {
-          "vectors": [
-            {"type": "VALIDITY", "typeBitWidth": 1},
-            {"type": "DATA", "typeBitWidth": 64}
-          ]
-        }
+        "nullable": true, "children": []
       }
     ]
   },
diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py
index 6c07e05..c8af1b2 100644
--- a/dev/archery/archery/cli.py
+++ b/dev/archery/archery/cli.py
@@ -346,7 +346,7 @@ def benchmark_list(ctx, rev_or_path, src, preserve, output, cmake_extras,
         logger.debug(f"Running benchmark {rev_or_path}")
 
         conf = CppBenchmarkRunner.default_configuration(
-                cmake_extras=cmake_extras, **kwargs)
+            cmake_extras=cmake_extras, **kwargs)
 
         runner_base = BenchmarkRunner.from_rev_or_path(
             src, root, rev_or_path, conf)
@@ -399,7 +399,7 @@ def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras,
         logger.debug(f"Running benchmark {rev_or_path}")
 
         conf = CppBenchmarkRunner.default_configuration(
-                cmake_extras=cmake_extras, **kwargs)
+            cmake_extras=cmake_extras, **kwargs)
 
         runner_base = BenchmarkRunner.from_rev_or_path(
             src, root, rev_or_path, conf,
@@ -497,7 +497,7 @@ def benchmark_diff(ctx, src, preserve, output, cmake_extras,
                      f"{baseline} (baseline)")
 
         conf = CppBenchmarkRunner.default_configuration(
-                cmake_extras=cmake_extras, **kwargs)
+            cmake_extras=cmake_extras, **kwargs)
 
         runner_cont = BenchmarkRunner.from_rev_or_path(
             src, root, contender, conf,
@@ -551,7 +551,7 @@ def _set_default(opt, default):
 @click.option('stop_on_error', '-x', '--stop-on-error',
               is_flag=True, default=False,
               help='Stop on first error')
-@click.option('--gold_dirs', multiple=True,
+@click.option('--gold-dirs', multiple=True,
               help="gold integration test file paths")
 @click.option('-k', '--match',
               help=("Substring for test names to include in run, "
diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py
index 0f81acc..3c15abc 100644
--- a/dev/archery/archery/integration/datagen.py
+++ b/dev/archery/archery/integration/datagen.py
@@ -24,12 +24,13 @@ import tempfile
 
 import numpy as np
 
-from .util import (frombytes, rands, tobytes, SKIP_ARROW, SKIP_FLIGHT)
+from .util import (frombytes, tobytes, random_bytes, random_utf8,
+                   SKIP_ARROW, SKIP_FLIGHT)
 
 
 class Field(object):
 
-    def __init__(self, name, nullable=True, metadata=[]):
+    def __init__(self, name, *, nullable=True, metadata=[]):
         self.name = name
         self.nullable = nullable
         self.metadata = metadata
@@ -132,7 +133,7 @@ TEST_INT_MIN = ~TEST_INT_MAX
 
 class IntegerField(PrimitiveField):
 
-    def __init__(self, name, is_signed, bit_width, nullable=True,
+    def __init__(self, name, is_signed, bit_width, *, nullable=True,
                  metadata=[],
                  min_value=TEST_INT_MIN,
                  max_value=TEST_INT_MAX):
@@ -188,7 +189,7 @@ class DateField(IntegerField):
         MILLISECOND: [-62135596800000, 253402214400000]
     }
 
-    def __init__(self, name, unit, nullable=True, metadata=[]):
+    def __init__(self, name, unit, *, nullable=True, metadata=[]):
         bit_width = 32 if unit == self.DAY else 64
 
         min_value, max_value = self._ranges[unit]
@@ -230,7 +231,7 @@ class TimeField(IntegerField):
         'ns': [0, 86400000000000]
     }
 
-    def __init__(self, name, unit='s', nullable=True,
+    def __init__(self, name, unit='s', *, nullable=True,
                  metadata=[]):
         min_val, max_val = self._ranges[unit]
         super(TimeField, self).__init__(name, True, self.BIT_WIDTHS[unit],
@@ -258,7 +259,7 @@ class TimestampField(IntegerField):
         'ns': [np.iinfo('int64').min, np.iinfo('int64').max]
     }
 
-    def __init__(self, name, unit='s', tz=None, nullable=True,
+    def __init__(self, name, unit='s', tz=None, *, nullable=True,
                  metadata=[]):
         min_val, max_val = self._ranges[unit]
         super(TimestampField, self).__init__(name, True, 64,
@@ -283,13 +284,13 @@ class TimestampField(IntegerField):
 
 class DurationIntervalField(IntegerField):
 
-    def __init__(self, name, unit='s', nullable=True,
+    def __init__(self, name, unit='s', *, nullable=True,
                  metadata=[]):
         min_val, max_val = np.iinfo('int64').min, np.iinfo('int64').max,
         super(DurationIntervalField, self).__init__(
-                name, True, 64,
-                nullable=nullable, metadata=metadata,
-                min_value=min_val, max_value=max_val)
+            name, True, 64,
+            nullable=nullable, metadata=metadata,
+            min_value=min_val, max_value=max_val)
         self.unit = unit
 
     def _get_type(self):
@@ -302,12 +303,12 @@ class DurationIntervalField(IntegerField):
 
 
 class YearMonthIntervalField(IntegerField):
-    def __init__(self, name, nullable=True, metadata=[]):
+    def __init__(self, name, *, nullable=True, metadata=[]):
         min_val, max_val = [-10000*12, 10000*12]  # +/- 10000 years.
         super(YearMonthIntervalField, self).__init__(
-                name, True, 32,
-                nullable=nullable, metadata=metadata,
-                min_value=min_val, max_value=max_val)
+            name, True, 32,
+            nullable=nullable, metadata=metadata,
+            min_value=min_val, max_value=max_val)
 
     def _get_type(self):
         fields = [
@@ -319,7 +320,7 @@ class YearMonthIntervalField(IntegerField):
 
 
 class DayTimeIntervalField(PrimitiveField):
-    def __init__(self, name, nullable=True, metadata=[]):
+    def __init__(self, name, *, nullable=True, metadata=[]):
         super(DayTimeIntervalField, self).__init__(name,
                                                    nullable=True,
                                                    metadata=metadata)
@@ -349,7 +350,7 @@ class DayTimeIntervalField(PrimitiveField):
 
 class FloatingPointField(PrimitiveField):
 
-    def __init__(self, name, bit_width, nullable=True,
+    def __init__(self, name, bit_width, *, nullable=True,
                  metadata=[]):
         super(FloatingPointField, self).__init__(name,
                                                  nullable=nullable,
@@ -401,8 +402,8 @@ def decimal_range_from_precision(precision):
 
 
 class DecimalField(PrimitiveField):
-    def __init__(self, name, precision, scale, bit_width=128, nullable=True,
-                 metadata=[]):
+    def __init__(self, name, precision, scale, bit_width=128, *,
+                 nullable=True, metadata=[]):
         super(DecimalField, self).__init__(name, nullable=True,
                                            metadata=metadata)
         self.precision = precision
@@ -458,7 +459,13 @@ class BooleanField(PrimitiveField):
         return PrimitiveColumn(name, size, is_valid, values)
 
 
-class BinaryField(PrimitiveField):
+class FixedSizeBinaryField(PrimitiveField):
+
+    def __init__(self, name, byte_width, *, nullable=True,
+                 metadata=[]):
+        super(FixedSizeBinaryField, self).__init__(name, nullable=nullable,
+                                                   metadata=metadata)
+        self.byte_width = byte_width
 
     @property
     def numpy_type(self):
@@ -466,37 +473,25 @@ class BinaryField(PrimitiveField):
 
     @property
     def column_class(self):
-        return BinaryColumn
+        return FixedSizeBinaryColumn
 
     def _get_type(self):
-        return OrderedDict([('name', 'binary')])
+        return OrderedDict([('name', 'fixedsizebinary'),
+                            ('byteWidth', self.byte_width)])
 
     def generate_column(self, size, name=None):
-        K = 7
         is_valid = self._make_is_valid(size)
         values = []
 
         for i in range(size):
-            if is_valid[i]:
-                draw = (np.random.randint(0, 255, size=K)
-                        .astype(np.uint8)
-                        .tostring())
-                values.append(draw)
-            else:
-                values.append(b"")
+            values.append(random_bytes(self.byte_width))
 
         if name is None:
             name = self.name
         return self.column_class(name, size, is_valid, values)
 
 
-class FixedSizeBinaryField(PrimitiveField):
-
-    def __init__(self, name, byte_width, nullable=True,
-                 metadata=[]):
-        super(FixedSizeBinaryField, self).__init__(name, nullable=nullable,
-                                                   metadata=metadata)
-        self.byte_width = byte_width
+class BinaryField(PrimitiveField):
 
     @property
     def numpy_type(self):
@@ -504,29 +499,25 @@ class FixedSizeBinaryField(PrimitiveField):
 
     @property
     def column_class(self):
-        return FixedSizeBinaryColumn
+        return BinaryColumn
 
     def _get_type(self):
-        return OrderedDict([('name', 'fixedsizebinary'),
-                            ('byteWidth', self.byte_width)])
+        return OrderedDict([('name', 'binary')])
 
-    def _get_type_layout(self):
-        return OrderedDict([
-            ('vectors',
-             [OrderedDict([('type', 'VALIDITY'),
-                           ('typeBitWidth', 1)]),
-              OrderedDict([('type', 'DATA'),
-                           ('typeBitWidth', self.byte_width)])])])
+    def _random_sizes(self, size):
+        return np.random.exponential(scale=4, size=size).astype(np.int32)
 
     def generate_column(self, size, name=None):
         is_valid = self._make_is_valid(size)
         values = []
 
-        for i in range(size):
-            draw = (np.random.randint(0, 255, size=self.byte_width)
-                    .astype(np.uint8)
-                    .tostring())
-            values.append(draw)
+        sizes = self._random_sizes(size)
+
+        for i, nbytes in enumerate(sizes):
+            if is_valid[i]:
+                values.append(random_bytes(nbytes))
+            else:
+                values.append(b"")
 
         if name is None:
             name = self.name
@@ -549,7 +540,7 @@ class StringField(BinaryField):
 
         for i in range(size):
             if is_valid[i]:
-                values.append(tobytes(rands(K)))
+                values.append(tobytes(random_utf8(K)))
             else:
                 values.append(b"")
 
@@ -558,6 +549,26 @@ class StringField(BinaryField):
         return self.column_class(name, size, is_valid, values)
 
 
+class LargeBinaryField(BinaryField):
+
+    @property
+    def column_class(self):
+        return LargeBinaryColumn
+
+    def _get_type(self):
+        return OrderedDict([('name', 'largebinary')])
+
+
+class LargeStringField(StringField):
+
+    @property
+    def column_class(self):
+        return LargeStringColumn
+
+    def _get_type(self):
+        return OrderedDict([('name', 'largeutf8')])
+
+
 class Schema(object):
 
     def __init__(self, fields, metadata=None):
@@ -575,7 +586,21 @@ class Schema(object):
         return OrderedDict(entries)
 
 
-class BinaryColumn(PrimitiveColumn):
+class _NarrowOffsetsMixin:
+
+    def _encode_offsets(self, offsets):
+        return list(map(int, offsets))
+
+
+class _LargeOffsetsMixin:
+
+    def _encode_offsets(self, offsets):
+        # 64-bit offsets have to be represented as strings to roundtrip
+        # through JSON.
+        return list(map(str, offsets))
+
+
+class _BaseBinaryColumn(PrimitiveColumn):
 
     def _encode_value(self, x):
         return frombytes(binascii.hexlify(x).upper())
@@ -596,15 +621,37 @@ class BinaryColumn(PrimitiveColumn):
 
         return [
             ('VALIDITY', [int(x) for x in self.is_valid]),
-            ('OFFSET', offsets),
+            ('OFFSET', self._encode_offsets(offsets)),
             ('DATA', data)
         ]
 
 
+class _BaseStringColumn(_BaseBinaryColumn):
+
+    def _encode_value(self, x):
+        return frombytes(x)
+
+
+class BinaryColumn(_BaseBinaryColumn, _NarrowOffsetsMixin):
+    pass
+
+
+class StringColumn(_BaseStringColumn, _NarrowOffsetsMixin):
+    pass
+
+
+class LargeBinaryColumn(_BaseBinaryColumn, _LargeOffsetsMixin):
+    pass
+
+
+class LargeStringColumn(_BaseStringColumn, _LargeOffsetsMixin):
+    pass
+
+
 class FixedSizeBinaryColumn(PrimitiveColumn):
 
     def _encode_value(self, x):
-        return ''.join('{:02x}'.format(c).upper() for c in x)
+        return frombytes(binascii.hexlify(x).upper())
 
     def _get_buffers(self):
         data = []
@@ -617,20 +664,18 @@ class FixedSizeBinaryColumn(PrimitiveColumn):
         ]
 
 
-class StringColumn(BinaryColumn):
-
-    def _encode_value(self, x):
-        return frombytes(x)
-
-
 class ListField(Field):
 
-    def __init__(self, name, value_field, nullable=True,
+    def __init__(self, name, value_field, *, nullable=True,
                  metadata=[]):
         super(ListField, self).__init__(name, nullable=nullable,
                                         metadata=metadata)
         self.value_field = value_field
 
+    @property
+    def column_class(self):
+        return ListColumn
+
     def _get_type(self):
         return OrderedDict([
             ('name', 'list')
@@ -657,13 +702,25 @@ class ListField(Field):
 
         if name is None:
             name = self.name
-        return ListColumn(name, size, is_valid, offsets, values)
+        return self.column_class(name, size, is_valid, offsets, values)
+
 
+class LargeListField(ListField):
 
-class ListColumn(Column):
+    @property
+    def column_class(self):
+        return LargeListColumn
+
+    def _get_type(self):
+        return OrderedDict([
+            ('name', 'largelist')
+        ])
+
+
+class _BaseListColumn(Column):
 
     def __init__(self, name, count, is_valid, offsets, values):
-        super(ListColumn, self).__init__(name, count)
+        super().__init__(name, count)
         self.is_valid = is_valid
         self.offsets = offsets
         self.values = values
@@ -671,31 +728,39 @@ class ListColumn(Column):
     def _get_buffers(self):
         return [
             ('VALIDITY', [int(v) for v in self.is_valid]),
-            ('OFFSET', list(self.offsets))
+            ('OFFSET', self._encode_offsets(self.offsets))
         ]
 
     def _get_children(self):
         return [self.values.get_json()]
 
 
+class ListColumn(_BaseListColumn, _NarrowOffsetsMixin):
+    pass
+
+
+class LargeListColumn(_BaseListColumn, _LargeOffsetsMixin):
+    pass
+
+
 class MapField(Field):
 
-    def __init__(self, name, key_field, item_field, nullable=True,
-                 metadata=[], keysSorted=False):
+    def __init__(self, name, key_field, item_field, *, nullable=True,
+                 metadata=[], keys_sorted=False):
         super(MapField, self).__init__(name, nullable=nullable,
                                        metadata=metadata)
 
         assert not key_field.nullable
         self.key_field = key_field
         self.item_field = item_field
-        self.pair_field = StructField(
-            'entries', [key_field, item_field], False)
-        self.keysSorted = keysSorted
+        self.pair_field = StructField('entries', [key_field, item_field],
+                                      nullable=False)
+        self.keys_sorted = keys_sorted
 
     def _get_type(self):
         return OrderedDict([
             ('name', 'map'),
-            ('keysSorted', self.keysSorted)
+            ('keysSorted', self.keys_sorted)
         ])
 
     def _get_children(self):
@@ -742,7 +807,7 @@ class MapColumn(Column):
 
 class FixedSizeListField(Field):
 
-    def __init__(self, name, value_field, list_size, nullable=True,
+    def __init__(self, name, value_field, list_size, *, nullable=True,
                  metadata=[]):
         super(FixedSizeListField, self).__init__(name, nullable=nullable,
                                                  metadata=metadata)
@@ -785,7 +850,7 @@ class FixedSizeListColumn(Column):
 
 class StructField(Field):
 
-    def __init__(self, name, fields, nullable=True,
+    def __init__(self, name, fields, *, nullable=True,
                  metadata=[]):
         super(StructField, self).__init__(name, nullable=nullable,
                                           metadata=metadata)
@@ -829,7 +894,7 @@ class Dictionary(object):
 
 class DictionaryField(Field):
 
-    def __init__(self, name, index_field, dictionary, nullable=True,
+    def __init__(self, name, index_field, dictionary, *, nullable=True,
                  metadata=[]):
         super(DictionaryField, self).__init__(name, nullable=nullable,
                                               metadata=metadata)
@@ -935,6 +1000,10 @@ def get_field(name, type_, **kwargs):
         return BinaryField(name, **kwargs)
     elif type_ == 'utf8':
         return StringField(name, **kwargs)
+    elif type_ == 'largebinary':
+        return LargeBinaryField(name, **kwargs)
+    elif type_ == 'largeutf8':
+        return LargeStringField(name, **kwargs)
     elif type_.startswith('fixedsizebinary_'):
         byte_width = int(type_.split('_')[1])
         return FixedSizeBinaryField(name, byte_width=byte_width, **kwargs)
@@ -1020,6 +1089,18 @@ def generate_primitive_case(batch_sizes, name='primitive'):
     return _generate_file(name, fields, batch_sizes)
 
 
+def generate_primitive_large_offsets_case(batch_sizes):
+    types = ['largebinary', 'largeutf8']
+
+    fields = []
+
+    for type_ in types:
+        fields.append(get_field(type_ + "_nullable", type_, nullable=True))
+        fields.append(get_field(type_ + "_nonnullable", type_, nullable=False))
+
+    return _generate_file('primitive_large_offsets', fields, batch_sizes)
+
+
 def generate_null_case(batch_sizes):
     # Interleave null with non-null types to ensure the appropriate number of
     # buffers (0) is read and written
@@ -1106,15 +1187,42 @@ def generate_nested_case():
                            get_field('item', 'int32'), 4),
         StructField('struct_nullable', [get_field('f1', 'int32'),
                                         get_field('f2', 'utf8')]),
-
-        # TODO(wesm): this causes segfault
-        # ListField('list_nonnullable', get_field('item', 'int32'), False),
+        # Fails on Go (ARROW-8452)
+        # ListField('list_nonnullable', get_field('item', 'int32'),
+        #           nullable=False),
     ]
 
     batch_sizes = [7, 10]
     return _generate_file("nested", fields, batch_sizes)
 
 
+def generate_recursive_nested_case():
+    fields = [
+        ListField('lists_list',
+                  ListField('inner_list', get_field('item', 'int16'))),
+        ListField('structs_list',
+                  StructField('inner_struct',
+                              [get_field('f1', 'int32'),
+                               get_field('f2', 'utf8')])),
+    ]
+
+    batch_sizes = [7, 10]
+    return _generate_file("recursive_nested", fields, batch_sizes)
+
+
+def generate_nested_large_offsets_case():
+    fields = [
+        LargeListField('large_list_nullable', get_field('item', 'int32')),
+        LargeListField('large_list_nonnullable',
+                       get_field('item', 'int32'), nullable=False),
+        LargeListField('large_list_nested',
+                       ListField('inner_list', get_field('item', 'int16'))),
+    ]
+
+    batch_sizes = [0, 13]
+    return _generate_file("nested_large_offsets", fields, batch_sizes)
+
+
 def generate_dictionary_case():
     dict0 = Dictionary(0, StringField('dictionary1'), size=10, name='DICT0')
     dict1 = Dictionary(1, StringField('dictionary1'), size=5, name='DICT1')
@@ -1140,9 +1248,9 @@ def generate_nested_dictionary_case():
     dict1 = Dictionary(1, list_of_dict, size=30, name='DICT1')
 
     struct_of_dict = StructField('struct', [
-            DictionaryField('str_dict_a', get_field('', 'int8'), dict0),
-            DictionaryField('str_dict_b', get_field('', 'int8'), dict0)
-        ])
+        DictionaryField('str_dict_a', get_field('', 'int8'), dict0),
+        DictionaryField('str_dict_b', get_field('', 'int8'), dict0)
+    ])
     dict2 = Dictionary(2, struct_of_dict, size=30, name='DICT2')
 
     fields = [
@@ -1166,6 +1274,11 @@ def get_generated_json_files(tempdir=None, flight=False):
         generate_primitive_case([17, 20], name='primitive'),
         generate_primitive_case([0, 0, 0], name='primitive_zerolength'),
 
+        generate_primitive_large_offsets_case([17, 20])
+        .skip_category('Go')
+        .skip_category('Java')  # TODO(ARROW-6110)
+        .skip_category('JS'),
+
         generate_null_case([10, 0])
         .skip_category('JS')   # TODO(ARROW-7900)
         .skip_category('Go'),  # TODO(ARROW-7901)
@@ -1187,6 +1300,14 @@ def get_generated_json_files(tempdir=None, flight=False):
 
         generate_nested_case(),
 
+        # TODO(ARROW-8453)
+        generate_recursive_nested_case().skip_category('Go'),
+
+        generate_nested_large_offsets_case()
+        .skip_category('Go')
+        .skip_category('Java')  # TODO(ARROW-6111)
+        .skip_category('JS'),
+
         generate_custom_metadata_case().skip_category('Go')
                                        .skip_category('Java')
                                        .skip_category('JS'),
diff --git a/dev/archery/archery/integration/util.py b/dev/archery/archery/integration/util.py
index e3f2542..a4c4982 100644
--- a/dev/archery/archery/integration/util.py
+++ b/dev/archery/archery/integration/util.py
@@ -18,8 +18,8 @@
 import contextlib
 import io
 import os
+import random
 import socket
-import string
 import subprocess
 import sys
 import threading
@@ -32,9 +32,6 @@ def guid():
     return uuid.uuid4().hex
 
 
-RANDS_CHARS = np.array(list(string.ascii_letters + string.digits),
-                       dtype=(np.str_, 1))
-
 # SKIP categories
 SKIP_ARROW = 'arrow'
 SKIP_FLIGHT = 'flight'
@@ -100,14 +97,26 @@ printer = _Printer()
 log = printer.print
 
 
-def rands(nchars):
+_RAND_CHARS = np.array(list("abcdefghijklmnop123456Ârrôwµ£°€矢"), dtype="U")
+
+
+def random_utf8(nchars):
     """
-    Generate one random byte string.
+    Generate one random UTF8 string.
+    """
+    return ''.join(np.random.choice(_RAND_CHARS, nchars))
 
-    See `rands_array` if you want to create an array of random strings.
 
+def random_bytes(nbytes):
+    """
+    Generate one random binary string.
     """
-    return ''.join(np.random.choice(RANDS_CHARS, nchars))
+    # NOTE getrandbits(0) fails
+    if nbytes > 0:
+        return random.getrandbits(nbytes * 8).to_bytes(nbytes,
+                                                       byteorder='little')
+    else:
+        return b""
 
 
 def tobytes(o):

[arrow] branch master updated: ARROW-8450: [Integration][C++] Implement large offsets types

Reply via email to