Repository: arrow Updated Branches: refs/heads/master dc6cefde4 -> 3add9181f
ARROW-476: Add binary integration test fixture, add Java support @julienledem could you review my Java changes? Thanks Author: Wes McKinney <[email protected]> Closes #326 from wesm/ARROW-476 and squashes the following commits: a75228d [Wes McKinney] Use PoolBuffer instead of std::vector e5a96a0 [Wes McKinney] Chain exceptions b23b852 [Wes McKinney] Use hexadecimal for transporting binary data in JSON 1d4e850 [Wes McKinney] Compare byte[] with Arrays.equals e5f13d5 [Wes McKinney] Add binary integration test fixture, add to JsonFileReader.java, but fails Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/3add9181 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/3add9181 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/3add9181 Branch: refs/heads/master Commit: 3add9181f98810bcfeae558bf44093d9ab89bc3f Parents: dc6cefd Author: Wes McKinney <[email protected]> Authored: Thu Feb 9 10:45:35 2017 -0500 Committer: Wes McKinney <[email protected]> Committed: Thu Feb 9 10:45:35 2017 -0500 ---------------------------------------------------------------------- cpp/src/arrow/ipc/json-internal.cc | 55 ++++++++++++++++++-- integration/integration_test.py | 55 +++++++++++++++++--- java/vector/pom.xml | 5 ++ .../arrow/vector/file/json/JsonFileReader.java | 14 +++++ .../arrow/vector/file/json/JsonFileWriter.java | 6 +++ .../org/apache/arrow/vector/util/Validator.java | 4 ++ 6 files changed, 129 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/3add9181/cpp/src/arrow/ipc/json-internal.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 1a95b2c..b9f97dd 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -17,7 +17,10 @@ #include "arrow/ipc/json-internal.h" +#include <algorithm> #include <cstdint> +#include <cstdlib> +#include <iostream> #include <memory> #include <sstream> #include <string> @@ -40,6 +43,8 @@ namespace arrow { namespace ipc { +static const char* kAsciiTable = "0123456789ABCDEF"; + using RjArray = rj::Value::ConstArray; using RjObject = rj::Value::ConstObject; @@ -395,14 +400,26 @@ class JsonArrayWriter : public ArrayVisitor { } } - // String (Utf8), Binary + // Binary, encode to hexadecimal. UTF8 string write as is template <typename T> typename std::enable_if<std::is_base_of<BinaryArray, T>::value, void>::type WriteDataValues(const T& arr) { for (int i = 0; i < arr.length(); ++i) { int32_t length; const char* buf = reinterpret_cast<const char*>(arr.GetValue(i, &length)); - writer_->String(buf, length); + + if (std::is_base_of<StringArray, T>::value) { + writer_->String(buf, length); + } else { + std::string hex_string; + hex_string.reserve(length * 2); + for (int32_t j = 0; j < length; ++j) { + // Convert to 2 base16 digits + hex_string.push_back(kAsciiTable[buf[j] >> 4]); + hex_string.push_back(kAsciiTable[buf[j] & 15]); + } + writer_->String(hex_string); + } } } @@ -773,6 +790,20 @@ class JsonSchemaReader { const rj::Value& json_schema_; }; +static inline Status ParseHexValue(const char* data, uint8_t* out) { + char c1 = data[0]; + char c2 = data[1]; + + const char* pos1 = std::lower_bound(kAsciiTable, kAsciiTable + 16, c1); + const char* pos2 = std::lower_bound(kAsciiTable, kAsciiTable + 16, c2); + + // Error checking + if (*pos1 != c1 || *pos2 != c2) { return Status::Invalid("Encountered non-hex digit"); } + + *out = (pos1 - kAsciiTable) << 4 | (pos2 - kAsciiTable); + return Status::OK(); +} + class JsonArrayReader { public: explicit JsonArrayReader(MemoryPool* pool) : pool_(pool) {} @@ -852,6 +883,8 @@ class JsonArrayReader { const auto& json_data_arr = json_data->value.GetArray(); DCHECK_EQ(static_cast<int32_t>(json_data_arr.Size()), length); + + auto byte_buffer = std::make_shared<PoolBuffer>(pool_); for (int i = 0; i < length; ++i) { if (!is_valid[i]) { builder.AppendNull(); @@ -860,7 +893,23 @@ class JsonArrayReader { const rj::Value& val = json_data_arr[i]; DCHECK(val.IsString()); - builder.Append(val.GetString()); + if (std::is_base_of<StringType, T>::value) { + builder.Append(val.GetString()); + } else { + std::string hex_string = val.GetString(); + + DCHECK(hex_string.size() % 2 == 0) << "Expected base16 hex string"; + int64_t length = static_cast<int>(hex_string.size()) / 2; + + if (byte_buffer->size() < length) { RETURN_NOT_OK(byte_buffer->Resize(length)); } + + const char* hex_data = hex_string.c_str(); + uint8_t* byte_buffer_data = byte_buffer->mutable_data(); + for (int64_t j = 0; j < length; ++j) { + RETURN_NOT_OK(ParseHexValue(hex_data + j * 2, &byte_buffer_data[j])); + } + RETURN_NOT_OK(builder.Append(byte_buffer_data, length)); + } } return builder.Finish(array); http://git-wip-us.apache.org/repos/asf/arrow/blob/3add9181/integration/integration_test.py ---------------------------------------------------------------------- diff --git a/integration/integration_test.py b/integration/integration_test.py index a622bf2..1d8dc29 100644 --- a/integration/integration_test.py +++ b/integration/integration_test.py @@ -241,14 +241,18 @@ class BooleanType(PrimitiveType): return PrimitiveColumn(self.name, size, is_valid, values) -class StringType(PrimitiveType): +class BinaryType(PrimitiveType): @property def numpy_type(self): return object + @property + def column_class(self): + return BinaryColumn + def _get_type(self): - return OrderedDict([('name', 'utf8')]) + return OrderedDict([('name', 'binary')]) def _get_type_layout(self): return OrderedDict([ @@ -267,11 +271,37 @@ class StringType(PrimitiveType): for i in range(size): if is_valid[i]: + draw = (np.random.randint(0, 255, size=K) + .astype(np.uint8) + .tostring()) + values.append(draw) + else: + values.append("") + + return self.column_class(self.name, size, is_valid, values) + + +class StringType(BinaryType): + + @property + def column_class(self): + return StringColumn + + def _get_type(self): + return OrderedDict([('name', 'utf8')]) + + def generate_column(self, size): + K = 7 + is_valid = self._make_is_valid(size) + values = [] + + for i in range(size): + if is_valid[i]: values.append(rands(K)) else: values.append("") - return StringColumn(self.name, size, is_valid, values) + return self.column_class(self.name, size, is_valid, values) class JSONSchema(object): @@ -285,7 +315,10 @@ class JSONSchema(object): ]) -class StringColumn(PrimitiveColumn): +class BinaryColumn(PrimitiveColumn): + + def _encode_value(self, x): + return ''.join('{:02x}'.format(c).upper() for c in x) def _get_buffers(self): offset = 0 @@ -299,7 +332,7 @@ class StringColumn(PrimitiveColumn): v = "" offsets.append(offset) - data.append(v) + data.append(self._encode_value(v)) return [ ('VALIDITY', [int(x) for x in self.is_valid]), @@ -308,6 +341,12 @@ class StringColumn(PrimitiveColumn): ] +class StringColumn(BinaryColumn): + + def _encode_value(self, x): + return x + + class ListType(DataType): def __init__(self, name, value_type, nullable=True): @@ -443,7 +482,9 @@ class JSONFile(object): def get_field(name, type_, nullable=True): - if type_ == 'utf8': + if type_ == 'binary': + return BinaryType(name, nullable=nullable) + elif type_ == 'utf8': return StringType(name, nullable=nullable) dtype = np.dtype(type_) @@ -463,7 +504,7 @@ def get_field(name, type_, nullable=True): def generate_primitive_case(): types = ['bool', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64', - 'float32', 'float64', 'utf8'] + 'float32', 'float64', 'binary', 'utf8'] fields = [] http://git-wip-us.apache.org/repos/asf/arrow/blob/3add9181/java/vector/pom.xml ---------------------------------------------------------------------- diff --git a/java/vector/pom.xml b/java/vector/pom.xml index 64b68bf..8517d4c 100644 --- a/java/vector/pom.xml +++ b/java/vector/pom.xml @@ -56,6 +56,11 @@ <artifactId>commons-lang3</artifactId> <version>3.4</version> </dependency> + <dependency> + <groupId>commons-codec</groupId> + <artifactId>commons-codec</artifactId> + <version>1.10</version> + </dependency> </dependencies> <pluginRepositories> http://git-wip-us.apache.org/repos/asf/arrow/blob/3add9181/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java ---------------------------------------------------------------------- diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java index 71fe88e..24fdc18 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java @@ -48,6 +48,7 @@ import org.apache.arrow.vector.UInt4Vector; import org.apache.arrow.vector.UInt8Vector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.ValueVector.Mutator; +import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.complex.NullableMapVector; @@ -60,6 +61,8 @@ import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.core.JsonToken; import com.fasterxml.jackson.databind.MappingJsonFactory; import com.google.common.base.Objects; +import org.apache.commons.codec.DecoderException; +import org.apache.commons.codec.binary.Hex; public class JsonFileReader implements AutoCloseable { private final File inputFile; @@ -164,6 +167,14 @@ public class JsonFileReader implements AutoCloseable { readToken(END_OBJECT); } + private byte[] decodeHexSafe(String hexString) throws IOException { + try { + return Hex.decodeHex(hexString.toCharArray()); + } catch (DecoderException e) { + throw new IOException("Unable to decode hex string: " + hexString, e); + } + } + private void setValueFromParser(ValueVector valueVector, int i) throws IOException { switch (valueVector.getMinorType()) { case BIT: @@ -199,6 +210,9 @@ public class JsonFileReader implements AutoCloseable { case FLOAT8: ((Float8Vector)valueVector).getMutator().set(i, parser.readValueAs(Double.class)); break; + case VARBINARY: + ((VarBinaryVector)valueVector).getMutator().setSafe(i, decodeHexSafe(parser.readValueAs(String.class))); + break; case VARCHAR: ((VarCharVector)valueVector).getMutator().setSafe(i, parser.readValueAs(String.class).getBytes(UTF_8)); break; http://git-wip-us.apache.org/repos/asf/arrow/blob/3add9181/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java ---------------------------------------------------------------------- diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java index ddc8043..99040b6 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java @@ -30,6 +30,7 @@ import org.apache.arrow.vector.TimeStampMicroVector; import org.apache.arrow.vector.TimeStampNanoVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.ValueVector.Accessor; +import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.schema.ArrowVectorType; import org.apache.arrow.vector.types.pojo.Field; @@ -40,6 +41,7 @@ import com.fasterxml.jackson.core.JsonGenerator; import com.fasterxml.jackson.core.util.DefaultPrettyPrinter; import com.fasterxml.jackson.core.util.DefaultPrettyPrinter.NopIndenter; import com.fasterxml.jackson.databind.MappingJsonFactory; +import org.apache.commons.codec.binary.Hex; public class JsonFileWriter implements AutoCloseable { @@ -157,6 +159,10 @@ public class JsonFileWriter implements AutoCloseable { case BIT: generator.writeNumber(((BitVector)valueVector).getAccessor().get(i)); break; + case VARBINARY: + String hexString = Hex.encodeHexString(((VarBinaryVector) valueVector).getAccessor().get(i)); + generator.writeObject(hexString); + break; default: // TODO: each type Accessor accessor = valueVector.getAccessor(); http://git-wip-us.apache.org/repos/asf/arrow/blob/3add9181/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java ---------------------------------------------------------------------- diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java index a974582..f294e20 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java @@ -17,6 +17,7 @@ */ package org.apache.arrow.vector.util; +import java.util.Arrays; import java.util.List; import org.apache.arrow.vector.FieldVector; @@ -89,7 +90,10 @@ public class Validator { default: throw new UnsupportedOperationException("unsupported precision: " + fpType); } + } else if (type instanceof ArrowType.Binary) { + return Arrays.equals((byte[]) o1, (byte[]) o2); } + return Objects.equal(o1, o2); }
