Repository: arrow Updated Branches: refs/heads/master a1237a28f -> 7e9576426 (forced update)
ARROW-1513: C++: Add cast from Dictionary to plain arrays Author: Wes McKinney <wes.mckin...@twosigma.com> Author: Uwe L. Korn <uw...@xhochy.com> Closes #1086 from xhochy/ARROW-1513 and squashes the following commits: 769b205e [Wes McKinney] Fix compiler error / warning. Change type checks to DCHECK 05c539f2 [Uwe L. Korn] ARROW-1513: C++: Add cast from Dictionary to plain arrays Change-Id: I80d444e7a0661d58909801004ac9818dece04a15 Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/7e957642 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/7e957642 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/7e957642 Branch: refs/heads/master Commit: 7e95764263c34a3bfabf0ca9909f425c16042141 Parents: 94f6247 Author: Uwe L. Korn <uw...@xhochy.com> Authored: Mon Sep 11 13:11:37 2017 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Mon Sep 11 13:12:24 2017 -0400 ---------------------------------------------------------------------- cpp/src/arrow/compute/cast.cc | 344 +++++++++++++++++++++++++---- cpp/src/arrow/compute/compute-test.cc | 26 ++- cpp/src/arrow/table-test.cc | 66 +++--- cpp/src/arrow/test-common.h | 66 +++++- cpp/src/arrow/type.h | 5 + cpp/src/arrow/util/bit-util.h | 6 +- 6 files changed, 429 insertions(+), 84 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/7e957642/cpp/src/arrow/compute/cast.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compute/cast.cc b/cpp/src/arrow/compute/cast.cc index 3885fdf..c651244 100644 --- a/cpp/src/arrow/compute/cast.cc +++ b/cpp/src/arrow/compute/cast.cc @@ -28,6 +28,8 @@ #include "arrow/array.h" #include "arrow/buffer.h" +#include "arrow/builder.h" +#include "arrow/compare.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" @@ -37,6 +39,32 @@ #include "arrow/compute/context.h" #include "arrow/compute/kernel.h" +#ifdef ARROW_EXTRA_ERROR_CONTEXT + +#define FUNC_RETURN_NOT_OK(s) \ + do { \ + Status _s = (s); \ + if (ARROW_PREDICT_FALSE(!_s.ok())) { \ + std::stringstream ss; \ + ss << __FILE__ << ":" << __LINE__ << " code: " << #s << "\n" << _s.message(); \ + ctx->SetStatus(Status(_s.code(), ss.str())); \ + return; \ + } \ + } while (0) + +#else + +#define FUNC_RETURN_NOT_OK(s) \ + do { \ + Status _s = (s); \ + if (ARROW_PREDICT_FALSE(!_s.ok())) { \ + ctx->SetStatus(_s); \ + return; \ + } \ + } while (0) + +#endif // ARROW_EXTRA_ERROR_CONTEXT + namespace arrow { namespace compute { @@ -226,19 +254,219 @@ struct CastFunctor<O, I, }; // ---------------------------------------------------------------------- +// Dictionary to other things + +template <typename IndexType> +void UnpackFixedSizeBinaryDictionary(FunctionContext* ctx, const Array& indices, + const FixedSizeBinaryArray& dictionary, + ArrayData* output) { + using index_c_type = typename IndexType::c_type; + const uint8_t* valid_bits = indices.null_bitmap_data(); + INIT_BITSET(valid_bits, indices.offset()); + + const index_c_type* in = + reinterpret_cast<const index_c_type*>(indices.data()->buffers[1]->data()) + + indices.offset(); + uint8_t* out = output->buffers[1]->mutable_data(); + int32_t byte_width = + static_cast<const FixedSizeBinaryType&>(*output->type).byte_width(); + for (int64_t i = 0; i < indices.length(); ++i) { + if (bitset_valid_bits & (1 << bit_offset_valid_bits)) { + const uint8_t* value = dictionary.Value(in[i]); + memcpy(out + i * byte_width, value, byte_width); + } + READ_NEXT_BITSET(valid_bits); + } +} + +template <typename T> +struct CastFunctor< + T, DictionaryType, + typename std::enable_if<std::is_base_of<FixedSizeBinaryType, T>::value>::type> { + void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input, + ArrayData* output) { + const DictionaryArray& dict_array = static_cast<const DictionaryArray&>(input); + const DictionaryType& type = static_cast<const DictionaryType&>(*input.type()); + const DataType& values_type = *type.dictionary()->type(); + const FixedSizeBinaryArray& dictionary = + static_cast<const FixedSizeBinaryArray&>(*type.dictionary()); + + // Check if values and output type match + DCHECK(values_type.Equals(*output->type)) + << "Dictionary type: " << values_type + << " target type: " << (*output->type); + + const Array& indices = *dict_array.indices(); + switch (indices.type()->id()) { + case Type::INT8: + UnpackFixedSizeBinaryDictionary<Int8Type>(ctx, indices, dictionary, output); + break; + case Type::INT16: + UnpackFixedSizeBinaryDictionary<Int16Type>(ctx, indices, dictionary, output); + break; + case Type::INT32: + UnpackFixedSizeBinaryDictionary<Int32Type>(ctx, indices, dictionary, output); + break; + case Type::INT64: + UnpackFixedSizeBinaryDictionary<Int64Type>(ctx, indices, dictionary, output); + break; + default: + std::stringstream ss; + ss << "Invalid index type: " << indices.type()->ToString(); + ctx->SetStatus(Status::Invalid(ss.str())); + return; + } + } +}; + +template <typename IndexType> +Status UnpackBinaryDictionary(FunctionContext* ctx, const Array& indices, + const BinaryArray& dictionary, ArrayData* output) { + using index_c_type = typename IndexType::c_type; + std::unique_ptr<ArrayBuilder> builder; + RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), output->type, &builder)); + BinaryBuilder* binary_builder = static_cast<BinaryBuilder*>(builder.get()); + + const uint8_t* valid_bits = indices.null_bitmap_data(); + INIT_BITSET(valid_bits, indices.offset()); + + const index_c_type* in = + reinterpret_cast<const index_c_type*>(indices.data()->buffers[1]->data()) + + indices.offset(); + for (int64_t i = 0; i < indices.length(); ++i) { + if (bitset_valid_bits & (1 << bit_offset_valid_bits)) { + int32_t length; + const uint8_t* value = dictionary.GetValue(in[i], &length); + RETURN_NOT_OK(binary_builder->Append(value, length)); + } else { + RETURN_NOT_OK(binary_builder->AppendNull()); + } + READ_NEXT_BITSET(valid_bits); + } + + std::shared_ptr<Array> plain_array; + RETURN_NOT_OK(binary_builder->Finish(&plain_array)); + // Copy all buffer except the valid bitmap + for (size_t i = 1; i < plain_array->data()->buffers.size(); i++) { + output->buffers.push_back(plain_array->data()->buffers[i]); + } + + return Status::OK(); +} + +template <typename T> +struct CastFunctor<T, DictionaryType, + typename std::enable_if<std::is_base_of<BinaryType, T>::value>::type> { + void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input, + ArrayData* output) { + const DictionaryArray& dict_array = static_cast<const DictionaryArray&>(input); + const DictionaryType& type = static_cast<const DictionaryType&>(*input.type()); + const DataType& values_type = *type.dictionary()->type(); + const BinaryArray& dictionary = static_cast<const BinaryArray&>(*type.dictionary()); + + // Check if values and output type match + DCHECK(values_type.Equals(*output->type)) + << "Dictionary type: " << values_type + << " target type: " << (*output->type); + + const Array& indices = *dict_array.indices(); + switch (indices.type()->id()) { + case Type::INT8: + FUNC_RETURN_NOT_OK( + (UnpackBinaryDictionary<Int8Type>(ctx, indices, dictionary, output))); + break; + case Type::INT16: + FUNC_RETURN_NOT_OK( + (UnpackBinaryDictionary<Int16Type>(ctx, indices, dictionary, output))); + break; + case Type::INT32: + FUNC_RETURN_NOT_OK( + (UnpackBinaryDictionary<Int32Type>(ctx, indices, dictionary, output))); + break; + case Type::INT64: + FUNC_RETURN_NOT_OK( + (UnpackBinaryDictionary<Int64Type>(ctx, indices, dictionary, output))); + break; + default: + std::stringstream ss; + ss << "Invalid index type: " << indices.type()->ToString(); + ctx->SetStatus(Status::Invalid(ss.str())); + return; + } + } +}; + +template <typename IndexType, typename c_type> +void UnpackPrimitiveDictionary(const Array& indices, const c_type* dictionary, + c_type* out) { + using index_c_type = typename IndexType::c_type; + + const uint8_t* valid_bits = indices.null_bitmap_data(); + INIT_BITSET(valid_bits, indices.offset()); + + const index_c_type* in = + reinterpret_cast<const index_c_type*>(indices.data()->buffers[1]->data()) + + indices.offset(); + for (int64_t i = 0; i < indices.length(); ++i) { + if (bitset_valid_bits & (1 << bit_offset_valid_bits)) { + out[i] = dictionary[in[i]]; + } + READ_NEXT_BITSET(valid_bits); + } +} + +// Cast from dictionary to plain representation +template <typename T> +struct CastFunctor<T, DictionaryType, + typename std::enable_if<IsNumeric<T>::value>::type> { + void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input, + ArrayData* output) { + using c_type = typename T::c_type; + + const DictionaryArray& dict_array = static_cast<const DictionaryArray&>(input); + const DictionaryType& type = static_cast<const DictionaryType&>(*input.type()); + const DataType& values_type = *type.dictionary()->type(); + + // Check if values and output type match + DCHECK(values_type.Equals(*output->type)) + << "Dictionary type: " << values_type + << " target type: " << (*output->type); + + auto dictionary = + reinterpret_cast<const c_type*>(type.dictionary()->data()->buffers[1]->data()) + + type.dictionary()->offset(); + auto out = reinterpret_cast<c_type*>(output->buffers[1]->mutable_data()); + const Array& indices = *dict_array.indices(); + switch (indices.type()->id()) { + case Type::INT8: + UnpackPrimitiveDictionary<Int8Type, c_type>(indices, dictionary, out); + break; + case Type::INT16: + UnpackPrimitiveDictionary<Int16Type, c_type>(indices, dictionary, out); + break; + case Type::INT32: + UnpackPrimitiveDictionary<Int32Type, c_type>(indices, dictionary, out); + break; + case Type::INT64: + UnpackPrimitiveDictionary<Int64Type, c_type>(indices, dictionary, out); + break; + default: + std::stringstream ss; + ss << "Invalid index type: " << indices.type()->ToString(); + ctx->SetStatus(Status::Invalid(ss.str())); + return; + } + } +}; + +// ---------------------------------------------------------------------- typedef std::function<void(FunctionContext*, const CastOptions& options, const Array&, ArrayData*)> CastFunction; static Status AllocateIfNotPreallocated(FunctionContext* ctx, const Array& input, - ArrayData* out) { - if (!is_primitive(out->type->id())) { - return Status::NotImplemented(out->type->ToString()); - } - - const auto& fw_type = static_cast<const FixedWidthType&>(*out->type); - + bool can_pre_allocate_values, ArrayData* out) { const int64_t length = input.length(); out->null_count = input.null_count(); @@ -261,35 +489,49 @@ static Status AllocateIfNotPreallocated(FunctionContext* ctx, const Array& input out->buffers.push_back(validity_bitmap); - std::shared_ptr<Buffer> out_data; + if (can_pre_allocate_values) { + std::shared_ptr<Buffer> out_data; - int bit_width = fw_type.bit_width(); - int64_t buffer_size = 0; + if (!(is_primitive(out->type->id()) || out->type->id() == Type::FIXED_SIZE_BINARY)) { + std::stringstream ss; + ss << "Cannot pre-allocate memory for type: " << out->type->ToString(); + return Status::NotImplemented(ss.str()); + } - if (bit_width == 1) { - buffer_size = BitUtil::BytesForBits(length); - } else if (bit_width % 8 == 0) { - buffer_size = length * fw_type.bit_width() / 8; - } else { - DCHECK(false); - } + const auto& fw_type = static_cast<const FixedWidthType&>(*out->type); + + int bit_width = fw_type.bit_width(); + int64_t buffer_size = 0; - RETURN_NOT_OK(ctx->Allocate(buffer_size, &out_data)); - memset(out_data->mutable_data(), 0, buffer_size); + if (bit_width == 1) { + buffer_size = BitUtil::BytesForBits(length); + } else if (bit_width % 8 == 0) { + buffer_size = length * fw_type.bit_width() / 8; + } else { + DCHECK(false); + } - out->buffers.push_back(out_data); + RETURN_NOT_OK(ctx->Allocate(buffer_size, &out_data)); + memset(out_data->mutable_data(), 0, buffer_size); + + out->buffers.push_back(out_data); + } return Status::OK(); } class CastKernel : public UnaryKernel { public: - CastKernel(const CastOptions& options, const CastFunction& func, bool is_zero_copy) - : options_(options), func_(func), is_zero_copy_(is_zero_copy) {} + CastKernel(const CastOptions& options, const CastFunction& func, bool is_zero_copy, + bool can_pre_allocate_values) + : options_(options), + func_(func), + is_zero_copy_(is_zero_copy), + can_pre_allocate_values_(can_pre_allocate_values) {} Status Call(FunctionContext* ctx, const Array& input, ArrayData* out) override { if (!is_zero_copy_) { - RETURN_NOT_OK(AllocateIfNotPreallocated(ctx, input, out)); + RETURN_NOT_OK(AllocateIfNotPreallocated(ctx, input, can_pre_allocate_values_, out)); } func_(ctx, options_, input, out); RETURN_IF_ERROR(ctx); @@ -300,11 +542,14 @@ class CastKernel : public UnaryKernel { CastOptions options_; CastFunction func_; bool is_zero_copy_; + bool can_pre_allocate_values_; }; #define CAST_CASE(InType, OutType) \ case OutType::type_id: \ is_zero_copy = is_zero_copy_cast<OutType, InType>::value; \ + can_pre_allocate_values = \ + !(!is_binary_like(InType::type_id) && is_binary_like(OutType::type_id)); \ func = [](FunctionContext* ctx, const CastOptions& options, const Array& input, \ ArrayData* out) { \ CastFunctor<OutType, InType> func; \ @@ -354,20 +599,42 @@ class CastKernel : public UnaryKernel { #define TIMESTAMP_CASES(FN, IN_TYPE) FN(TimestampType, TimestampType); -#define GET_CAST_FUNCTION(CASE_GENERATOR, InType) \ - static std::unique_ptr<UnaryKernel> Get##InType##CastFunc( \ - const std::shared_ptr<DataType>& out_type, const CastOptions& options) { \ - CastFunction func; \ - bool is_zero_copy = false; \ - switch (out_type->id()) { \ - CASE_GENERATOR(CAST_CASE, InType); \ - default: \ - break; \ - } \ - if (func != nullptr) { \ - return std::unique_ptr<UnaryKernel>(new CastKernel(options, func, is_zero_copy)); \ - } \ - return nullptr; \ +#define DICTIONARY_CASES(FN, IN_TYPE) \ + FN(IN_TYPE, Time32Type); \ + FN(IN_TYPE, Date32Type); \ + FN(IN_TYPE, TimestampType); \ + FN(IN_TYPE, Time64Type); \ + FN(IN_TYPE, Date64Type); \ + FN(IN_TYPE, UInt8Type); \ + FN(IN_TYPE, Int8Type); \ + FN(IN_TYPE, UInt16Type); \ + FN(IN_TYPE, Int16Type); \ + FN(IN_TYPE, UInt32Type); \ + FN(IN_TYPE, Int32Type); \ + FN(IN_TYPE, UInt64Type); \ + FN(IN_TYPE, Int64Type); \ + FN(IN_TYPE, FloatType); \ + FN(IN_TYPE, DoubleType); \ + FN(IN_TYPE, FixedSizeBinaryType); \ + FN(IN_TYPE, BinaryType); \ + FN(IN_TYPE, StringType); + +#define GET_CAST_FUNCTION(CASE_GENERATOR, InType) \ + static std::unique_ptr<UnaryKernel> Get##InType##CastFunc( \ + const std::shared_ptr<DataType>& out_type, const CastOptions& options) { \ + CastFunction func; \ + bool is_zero_copy = false; \ + bool can_pre_allocate_values = true; \ + switch (out_type->id()) { \ + CASE_GENERATOR(CAST_CASE, InType); \ + default: \ + break; \ + } \ + if (func != nullptr) { \ + return std::unique_ptr<UnaryKernel>( \ + new CastKernel(options, func, is_zero_copy, can_pre_allocate_values)); \ + } \ + return nullptr; \ } GET_CAST_FUNCTION(NULL_CASES, NullType); @@ -388,6 +655,8 @@ GET_CAST_FUNCTION(TIME32_CASES, Time32Type); GET_CAST_FUNCTION(TIME64_CASES, Time64Type); GET_CAST_FUNCTION(TIMESTAMP_CASES, TimestampType); +GET_CAST_FUNCTION(DICTIONARY_CASES, DictionaryType); + #define CAST_FUNCTION_CASE(InType) \ case InType::type_id: \ *kernel = Get##InType##CastFunc(out_type, options); \ @@ -413,6 +682,7 @@ Status GetCastFunction(const DataType& in_type, const std::shared_ptr<DataType>& CAST_FUNCTION_CASE(Time32Type); CAST_FUNCTION_CASE(Time64Type); CAST_FUNCTION_CASE(TimestampType); + CAST_FUNCTION_CASE(DictionaryType); default: break; } http://git-wip-us.apache.org/repos/asf/arrow/blob/7e957642/cpp/src/arrow/compute/compute-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc index ba645c2..5898aee 100644 --- a/cpp/src/arrow/compute/compute-test.cc +++ b/cpp/src/arrow/compute/compute-test.cc @@ -66,10 +66,9 @@ void AssertArraysEqual(const Array& left, const Array& right) { class ComputeFixture { public: - ComputeFixture() : pool_(default_memory_pool()), ctx_(pool_) {} + ComputeFixture() : ctx_(default_memory_pool()) {} protected: - MemoryPool* pool_; FunctionContext ctx_; }; @@ -81,7 +80,7 @@ static void AssertBufferSame(const Array& left, const Array& right, int buffer_i right.data()->buffers[buffer_index].get()); } -class TestCast : public ComputeFixture, public ::testing::Test { +class TestCast : public ComputeFixture, public TestBase { public: void CheckPass(const Array& input, const Array& expected, const std::shared_ptr<DataType>& out_type, const CastOptions& options) { @@ -395,5 +394,26 @@ TEST_F(TestCast, PreallocatedMemory) { AssertArraysEqual(*expected, *result); } +template <typename TestType> +class TestDictionaryCast : public TestCast {}; + +typedef ::testing::Types<UInt8Type, Int8Type, UInt16Type, Int16Type, Int32Type, + UInt32Type, UInt64Type, Int64Type, FloatType, DoubleType, + Date32Type, Date64Type, FixedSizeBinaryType, BinaryType> + TestTypes; + +TYPED_TEST_CASE(TestDictionaryCast, TestTypes); + +TYPED_TEST(TestDictionaryCast, Basic) { + CastOptions options; + std::shared_ptr<Array> plain_array = + TestBase::MakeRandomArray<typename TypeTraits<TypeParam>::ArrayType>(10, 2); + + std::shared_ptr<Array> dict_array; + ASSERT_OK(EncodeArrayToDictionary(*plain_array, this->pool_, &dict_array)); + + this->CheckPass(*dict_array, *plain_array, plain_array->type(), options); +} + } // namespace compute } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/7e957642/cpp/src/arrow/table-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc index a9c7e6d..b0aeed1 100644 --- a/cpp/src/arrow/table-test.cc +++ b/cpp/src/arrow/table-test.cc @@ -128,9 +128,9 @@ class TestColumn : public TestChunkedArray { TEST_F(TestColumn, BasicAPI) { ArrayVector arrays; - arrays.push_back(MakePrimitive<Int32Array>(100)); - arrays.push_back(MakePrimitive<Int32Array>(100, 10)); - arrays.push_back(MakePrimitive<Int32Array>(100, 20)); + arrays.push_back(MakeRandomArray<Int32Array>(100)); + arrays.push_back(MakeRandomArray<Int32Array>(100, 10)); + arrays.push_back(MakeRandomArray<Int32Array>(100, 20)); auto f0 = field("c0", int32()); column_.reset(new Column(f0, arrays)); @@ -148,15 +148,15 @@ TEST_F(TestColumn, BasicAPI) { TEST_F(TestColumn, ChunksInhomogeneous) { ArrayVector arrays; - arrays.push_back(MakePrimitive<Int32Array>(100)); - arrays.push_back(MakePrimitive<Int32Array>(100, 10)); + arrays.push_back(MakeRandomArray<Int32Array>(100)); + arrays.push_back(MakeRandomArray<Int32Array>(100, 10)); auto f0 = field("c0", int32()); column_.reset(new Column(f0, arrays)); ASSERT_OK(column_->ValidateData()); - arrays.push_back(MakePrimitive<Int16Array>(100, 10)); + arrays.push_back(MakeRandomArray<Int16Array>(100, 10)); column_.reset(new Column(f0, arrays)); ASSERT_RAISES(Invalid, column_->ValidateData()); } @@ -202,8 +202,8 @@ class TestTable : public TestBase { vector<shared_ptr<Field>> fields = {f0, f1, f2}; schema_ = std::make_shared<Schema>(fields); - arrays_ = {MakePrimitive<Int32Array>(length), MakePrimitive<UInt8Array>(length), - MakePrimitive<Int16Array>(length)}; + arrays_ = {MakeRandomArray<Int32Array>(length), MakeRandomArray<UInt8Array>(length), + MakeRandomArray<Int16Array>(length)}; columns_ = {std::make_shared<Column>(schema_->field(0), arrays_[0]), std::make_shared<Column>(schema_->field(1), arrays_[1]), @@ -276,9 +276,10 @@ TEST_F(TestTable, InvalidColumns) { ASSERT_RAISES(Invalid, table_->ValidateColumns()); columns_ = { - std::make_shared<Column>(schema_->field(0), MakePrimitive<Int32Array>(length)), - std::make_shared<Column>(schema_->field(1), MakePrimitive<UInt8Array>(length)), - std::make_shared<Column>(schema_->field(2), MakePrimitive<Int16Array>(length - 1))}; + std::make_shared<Column>(schema_->field(0), MakeRandomArray<Int32Array>(length)), + std::make_shared<Column>(schema_->field(1), MakeRandomArray<UInt8Array>(length)), + std::make_shared<Column>(schema_->field(2), + MakeRandomArray<Int16Array>(length - 1))}; table_.reset(new Table(schema_, columns_, length)); ASSERT_RAISES(Invalid, table_->ValidateColumns()); @@ -300,9 +301,12 @@ TEST_F(TestTable, Equals) { ASSERT_FALSE(table_->Equals(Table(other_schema, columns_))); // Differing columns std::vector<std::shared_ptr<Column>> other_columns = { - std::make_shared<Column>(schema_->field(0), MakePrimitive<Int32Array>(length, 10)), - std::make_shared<Column>(schema_->field(1), MakePrimitive<UInt8Array>(length, 10)), - std::make_shared<Column>(schema_->field(2), MakePrimitive<Int16Array>(length, 10))}; + std::make_shared<Column>(schema_->field(0), + MakeRandomArray<Int32Array>(length, 10)), + std::make_shared<Column>(schema_->field(1), + MakeRandomArray<UInt8Array>(length, 10)), + std::make_shared<Column>(schema_->field(2), + MakeRandomArray<Int16Array>(length, 10))}; ASSERT_FALSE(table_->Equals(Table(schema_, other_columns))); } @@ -410,8 +414,8 @@ TEST_F(TestTable, AddColumn) { ASSERT_TRUE(status.IsInvalid()); // Add column with wrong length - auto longer_col = - std::make_shared<Column>(schema_->field(0), MakePrimitive<Int32Array>(length + 1)); + auto longer_col = std::make_shared<Column>(schema_->field(0), + MakeRandomArray<Int32Array>(length + 1)); status = table.AddColumn(0, longer_col, &result); ASSERT_TRUE(status.IsInvalid()); @@ -445,8 +449,8 @@ TEST_F(TestTable, AddColumn) { TEST_F(TestTable, IsChunked) { ArrayVector c1, c2; - auto a1 = MakePrimitive<Int32Array>(10); - auto a2 = MakePrimitive<Int32Array>(20); + auto a1 = MakeRandomArray<Int32Array>(10); + auto a2 = MakeRandomArray<Int32Array>(20); auto sch1 = arrow::schema({field("f1", int32()), field("f2", int32())}); @@ -477,9 +481,9 @@ TEST_F(TestRecordBatch, Equals) { vector<shared_ptr<Field>> fields = {f0, f1, f2}; auto schema = std::make_shared<Schema>(fields); - auto a0 = MakePrimitive<Int32Array>(length); - auto a1 = MakePrimitive<UInt8Array>(length); - auto a2 = MakePrimitive<Int16Array>(length); + auto a0 = MakeRandomArray<Int32Array>(length); + auto a1 = MakeRandomArray<UInt8Array>(length); + auto a2 = MakeRandomArray<Int16Array>(length); RecordBatch b1(schema, length, {a0, a1, a2}); RecordBatch b3(schema, length, {a0, a1}); @@ -502,10 +506,10 @@ TEST_F(TestRecordBatch, Validate) { auto schema = ::arrow::schema({f0, f1, f2}); - auto a0 = MakePrimitive<Int32Array>(length); - auto a1 = MakePrimitive<UInt8Array>(length); - auto a2 = MakePrimitive<Int16Array>(length); - auto a3 = MakePrimitive<Int16Array>(5); + auto a0 = MakeRandomArray<Int32Array>(length); + auto a1 = MakeRandomArray<UInt8Array>(length); + auto a2 = MakeRandomArray<Int16Array>(length); + auto a3 = MakeRandomArray<Int16Array>(5); RecordBatch b1(schema, length, {a0, a1, a2}); @@ -531,8 +535,8 @@ TEST_F(TestRecordBatch, Slice) { vector<shared_ptr<Field>> fields = {f0, f1}; auto schema = std::make_shared<Schema>(fields); - auto a0 = MakePrimitive<Int32Array>(length); - auto a1 = MakePrimitive<UInt8Array>(length); + auto a0 = MakeRandomArray<Int32Array>(length); + auto a1 = MakeRandomArray<UInt8Array>(length); RecordBatch batch(schema, length, {a0, a1}); @@ -555,10 +559,10 @@ class TestTableBatchReader : public TestBase {}; TEST_F(TestTableBatchReader, ReadNext) { ArrayVector c1, c2; - auto a1 = MakePrimitive<Int32Array>(10); - auto a2 = MakePrimitive<Int32Array>(20); - auto a3 = MakePrimitive<Int32Array>(30); - auto a4 = MakePrimitive<Int32Array>(10); + auto a1 = MakeRandomArray<Int32Array>(10); + auto a2 = MakeRandomArray<Int32Array>(20); + auto a3 = MakeRandomArray<Int32Array>(30); + auto a4 = MakeRandomArray<Int32Array>(10); auto sch1 = arrow::schema({field("f1", int32()), field("f2", int32())}); http://git-wip-us.apache.org/repos/asf/arrow/blob/7e957642/cpp/src/arrow/test-common.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/test-common.h b/cpp/src/arrow/test-common.h index 4ce0640..3dc39fc 100644 --- a/cpp/src/arrow/test-common.h +++ b/cpp/src/arrow/test-common.h @@ -42,15 +42,7 @@ class TestBase : public ::testing::Test { random_seed_ = 0; } - template <typename ArrayType> - std::shared_ptr<Array> MakePrimitive(int64_t length, int64_t null_count = 0) { - auto data = std::make_shared<PoolBuffer>(pool_); - const int64_t data_nbytes = length * sizeof(typename ArrayType::value_type); - EXPECT_OK(data->Resize(data_nbytes)); - - // Fill with random data - test::random_bytes(data_nbytes, random_seed_++, data->mutable_data()); - + std::shared_ptr<Buffer> MakeRandomNullBitmap(int64_t length, int64_t null_count) { auto null_bitmap = std::make_shared<PoolBuffer>(pool_); const int64_t null_nbytes = BitUtil::BytesForBits(length); EXPECT_OK(null_bitmap->Resize(null_nbytes)); @@ -58,14 +50,68 @@ class TestBase : public ::testing::Test { for (int64_t i = 0; i < null_count; i++) { BitUtil::ClearBit(null_bitmap->mutable_data(), i * (length / null_count)); } - return std::make_shared<ArrayType>(length, data, null_bitmap, null_count); + return null_bitmap; } + template <typename ArrayType> + std::shared_ptr<Array> MakeRandomArray(int64_t length, int64_t null_count = 0); + protected: uint32_t random_seed_; MemoryPool* pool_; }; +template <typename ArrayType> +std::shared_ptr<Array> TestBase::MakeRandomArray(int64_t length, int64_t null_count) { + auto data = std::make_shared<PoolBuffer>(pool_); + const int64_t data_nbytes = length * sizeof(typename ArrayType::value_type); + EXPECT_OK(data->Resize(data_nbytes)); + + // Fill with random data + test::random_bytes(data_nbytes, random_seed_++, data->mutable_data()); + std::shared_ptr<Buffer> null_bitmap = MakeRandomNullBitmap(length, null_count); + + return std::make_shared<ArrayType>(length, data, null_bitmap, null_count); +} + +template <> +std::shared_ptr<Array> TestBase::MakeRandomArray<FixedSizeBinaryArray>( + int64_t length, int64_t null_count) { + const int byte_width = 10; + std::shared_ptr<Buffer> null_bitmap = MakeRandomNullBitmap(length, null_count); + std::shared_ptr<PoolBuffer> data = std::make_shared<PoolBuffer>(pool_); + + EXPECT_OK(data->Resize(byte_width * length)); + ::arrow::test::random_bytes(data->size(), 0, data->mutable_data()); + return std::make_shared<FixedSizeBinaryArray>(fixed_size_binary(byte_width), length, + data, null_bitmap, null_count); +} + +template <> +std::shared_ptr<Array> TestBase::MakeRandomArray<BinaryArray>(int64_t length, + int64_t null_count) { + std::vector<uint8_t> valid_bytes(length, 1); + for (int64_t i = 0; i < null_count; i++) { + valid_bytes[i * 2] = 0; + } + BinaryBuilder builder(pool_); + + const int kBufferSize = 10; + uint8_t buffer[kBufferSize]; + for (int64_t i = 0; i < length; i++) { + if (!valid_bytes[i]) { + EXPECT_OK(builder.AppendNull()); + } else { + ::arrow::test::random_bytes(kBufferSize, static_cast<uint32_t>(i), buffer); + EXPECT_OK(builder.Append(buffer, kBufferSize)); + } + } + + std::shared_ptr<Array> out; + EXPECT_OK(builder.Finish(&out)); + return out; +} + class TestBuilder : public ::testing::Test { public: void SetUp() { http://git-wip-us.apache.org/repos/asf/arrow/blob/7e957642/cpp/src/arrow/type.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 283e27e..7630f48 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -169,6 +169,11 @@ class ARROW_EXPORT DataType { ARROW_DISALLOW_COPY_AND_ASSIGN(DataType); }; +inline std::ostream& operator<<(std::ostream& os, const DataType& type) { + os << type.ToString(); + return os; +} + // TODO(wesm): Remove this from parquet-cpp using TypePtr = std::shared_ptr<DataType>; http://git-wip-us.apache.org/repos/asf/arrow/blob/7e957642/cpp/src/arrow/util/bit-util.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index aa83746..b8a8efa 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -49,9 +49,9 @@ namespace arrow { -#define INIT_BITSET(valid_bits_vector, valid_bits_index) \ - int byte_offset_##valid_bits_vector = (valid_bits_index) / 8; \ - int bit_offset_##valid_bits_vector = (valid_bits_index) % 8; \ +#define INIT_BITSET(valid_bits_vector, valid_bits_index) \ + int64_t byte_offset_##valid_bits_vector = (valid_bits_index) / 8; \ + int64_t bit_offset_##valid_bits_vector = (valid_bits_index) % 8; \ uint8_t bitset_##valid_bits_vector = valid_bits_vector[byte_offset_##valid_bits_vector]; #define READ_NEXT_BITSET(valid_bits_vector) \