emkornfield commented on a change in pull request #8648: URL: https://github.com/apache/arrow/pull/8648#discussion_r567578953
########## File path: cpp/src/arrow/adapters/orc/adapter_test.cc ########## @@ -157,4 +249,2151 @@ TEST(TestAdapter, readIntAndStringFileMultipleStripes) { EXPECT_TRUE(stripe_reader->ReadNext(&record_batch).ok()); } } + +// WriteORC tests + +// General +TEST(TestAdapterWriteGeneral, writeZeroRows) { + std::vector<std::shared_ptr<Field>> xFields{field("bool", boolean()), + field("int8", int8()), + field("int16", int16()), + field("int32", int32()), + field("int64", int64()), + field("float", float32()), + field("double", float64()), + field("decimal128nz", decimal(25, 6)), + field("decimal128z", decimal(32, 0)), + field("date32", date32()), + field("ts3", timestamp(TimeUnit::NANO)), + field("string", utf8()), + field("binary", binary())}; + std::shared_ptr<Schema> sharedPtrSchema = std::make_shared<Schema>(xFields); + + int64_t numCols = xFields.size(); + + ArrayBuilderVector builders(numCols, NULLPTR); + builders[0] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<BooleanBuilder>()); + builders[1] = std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int8Builder>()); + builders[2] = std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int16Builder>()); + builders[3] = std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int32Builder>()); + builders[4] = std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int64Builder>()); + builders[5] = std::static_pointer_cast<ArrayBuilder>(std::make_shared<FloatBuilder>()); + builders[6] = std::static_pointer_cast<ArrayBuilder>(std::make_shared<DoubleBuilder>()); + builders[7] = std::static_pointer_cast<ArrayBuilder>( + std::make_shared<Decimal128Builder>(decimal(25, 6))); + builders[8] = std::static_pointer_cast<ArrayBuilder>( + std::make_shared<Decimal128Builder>(decimal(32, 0))); + builders[9] = std::static_pointer_cast<ArrayBuilder>(std::make_shared<Date32Builder>()); + builders[10] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<TimestampBuilder>( + timestamp(TimeUnit::NANO), default_memory_pool())); + builders[11] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<StringBuilder>()); + builders[12] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<BinaryBuilder>()); + ArrayVector arrays(numCols, NULLPTR); + ChunkedArrayVector cv; + cv.reserve(numCols); + + for (int col = 0; col < numCols; col++) { + ARROW_EXPECT_OK(builders[col]->Finish(&arrays[col])); + cv.push_back(std::make_shared<ChunkedArray>(arrays[col])); + } + + std::shared_ptr<Table> table = Table::Make(sharedPtrSchema, cv); + EXPECT_TRUE(tableWriteReadEqual(table, table, DEFAULT_SMALL_MEM_STREAM_SIZE / 16)); +} +TEST(TestAdapterWriteGeneral, writeChunkless) { + std::vector<std::shared_ptr<Field>> xFieldsSub{std::make_shared<Field>("a", utf8()), + std::make_shared<Field>("b", int32())}; + std::vector<std::shared_ptr<Field>> xFields{ + field("bool", boolean()), + field("int8", int8()), + field("int16", int16()), + field("int32", int32()), + field("int64", int64()), + field("float", float32()), + field("double", float64()), + field("decimal128nz", decimal(25, 6)), + field("decimal128z", decimal(32, 0)), + field("date32", date32()), + field("ts3", timestamp(TimeUnit::NANO)), + field("string", utf8()), + field("binary", binary()), + field("struct", struct_(xFieldsSub)), + field("list", list(int32())), + field("lsl", list(struct_({field("lsl0", list(int32()))})))}; + std::shared_ptr<Schema> sharedPtrSchema = std::make_shared<Schema>(xFields); + + int64_t numCols = xFields.size(); + + ChunkedArrayVector cv; + cv.reserve(numCols); + + ArrayMatrix av(numCols, ArrayVector(0, NULLPTR)); + + for (int col = 0; col < numCols; col++) { + cv.push_back(std::make_shared<ChunkedArray>(av[col], xFields[col]->type())); + } + + std::shared_ptr<Table> table = Table::Make(sharedPtrSchema, cv); + EXPECT_TRUE(tableWriteReadEqual(table, table, DEFAULT_SMALL_MEM_STREAM_SIZE / 16)); +} +TEST(TestAdapterWriteGeneral, writeAllNulls) { + std::vector<std::shared_ptr<Field>> xFields{field("bool", boolean()), + field("int8", int8()), + field("int16", int16()), + field("int32", int32()), + field("int64", int64()), + field("decimal128nz", decimal(33, 4)), + field("decimal128z", decimal(35, 0)), + field("date32", date32()), + field("ts3", timestamp(TimeUnit::NANO)), + field("string", utf8()), + field("binary", binary())}; + std::shared_ptr<Schema> sharedPtrSchema = std::make_shared<Schema>(xFields); + + int64_t numRows = 10000; + int64_t numCols = xFields.size(); + + ArrayBuilderMatrix builders(numCols, ArrayBuilderVector(5, NULLPTR)); + + for (int i = 0; i < 5; i++) { + builders[0][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<BooleanBuilder>()); + builders[1][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int8Builder>()); + builders[2][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int16Builder>()); + builders[3][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int32Builder>()); + builders[4][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int64Builder>()); + builders[5][i] = std::static_pointer_cast<ArrayBuilder>( + std::make_shared<Decimal128Builder>(decimal(33, 4))); + builders[6][i] = std::static_pointer_cast<ArrayBuilder>( + std::make_shared<Decimal128Builder>(decimal(35, 0))); + builders[7][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<Date32Builder>()); + builders[8][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<TimestampBuilder>( + timestamp(TimeUnit::NANO), default_memory_pool())); + builders[9][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<StringBuilder>()); + builders[10][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<BinaryBuilder>()); + } + + for (int i = 0; i < numRows; i++) { + int chunk = i < (numRows / 2) ? 1 : 3; + for (int col = 0; col < numCols; col++) { + ARROW_EXPECT_OK(builders[col][chunk]->AppendNull()); + } + } + + ArrayMatrix arrays(numCols, ArrayVector(5, NULLPTR)); + ChunkedArrayVector cv; + cv.reserve(numCols); + + for (int col = 0; col < numCols; col++) { + for (int i = 0; i < 5; i++) { + ARROW_EXPECT_OK(builders[col][i]->Finish(&arrays[col][i])); + } + cv.push_back(std::make_shared<ChunkedArray>(arrays[col])); + } + + std::shared_ptr<Table> table = Table::Make(sharedPtrSchema, cv); + EXPECT_TRUE(tableWriteReadEqual(table, table)); +} +TEST(TestAdapterWriteGeneral, writeNoNulls) { + std::vector<std::shared_ptr<Field>> xFields{field("bool", boolean()), + field("int8", int8()), + field("int16", int16()), + field("int32", int32()), + field("int64", int64()), + field("decimal128nz", decimal(36, 2)), + field("decimal128z", decimal(31, 0)), + field("date32", date32()), + field("ts3", timestamp(TimeUnit::NANO)), + field("string", utf8()), + field("binary", binary())}; + std::shared_ptr<Schema> sharedPtrSchema = std::make_shared<Schema>(xFields); + + int64_t numRows = 10000; + int64_t numCols = xFields.size(); + + ArrayBuilderMatrix builders(numCols, ArrayBuilderVector(5, NULLPTR)); + + for (int i = 0; i < 5; i++) { + builders[0][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<BooleanBuilder>()); + builders[1][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int8Builder>()); + builders[2][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int16Builder>()); + builders[3][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int32Builder>()); + builders[4][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int64Builder>()); + builders[5][i] = std::static_pointer_cast<ArrayBuilder>( + std::make_shared<Decimal128Builder>(decimal(36, 2))); + builders[6][i] = std::static_pointer_cast<ArrayBuilder>( + std::make_shared<Decimal128Builder>(decimal(31, 0))); + builders[7][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<Date32Builder>()); + builders[8][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<TimestampBuilder>( + timestamp(TimeUnit::NANO), default_memory_pool())); + builders[9][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<StringBuilder>()); + builders[10][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<BinaryBuilder>()); + } + + char bin[2], string_[13]; + std::string str; + for (int64_t i = 0; i < numRows / 2; i++) { + bin[0] = i % 128; + bin[1] = bin[0]; + str = "Arrow " + std::to_string(2 * i); + snprintf(string_, sizeof(string_), "%s", str.c_str()); + ARROW_EXPECT_OK( + std::static_pointer_cast<BooleanBuilder>(builders[0][1])->Append(true)); + ARROW_EXPECT_OK( + std::static_pointer_cast<Int8Builder>(builders[1][1])->Append(i % 128)); + ARROW_EXPECT_OK(std::static_pointer_cast<Int16Builder>(builders[2][1])->Append(i)); + ARROW_EXPECT_OK(std::static_pointer_cast<Int32Builder>(builders[3][1])->Append(i)); + ARROW_EXPECT_OK(std::static_pointer_cast<Int64Builder>(builders[4][1])->Append(i)); + ARROW_EXPECT_OK(std::static_pointer_cast<Decimal128Builder>(builders[5][1]) + ->Append(Decimal128(std::to_string(i) + ".56"))); + ARROW_EXPECT_OK(std::static_pointer_cast<Decimal128Builder>(builders[6][1]) + ->Append(Decimal128(std::to_string(i)))); + ARROW_EXPECT_OK( + std::static_pointer_cast<Date32Builder>(builders[7][1])->Append(18600 + i)); + ARROW_EXPECT_OK(std::static_pointer_cast<TimestampBuilder>(builders[8][1]) + ->Append(INT64_C(1605547718999999999) + i)); + ARROW_EXPECT_OK( + std::static_pointer_cast<StringBuilder>(builders[9][1])->Append(string_)); + ARROW_EXPECT_OK( + std::static_pointer_cast<BinaryBuilder>(builders[10][1])->Append(bin, 2)); + } + for (int64_t i = numRows / 2; i < numRows; i++) { + bin[0] = i % 256; + bin[1] = (i / 256) % 256; + str = "Arrow " + std::to_string(3 - 4 * i); + snprintf(string_, sizeof(string_), "%s", str.c_str()); + ARROW_EXPECT_OK( + std::static_pointer_cast<BooleanBuilder>(builders[0][3])->Append(false)); + ARROW_EXPECT_OK( + std::static_pointer_cast<Int8Builder>(builders[1][3])->Append(-(i % 128))); + ARROW_EXPECT_OK( + std::static_pointer_cast<Int16Builder>(builders[2][3])->Append(4 - i)); + ARROW_EXPECT_OK(std::static_pointer_cast<Int32Builder>(builders[3][3])->Append(-i)); + ARROW_EXPECT_OK(std::static_pointer_cast<Int64Builder>(builders[4][3])->Append(-i)); + ARROW_EXPECT_OK(std::static_pointer_cast<Decimal128Builder>(builders[5][3]) + ->Append(Decimal128(std::to_string(-i) + ".00"))); + ARROW_EXPECT_OK(std::static_pointer_cast<Decimal128Builder>(builders[6][3]) + ->Append(Decimal128(std::to_string(-i)))); + ARROW_EXPECT_OK( + std::static_pointer_cast<Date32Builder>(builders[7][3])->Append(18600 - i)); + ARROW_EXPECT_OK(std::static_pointer_cast<TimestampBuilder>(builders[8][3]) + ->Append(INT64_C(1605557718999999999) - i)); + ARROW_EXPECT_OK( + std::static_pointer_cast<StringBuilder>(builders[9][3])->Append(string_)); + ARROW_EXPECT_OK( + std::static_pointer_cast<BinaryBuilder>(builders[10][3])->Append(bin, 2)); + } Review comment: I would recommend looking through the Arrow Parquet tests. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org