mathyingzhou commented on a change in pull request #8648: URL: https://github.com/apache/arrow/pull/8648#discussion_r567596202
########## File path: cpp/src/arrow/adapters/orc/adapter_test.cc ########## @@ -157,4 +197,1930 @@ TEST(TestAdapter, readIntAndStringFileMultipleStripes) { EXPECT_TRUE(stripe_reader->ReadNext(&record_batch).ok()); } } + +// WriteORC tests + +// Trivial +TEST(TestAdapterWriteTrivial, writeZeroRowsNoConversion) { + std::shared_ptr<Table> table = TableFromJSON( + schema({field("bool", boolean()), field("int8", int8()), field("int16", int16()), + field("int32", int32()), field("int64", int64()), field("float", float32()), + field("double", float64()), field("decimal128nz", decimal(25, 6)), + field("decimal128z", decimal(32, 0)), field("date32", date32()), + field("ts3", timestamp(TimeUnit::NANO)), field("string", utf8()), + field("binary", binary()), + field("struct", struct_({field("a", utf8()), field("b", int64())})), + field("list", list(int32())), + field("lsl", list(struct_({field("lsl0", list(int32()))})))}), + {R"([])"}); + AssertTableWriteReadEqual(table, table, kDefaultSmallMemStreamSize / 16); +} +TEST(TestAdapterWriteTrivial, writeChunklessNoConversion) { + std::shared_ptr<Table> table = TableFromJSON( + schema({field("bool", boolean()), field("int8", int8()), field("int16", int16()), + field("int32", int32()), field("int64", int64()), field("float", float32()), + field("double", float64()), field("decimal128nz", decimal(25, 6)), + field("decimal128z", decimal(32, 0)), field("date32", date32()), + field("ts3", timestamp(TimeUnit::NANO)), field("string", utf8()), + field("binary", binary()), + field("struct", struct_({field("a", utf8()), field("b", int64())})), + field("list", list(int32())), + field("lsl", list(struct_({field("lsl0", list(int32()))})))}), + {}); + AssertTableWriteReadEqual(table, table, kDefaultSmallMemStreamSize / 16); +} +TEST(TestAdapterWriteTrivial, writeZeroRowsWithConversion) { + std::shared_ptr<Table> + input_table = TableFromJSON( + schema({field("date64", date64()), field("ts0", timestamp(TimeUnit::SECOND)), + field("ts1", 
timestamp(TimeUnit::MILLI)), + field("ts2", timestamp(TimeUnit::MICRO)), + field("large_string", large_utf8()), + field("large_binary", large_binary()), + field("fixed_size_binary0", fixed_size_binary(0)), + field("fixed_size_binary", fixed_size_binary(5)), + field("large_list", large_list(int32())), + field("fixed_size_list", fixed_size_list(int32(), 3)), + field("map", map(utf8(), utf8()))}), + {R"([])"}), + expected_output_table = TableFromJSON( + schema({field("date64", timestamp(TimeUnit::NANO)), + field("ts0", timestamp(TimeUnit::NANO)), + field("ts1", timestamp(TimeUnit::NANO)), + field("ts2", timestamp(TimeUnit::NANO)), field("large_string", utf8()), + field("large_binary", binary()), field("fixed_size_binary0", binary()), + field("fixed_size_binary", binary()), + field("large_list", list(int32())), + field("fixed_size_list", list(int32())), + field("map", + list(struct_({field("key", utf8()), field("value", utf8())})))}), + {R"([])"}); + AssertTableWriteReadEqual(input_table, expected_output_table, + kDefaultSmallMemStreamSize / 16); +} +TEST(TestAdapterWriteTrivial, writeChunklessWithConversion) { + std::shared_ptr<Table> + input_table = TableFromJSON( + schema({field("date64", date64()), field("ts0", timestamp(TimeUnit::SECOND)), + field("ts1", timestamp(TimeUnit::MILLI)), + field("ts2", timestamp(TimeUnit::MICRO)), + field("large_string", large_utf8()), + field("large_binary", large_binary()), + field("fixed_size_binary0", fixed_size_binary(0)), + field("fixed_size_binary", fixed_size_binary(5)), + field("large_list", large_list(int32())), + field("fixed_size_list", fixed_size_list(int32(), 3)), + field("map", map(utf8(), utf8()))}), + {}), + expected_output_table = TableFromJSON( + schema({field("date64", timestamp(TimeUnit::NANO)), + field("ts0", timestamp(TimeUnit::NANO)), + field("ts1", timestamp(TimeUnit::NANO)), + field("ts2", timestamp(TimeUnit::NANO)), field("large_string", utf8()), + field("large_binary", binary()), field("fixed_size_binary0", 
binary()), + field("fixed_size_binary", binary()), + field("large_list", list(int32())), + field("fixed_size_list", list(int32())), + field("map", + list(struct_({field("key", utf8()), field("value", utf8())})))}), + {}); + AssertTableWriteReadEqual(input_table, expected_output_table, + kDefaultSmallMemStreamSize / 16); +} + +// General +TEST(TestAdapterWriteGeneral, writeAllNullsNew) { + std::vector<std::shared_ptr<Field>> table_fields{ + field("bool", boolean()), + field("int8", int8()), + field("int16", int16()), + field("int32", int32()), + field("int64", int64()), + field("decimal128nz", decimal(33, 4)), + field("decimal128z", decimal(35, 0)), + field("date32", date32()), + field("ts3", timestamp(TimeUnit::NANO)), + field("string", utf8()), + field("binary", binary())}; + std::shared_ptr<Schema> table_schema = schema(table_fields); + arrow::random::RandomArrayGenerator rand(kRandomSeed); + + int64_t num_rows = 10000; + int64_t numCols = table_fields.size(); + + ArrayMatrix arrays(numCols, ArrayVector(5, NULLPTR)); + for (int i = 0; i < numCols; i++) { + for (int j = 0; j < 5; j++) { + int row_count = j % 2 ? 
0 : num_rows / 2; + arrays[i][j] = rand.ArrayOf(table_fields[i]->type(), row_count, 1); + } + } + + ChunkedArrayVector cv; + cv.reserve(numCols); + + for (int col = 0; col < numCols; col++) { + cv.push_back(std::make_shared<ChunkedArray>(arrays[col])); + } + + std::shared_ptr<Table> table = Table::Make(table_schema, cv); + AssertTableWriteReadEqual(table, table); +} + +TEST(TestAdapterWriteGeneral, writeAllNulls) { + std::vector<std::shared_ptr<Field>> table_fields{ + field("bool", boolean()), + field("int8", int8()), + field("int16", int16()), + field("int32", int32()), + field("int64", int64()), + field("decimal128nz", decimal(33, 4)), + field("decimal128z", decimal(35, 0)), + field("date32", date32()), + field("ts3", timestamp(TimeUnit::NANO)), + field("string", utf8()), + field("binary", binary())}; + std::shared_ptr<Schema> table_schema = std::make_shared<Schema>(table_fields); + + int64_t num_rows = 10000; + int64_t numCols = table_fields.size(); + + ArrayBuilderMatrix builders(numCols, ArrayBuilderVector(5, NULLPTR)); + + for (int i = 0; i < 5; i++) { + builders[0][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<BooleanBuilder>()); + builders[1][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int8Builder>()); + builders[2][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int16Builder>()); + builders[3][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int32Builder>()); + builders[4][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int64Builder>()); + builders[5][i] = std::static_pointer_cast<ArrayBuilder>( + std::make_shared<Decimal128Builder>(decimal(33, 4))); + builders[6][i] = std::static_pointer_cast<ArrayBuilder>( + std::make_shared<Decimal128Builder>(decimal(35, 0))); Review comment: Yup. It is not supported in ORC now so we can't. I can file an ORC ticket though. 
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at: us...@infra.apache.org