mathyingzhou commented on a change in pull request #8648:
URL: https://github.com/apache/arrow/pull/8648#discussion_r560636285
##########
File path: cpp/src/arrow/adapters/orc/adapter_test.cc
##########
@@ -157,4 +225,2478 @@ TEST(TestAdapter, readIntAndStringFileMultipleStripes) {
EXPECT_TRUE(stripe_reader->ReadNext(&record_batch).ok());
}
}
+
+// WriteORC tests
+
+// General
+TEST(TestAdapterWriteGeneral, writeZeroRows) {
+ std::vector<std::shared_ptr<Field>> xFields{field("bool", boolean()),
Review comment:
These tests are mostly amalgamated from the 31,000+ lines of ORC tests
that used to exist. Overall, if we generate random test data, we should be able
to generate random Arrow tables and use them in the ORC tests as well.
##########
File path: cpp/src/arrow/adapters/orc/adapter_test.cc
##########
@@ -157,4 +225,2478 @@ TEST(TestAdapter, readIntAndStringFileMultipleStripes) {
EXPECT_TRUE(stripe_reader->ReadNext(&record_batch).ok());
}
}
+
+// WriteORC tests
+
+// General
+TEST(TestAdapterWriteGeneral, writeZeroRows) {
+ std::vector<std::shared_ptr<Field>> xFields{field("bool", boolean()),
+ field("int8", int8()),
+ field("int16", int16()),
+ field("int32", int32()),
+ field("int64", int64()),
+ field("float", float32()),
+ field("double", float64()),
+ field("decimal128nz",
decimal(25, 6)),
+ field("decimal128z", decimal(32,
0)),
+ field("date32", date32()),
+ field("ts3",
timestamp(TimeUnit::NANO)),
+ field("string", utf8()),
+ field("binary", binary())};
+ std::shared_ptr<Schema> sharedPtrSchema = std::make_shared<Schema>(xFields);
+
+ int64_t numRows = 0;
+ int64_t numCols = xFields.size();
+
+ ArrayBuilderVector builders(numCols, NULLPTR);
+ builders[0] =
+
std::static_pointer_cast<ArrayBuilder>(std::make_shared<BooleanBuilder>());
+ builders[1] =
std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int8Builder>());
+ builders[2] =
std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int16Builder>());
+ builders[3] =
std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int32Builder>());
+ builders[4] =
std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int64Builder>());
+ builders[5] =
std::static_pointer_cast<ArrayBuilder>(std::make_shared<FloatBuilder>());
+ builders[6] =
std::static_pointer_cast<ArrayBuilder>(std::make_shared<DoubleBuilder>());
+ builders[7] = std::static_pointer_cast<ArrayBuilder>(
+ std::make_shared<Decimal128Builder>(decimal(25, 6)));
+ builders[8] = std::static_pointer_cast<ArrayBuilder>(
+ std::make_shared<Decimal128Builder>(decimal(32, 0)));
+ builders[9] =
std::static_pointer_cast<ArrayBuilder>(std::make_shared<Date32Builder>());
+ builders[10] =
+
std::static_pointer_cast<ArrayBuilder>(std::make_shared<TimestampBuilder>(
+ timestamp(TimeUnit::NANO), default_memory_pool()));
+ builders[11] =
+
std::static_pointer_cast<ArrayBuilder>(std::make_shared<StringBuilder>());
+ builders[12] =
+
std::static_pointer_cast<ArrayBuilder>(std::make_shared<BinaryBuilder>());
+ ArrayVector arrays(numCols, NULLPTR);
+ ChunkedArrayVector cv;
+ cv.reserve(numCols);
+
+ for (int col = 0; col < numCols; col++) {
+ ARROW_EXPECT_OK(builders[col]->Finish(&arrays[col]));
+ cv.push_back(std::make_shared<ChunkedArray>(arrays[col]));
+ }
+
+ std::shared_ptr<Table> table = Table::Make(sharedPtrSchema, cv);
+
+ std::unique_ptr<ORCMemWriter> writer =
+ std::unique_ptr<ORCMemWriter>(new ORCMemWriter());
+ std::unique_ptr<liborc::OutputStream> out_stream =
+ std::unique_ptr<liborc::OutputStream>(static_cast<liborc::OutputStream*>(
+ new MemoryOutputStream(DEFAULT_SMALL_MEM_STREAM_SIZE / 16)));
+ ARROW_EXPECT_OK(writer->Open(sharedPtrSchema, out_stream));
+ ARROW_EXPECT_OK(writer->Write(table));
+ auto output_mem_stream =
static_cast<MemoryOutputStream*>(writer->ReleaseOutStream());
+ std::shared_ptr<io::RandomAccessFile> in_stream(
+ new io::BufferReader(std::make_shared<Buffer>(
+ reinterpret_cast<const uint8_t*>(output_mem_stream->getData()),
+ static_cast<int64_t>(output_mem_stream->getLength()))));
+
+ std::unique_ptr<adapters::orc::ORCFileReader> reader;
+ ASSERT_TRUE(
+ adapters::orc::ORCFileReader::Open(in_stream, default_memory_pool(),
&reader).ok());
+ std::shared_ptr<Table> outputTable;
+ ARROW_EXPECT_OK(reader->Read(&outputTable));
+ EXPECT_EQ(outputTable->num_columns(), numCols);
+ EXPECT_EQ(outputTable->num_rows(), numRows);
+ EXPECT_TRUE(outputTable->Equals(*table));
+}
+TEST(TestAdapterWriteGeneral, writeChunkless) {
+ std::vector<std::shared_ptr<Field>> xFieldsSub{std::make_shared<Field>("a",
utf8()),
+ std::make_shared<Field>("b",
int32())};
+ std::vector<std::shared_ptr<Field>> xFields{
+ field("bool", boolean()),
+ field("int8", int8()),
+ field("int16", int16()),
+ field("int32", int32()),
+ field("int64", int64()),
+ field("float", float32()),
+ field("double", float64()),
+ field("decimal128nz", decimal(25, 6)),
+ field("decimal128z", decimal(32, 0)),
+ field("date32", date32()),
+ field("ts3", timestamp(TimeUnit::NANO)),
+ field("string", utf8()),
+ field("binary", binary()),
+ field("struct", struct_(xFieldsSub)),
+ field("list", list(int32())),
+ field("lsl", list(struct_({field("lsl0", list(int32()))})))};
+ std::shared_ptr<Schema> sharedPtrSchema = std::make_shared<Schema>(xFields);
+
+ int64_t numRows = 0;
+ int64_t numCols = xFields.size();
+
+ ChunkedArrayVector cv;
+ cv.reserve(numCols);
+
+ ArrayMatrix av(numCols, ArrayVector(0, NULLPTR));
+
+ for (int col = 0; col < numCols; col++) {
+ cv.push_back(std::make_shared<ChunkedArray>(av[col],
xFields[col]->type()));
+ }
+
+ std::shared_ptr<Table> table = Table::Make(sharedPtrSchema, cv);
+
+ MemoryOutputStream mem_stream(DEFAULT_SMALL_MEM_STREAM_SIZE);
+ std::unique_ptr<ORCMemWriter> writer =
+ std::unique_ptr<ORCMemWriter>(new ORCMemWriter());
+ std::unique_ptr<liborc::OutputStream> out_stream =
+ std::unique_ptr<liborc::OutputStream>(static_cast<liborc::OutputStream*>(
+ new MemoryOutputStream(DEFAULT_SMALL_MEM_STREAM_SIZE / 16)));
+ ARROW_EXPECT_OK(writer->Open(sharedPtrSchema, out_stream));
+ ARROW_EXPECT_OK(writer->Write(table));
+ auto output_mem_stream =
static_cast<MemoryOutputStream*>(writer->ReleaseOutStream());
+ std::shared_ptr<io::RandomAccessFile> in_stream(
+ new io::BufferReader(std::make_shared<Buffer>(
+ reinterpret_cast<const uint8_t*>(output_mem_stream->getData()),
+ static_cast<int64_t>(output_mem_stream->getLength()))));
+
+ std::unique_ptr<adapters::orc::ORCFileReader> reader;
+ ASSERT_TRUE(
+ adapters::orc::ORCFileReader::Open(in_stream, default_memory_pool(),
&reader).ok());
+ std::shared_ptr<Table> outputTable;
+ ARROW_EXPECT_OK(reader->Read(&outputTable));
+ EXPECT_EQ(outputTable->num_columns(), numCols);
+ EXPECT_EQ(outputTable->num_rows(), numRows);
+ EXPECT_TRUE(outputTable->Equals(*table));
+}
+TEST(TestAdapterWriteGeneral, writeAllNulls) {
+ std::vector<std::shared_ptr<Field>> xFields{field("bool", boolean()),
+ field("int8", int8()),
+ field("int16", int16()),
+ field("int32", int32()),
+ field("int64", int64()),
+ field("decimal128nz",
decimal(33, 4)),
+ field("decimal128z", decimal(35,
0)),
+ field("date32", date32()),
+ field("ts3",
timestamp(TimeUnit::NANO)),
+ field("string", utf8()),
+ field("binary", binary())};
+ std::shared_ptr<Schema> sharedPtrSchema = std::make_shared<Schema>(xFields);
+
+ int64_t numRows = 10000;
+ int64_t numCols = xFields.size();
+
+ ArrayBuilderMatrix builders(numCols, ArrayBuilderVector(5, NULLPTR));
+
+ for (int i = 0; i < 5; i++) {
+ builders[0][i] =
+
std::static_pointer_cast<ArrayBuilder>(std::make_shared<BooleanBuilder>());
+ builders[1][i] =
+
std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int8Builder>());
+ builders[2][i] =
+
std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int16Builder>());
+ builders[3][i] =
+
std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int32Builder>());
+ builders[4][i] =
+
std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int64Builder>());
+ builders[5][i] = std::static_pointer_cast<ArrayBuilder>(
+ std::make_shared<Decimal128Builder>(decimal(33, 4)));
+ builders[6][i] = std::static_pointer_cast<ArrayBuilder>(
+ std::make_shared<Decimal128Builder>(decimal(35, 0)));
+ builders[7][i] =
+
std::static_pointer_cast<ArrayBuilder>(std::make_shared<Date32Builder>());
+ builders[8][i] =
+
std::static_pointer_cast<ArrayBuilder>(std::make_shared<TimestampBuilder>(
+ timestamp(TimeUnit::NANO), default_memory_pool()));
+ builders[9][i] =
+
std::static_pointer_cast<ArrayBuilder>(std::make_shared<StringBuilder>());
+ builders[10][i] =
+
std::static_pointer_cast<ArrayBuilder>(std::make_shared<BinaryBuilder>());
+ }
+
+ for (int i = 0; i < numRows; i++) {
+ int chunk = i < (numRows / 2) ? 1 : 3;
+ for (int col = 0; col < numCols; col++) {
+ ARROW_EXPECT_OK(builders[col][chunk]->AppendNull());
+ }
+ }
+
+ ArrayMatrix arrays(numCols, ArrayVector(5, NULLPTR));
+ ChunkedArrayVector cv;
+ cv.reserve(numCols);
+
+ for (int col = 0; col < numCols; col++) {
+ for (int i = 0; i < 5; i++) {
+ ARROW_EXPECT_OK(builders[col][i]->Finish(&arrays[col][i]));
+ }
+ cv.push_back(std::make_shared<ChunkedArray>(arrays[col]));
+ }
+
+ std::shared_ptr<Table> table = Table::Make(sharedPtrSchema, cv);
Review comment:
Thanks! Definitely.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]