mathyingzhou commented on a change in pull request #8648: URL: https://github.com/apache/arrow/pull/8648#discussion_r561405645
########## File path: cpp/src/arrow/adapters/orc/adapter_test.cc ########## @@ -157,4 +225,2478 @@ TEST(TestAdapter, readIntAndStringFileMultipleStripes) { EXPECT_TRUE(stripe_reader->ReadNext(&record_batch).ok()); } } + +// WriteORC tests + +// General +TEST(TestAdapterWriteGeneral, writeZeroRows) { + std::vector<std::shared_ptr<Field>> xFields{field("bool", boolean()), + field("int8", int8()), + field("int16", int16()), + field("int32", int32()), + field("int64", int64()), + field("float", float32()), + field("double", float64()), + field("decimal128nz", decimal(25, 6)), + field("decimal128z", decimal(32, 0)), + field("date32", date32()), + field("ts3", timestamp(TimeUnit::NANO)), + field("string", utf8()), + field("binary", binary())}; + std::shared_ptr<Schema> sharedPtrSchema = std::make_shared<Schema>(xFields); + + int64_t numRows = 0; + int64_t numCols = xFields.size(); + + ArrayBuilderVector builders(numCols, NULLPTR); + builders[0] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<BooleanBuilder>()); + builders[1] = std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int8Builder>()); + builders[2] = std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int16Builder>()); + builders[3] = std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int32Builder>()); + builders[4] = std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int64Builder>()); + builders[5] = std::static_pointer_cast<ArrayBuilder>(std::make_shared<FloatBuilder>()); + builders[6] = std::static_pointer_cast<ArrayBuilder>(std::make_shared<DoubleBuilder>()); + builders[7] = std::static_pointer_cast<ArrayBuilder>( + std::make_shared<Decimal128Builder>(decimal(25, 6))); + builders[8] = std::static_pointer_cast<ArrayBuilder>( + std::make_shared<Decimal128Builder>(decimal(32, 0))); + builders[9] = std::static_pointer_cast<ArrayBuilder>(std::make_shared<Date32Builder>()); + builders[10] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<TimestampBuilder>( + timestamp(TimeUnit::NANO), default_memory_pool())); + builders[11] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<StringBuilder>()); + builders[12] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<BinaryBuilder>()); + ArrayVector arrays(numCols, NULLPTR); + ChunkedArrayVector cv; + cv.reserve(numCols); + + for (int col = 0; col < numCols; col++) { + ARROW_EXPECT_OK(builders[col]->Finish(&arrays[col])); + cv.push_back(std::make_shared<ChunkedArray>(arrays[col])); + } + + std::shared_ptr<Table> table = Table::Make(sharedPtrSchema, cv); + + std::unique_ptr<ORCMemWriter> writer = + std::unique_ptr<ORCMemWriter>(new ORCMemWriter()); + std::unique_ptr<liborc::OutputStream> out_stream = + std::unique_ptr<liborc::OutputStream>(static_cast<liborc::OutputStream*>( + new MemoryOutputStream(DEFAULT_SMALL_MEM_STREAM_SIZE / 16))); + ARROW_EXPECT_OK(writer->Open(sharedPtrSchema, out_stream)); + ARROW_EXPECT_OK(writer->Write(table)); + auto output_mem_stream = static_cast<MemoryOutputStream*>(writer->ReleaseOutStream()); + std::shared_ptr<io::RandomAccessFile> in_stream( + new io::BufferReader(std::make_shared<Buffer>( + reinterpret_cast<const uint8_t*>(output_mem_stream->getData()), + static_cast<int64_t>(output_mem_stream->getLength())))); + + std::unique_ptr<adapters::orc::ORCFileReader> reader; + ASSERT_TRUE( + adapters::orc::ORCFileReader::Open(in_stream, default_memory_pool(), &reader).ok()); + std::shared_ptr<Table> outputTable; + ARROW_EXPECT_OK(reader->Read(&outputTable)); + EXPECT_EQ(outputTable->num_columns(), numCols); + EXPECT_EQ(outputTable->num_rows(), numRows); + EXPECT_TRUE(outputTable->Equals(*table)); +} +TEST(TestAdapterWriteGeneral, writeChunkless) { + std::vector<std::shared_ptr<Field>> xFieldsSub{std::make_shared<Field>("a", utf8()), + std::make_shared<Field>("b", int32())}; + std::vector<std::shared_ptr<Field>> xFields{ + field("bool", boolean()), + field("int8", int8()), + field("int16", int16()), + field("int32", int32()), + field("int64", int64()), + field("float", float32()), + field("double", float64()), + field("decimal128nz", decimal(25, 6)), + field("decimal128z", decimal(32, 0)), + field("date32", date32()), + field("ts3", timestamp(TimeUnit::NANO)), + field("string", utf8()), + field("binary", binary()), + field("struct", struct_(xFieldsSub)), + field("list", list(int32())), + field("lsl", list(struct_({field("lsl0", list(int32()))})))}; + std::shared_ptr<Schema> sharedPtrSchema = std::make_shared<Schema>(xFields); + + int64_t numRows = 0; + int64_t numCols = xFields.size(); + + ChunkedArrayVector cv; + cv.reserve(numCols); + + ArrayMatrix av(numCols, ArrayVector(0, NULLPTR)); + + for (int col = 0; col < numCols; col++) { + cv.push_back(std::make_shared<ChunkedArray>(av[col], xFields[col]->type())); + } + + std::shared_ptr<Table> table = Table::Make(sharedPtrSchema, cv); + + MemoryOutputStream mem_stream(DEFAULT_SMALL_MEM_STREAM_SIZE); + std::unique_ptr<ORCMemWriter> writer = + std::unique_ptr<ORCMemWriter>(new ORCMemWriter()); + std::unique_ptr<liborc::OutputStream> out_stream = + std::unique_ptr<liborc::OutputStream>(static_cast<liborc::OutputStream*>( + new MemoryOutputStream(DEFAULT_SMALL_MEM_STREAM_SIZE / 16))); + ARROW_EXPECT_OK(writer->Open(sharedPtrSchema, out_stream)); + ARROW_EXPECT_OK(writer->Write(table)); + auto output_mem_stream = static_cast<MemoryOutputStream*>(writer->ReleaseOutStream()); + std::shared_ptr<io::RandomAccessFile> in_stream( + new io::BufferReader(std::make_shared<Buffer>( + reinterpret_cast<const uint8_t*>(output_mem_stream->getData()), + static_cast<int64_t>(output_mem_stream->getLength())))); + + std::unique_ptr<adapters::orc::ORCFileReader> reader; + ASSERT_TRUE( + adapters::orc::ORCFileReader::Open(in_stream, default_memory_pool(), &reader).ok()); + std::shared_ptr<Table> outputTable; + ARROW_EXPECT_OK(reader->Read(&outputTable)); + EXPECT_EQ(outputTable->num_columns(), numCols); + EXPECT_EQ(outputTable->num_rows(), numRows); + EXPECT_TRUE(outputTable->Equals(*table)); +} +TEST(TestAdapterWriteGeneral, writeAllNulls) { + std::vector<std::shared_ptr<Field>> xFields{field("bool", boolean()), + field("int8", int8()), + field("int16", int16()), + field("int32", int32()), + field("int64", int64()), + field("decimal128nz", decimal(33, 4)), + field("decimal128z", decimal(35, 0)), + field("date32", date32()), + field("ts3", timestamp(TimeUnit::NANO)), + field("string", utf8()), + field("binary", binary())}; + std::shared_ptr<Schema> sharedPtrSchema = std::make_shared<Schema>(xFields); + + int64_t numRows = 10000; + int64_t numCols = xFields.size(); + + ArrayBuilderMatrix builders(numCols, ArrayBuilderVector(5, NULLPTR)); + + for (int i = 0; i < 5; i++) { + builders[0][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<BooleanBuilder>()); + builders[1][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int8Builder>()); + builders[2][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int16Builder>()); + builders[3][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int32Builder>()); + builders[4][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<Int64Builder>()); + builders[5][i] = std::static_pointer_cast<ArrayBuilder>( + std::make_shared<Decimal128Builder>(decimal(33, 4))); + builders[6][i] = std::static_pointer_cast<ArrayBuilder>( + std::make_shared<Decimal128Builder>(decimal(35, 0))); + builders[7][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<Date32Builder>()); + builders[8][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<TimestampBuilder>( + timestamp(TimeUnit::NANO), default_memory_pool())); + builders[9][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<StringBuilder>()); + builders[10][i] = + std::static_pointer_cast<ArrayBuilder>(std::make_shared<BinaryBuilder>()); + } + + for (int i = 0; i < numRows; i++) { + int chunk = i < (numRows / 2) ? 1 : 3; + for (int col = 0; col < numCols; col++) { + ARROW_EXPECT_OK(builders[col][chunk]->AppendNull()); + } + } + + ArrayMatrix arrays(numCols, ArrayVector(5, NULLPTR)); + ChunkedArrayVector cv; + cv.reserve(numCols); + + for (int col = 0; col < numCols; col++) { + for (int i = 0; i < 5; i++) { + ARROW_EXPECT_OK(builders[col][i]->Finish(&arrays[col][i])); + } + cv.push_back(std::make_shared<ChunkedArray>(arrays[col])); + } + + std::shared_ptr<Table> table = Table::Make(sharedPtrSchema, cv); Review comment: > I didn't see anything rust related in this PR so I removed the Rust label @alamb Thanks! There isn’t anything Parquet-related either. Can that be removed as well? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org