Repository: orc Updated Branches: refs/heads/master 533e0c4bc -> f31c80bd8
http://git-wip-us.apache.org/repos/asf/orc/blob/f31c80bd/c++/test/TestRleEncoder.cc ---------------------------------------------------------------------- diff --git a/c++/test/TestRleEncoder.cc b/c++/test/TestRleEncoder.cc new file mode 100644 index 0000000..74ce86d --- /dev/null +++ b/c++/test/TestRleEncoder.cc @@ -0,0 +1,286 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cstdlib> + +#include "MemoryOutputStream.hh" +#include "RLEv1.hh" + +#include "wrap/orc-proto-wrapper.hh" +#include "wrap/gtest-wrapper.h" + +#ifdef __clang__ + DIAGNOSTIC_IGNORE("-Wmissing-variable-declarations") +#endif + +namespace orc { + + using ::testing::TestWithParam; + using ::testing::Values; + + const int DEFAULT_MEM_STREAM_SIZE = 1024 * 1024; // 1M + + + class RleTest : public TestWithParam<bool> { + virtual void SetUp(); + + protected: + bool alignBitpacking = true; + std::unique_ptr<RleEncoder> getEncoder(RleVersion version, + MemoryOutputStream& memStream, + bool isSigned); + + void runExampleTest(int64_t* inputData, uint64_t inputLength, + unsigned char* expectedOutput, uint64_t outputLength); + + void runTest(RleVersion version, + uint64_t numValues, + int64_t start, + int64_t delta, + bool random, + bool isSigned, + uint64_t numNulls = 0); + }; + + void RleTest::SetUp() { + alignBitpacking = GetParam(); + } + + void generateData( + uint64_t numValues, + int64_t start, + int64_t delta, + bool random, + int64_t* data, + uint64_t numNulls = 0, + char* notNull = nullptr) { + if (numNulls != 0 && notNull != nullptr) { + memset(notNull, 1, numValues); + while (numNulls > 0) { + uint64_t pos = static_cast<uint64_t>(std::rand()) % numValues; + if (notNull[pos]) { + notNull[pos] = static_cast<char>(0); + --numNulls; + } + } + } + + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull == nullptr || notNull[i]) + { + if (!random) { + data[i] = start + delta * static_cast<int64_t>(i); + } else { + data[i] = std::rand(); + } + } + } + } + + void decodeAndVerify( + RleVersion version, + const MemoryOutputStream& memStream, + int64_t * data, + uint64_t numValues, + const char* notNull, + bool isSinged) { + std::unique_ptr<RleDecoder> decoder = createRleDecoder( + std::unique_ptr<SeekableArrayInputStream>(new SeekableArrayInputStream( + memStream.getData(), + memStream.getLength())), + isSinged, version, *getDefaultPool()); + + int64_t* decodedData = new int64_t[numValues]; + decoder->next(decodedData, numValues, notNull); + + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + EXPECT_EQ(data[i], decodedData[i]); + } + } + + delete [] decodedData; + } + + std::unique_ptr<RleEncoder> RleTest::getEncoder(RleVersion version, + MemoryOutputStream& memStream, + bool isSigned) + { + MemoryPool * pool = getDefaultPool(); + + return createRleEncoder( + std::unique_ptr<BufferedOutputStream>( + new BufferedOutputStream(*pool, &memStream, 500 * 1024, 1024)), + isSigned, version, *pool, alignBitpacking); + } + + void RleTest::runExampleTest(int64_t* inputData, + uint64_t inputLength, + unsigned char* expectedOutput, + uint64_t outputLength) { + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + + std::unique_ptr<RleEncoder> encoder = getEncoder(RleVersion_2, memStream, false); + + encoder->add(inputData, inputLength, nullptr); + encoder->flush(); + const char* output = memStream.getData(); + for(int i = 0; i < outputLength; i++) { + EXPECT_EQ(expectedOutput[i], static_cast<unsigned char>(output[i])); + } + } + + void RleTest::runTest(RleVersion version, + uint64_t numValues, + int64_t start, + int64_t delta, + bool random, + bool isSigned, + uint64_t numNulls) { + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + + std::unique_ptr<RleEncoder> encoder = getEncoder(version, memStream, isSigned); + + char* notNull = numNulls == 0 ? nullptr : new char[numValues]; + int64_t* data = new int64_t[numValues]; + generateData(numValues, start, delta, random, data, numNulls, notNull); + encoder->add(data, numValues, notNull); + encoder->flush(); + + decodeAndVerify(version, memStream, data, numValues, notNull, isSigned); + delete [] data; + delete [] notNull; + } + + TEST_P(RleTest, RleV1_delta_increasing_sequance_unsigned) { + runTest(RleVersion_1, 1024, 0, 1, false, false); + } + + TEST_P(RleTest, RleV1_delta_increasing_sequance_unsigned_null) { + runTest(RleVersion_1, 1024, 0, 1, false, false, 100); + } + + TEST_P(RleTest, RleV1_delta_decreasing_sequance_unsigned) { + runTest(RleVersion_1, 1024, 5000, -3, false, false); + } + + TEST_P(RleTest, RleV1_delta_decreasing_sequance_signed) { + runTest(RleVersion_1, 1024, 100, -3, false, true); + } + + TEST_P(RleTest, RleV1_delta_decreasing_sequance_signed_null) { + runTest(RleVersion_1, 1024, 100, -3, false, true, 500); + } + + TEST_P(RleTest, rRleV1_andom_sequance_signed) { + runTest(RleVersion_1, 1024, 0, 0, true, true); + } + + TEST_P(RleTest, RleV1_all_null) { + runTest(RleVersion_1, 1024, 100, -3, false, true, 1024); + } + + TEST_P(RleTest, RleV2_delta_increasing_sequance_unsigned) { + runTest(RleVersion_2, 1024, 0, 1, false, false); + } + + TEST_P(RleTest, RleV2_delta_increasing_sequance_unsigned_null) { + runTest(RleVersion_2, 1024, 0, 1, false, false, 100); + } + + TEST_P(RleTest, RleV2_delta_decreasing_sequance_unsigned) { + runTest(RleVersion_2, 1024, 5000, -3, false, false); + } + + TEST_P(RleTest, RleV2_delta_decreasing_sequance_signed) { + runTest(RleVersion_2, 1024, 100, -3, false, true); + } + + TEST_P(RleTest, RleV2_delta_decreasing_sequance_signed_null) { + runTest(RleVersion_2, 1024, 100, -3, false, true, 500); + } + + TEST_P(RleTest, RleV2_random_sequance_signed) { + runTest(RleVersion_2, 1024, 0, 0, true, true); + } + + TEST_P(RleTest, RleV2_all_null) { + runTest(RleVersion_2, 1024, 100, -3, false, true, 1024); + } + + TEST_P(RleTest, RleV2_delta_zero_unsigned) { + runTest(RleVersion_2, 1024, 123, 0, false, false); + } + + TEST_P(RleTest, RleV2_delta_zero_signed) { + runTest(RleVersion_2, 1024, 123, 0, false, true); + } + + TEST_P(RleTest, RleV2_short_repeat) { + runTest(RleVersion_2, 8, 123, 0, false, false); + } + + TEST_P(RleTest, RleV2_short_repeat_example) { + int64_t data[5] = {10000, 10000, 10000, 10000, 10000}; + unsigned char expectedEncoded[3] = {0x0a, 0x27, 0x10}; + runExampleTest(data, 5, expectedEncoded, 3); + } + + TEST_P(RleTest, RleV2_direct_example) { + int64_t data[4] = {23713, 43806, 57005, 48879}; + unsigned char expectedEncoded[10] = {0x5e, 0x03, 0x5c, 0xa1, 0xab, 0x1e, 0xde, 0xad, 0xbe, 0xef}; + runExampleTest(data, 4, expectedEncoded, 10); + } + + TEST_P(RleTest, RleV2_Patched_base_example) { + int64_t data[20] = {2030, 2000, 2020, 1000000, 2040, 2050, 2060, 2070, 2080, + 2090, 2100, 2110, 2120, 2130, 2140, 2150, 2160, 2170, 2180, 2190}; + unsigned char expectedEncoded[28] = {0x8e, 0x13, 0x2b, 0x21, 0x07, 0xd0, 0x1e, 0x00, 0x14, + 0x70, 0x28, 0x32, 0x3c, 0x46, 0x50, 0x5a, 0x64, 0x6e, 0x78, 0x82, 0x8c, + 0x96, 0xa0, 0xaa, 0xb4, 0xbe, 0xfc, 0xe8}; + runExampleTest(data, 20, expectedEncoded, 28); + } + + TEST_P(RleTest, RleV2_delta_example) { + int64_t data[10] = {2, 3, 5, 7, 11, 13, 17, 19, 23, 29}; + unsigned char expectedEncoded[8] = {0xc6, 0x09, 0x02, 0x02, 0x22, 0x42, 0x42, 0x46}; + unsigned char unalignedExpectedEncoded[7] = {0xc4, 0x09, 0x02, 0x02, 0x4A, 0x28, 0xA6}; + if (alignBitpacking) + { + runExampleTest(data, 10, expectedEncoded, 8); + } + else + { + runExampleTest(data, 10, unalignedExpectedEncoded, 7); + } + } + + TEST_P(RleTest, RleV2_delta_example2) { + int64_t data[7] = {0, 10000, 10001, 10001, 10002, 10003, 10003}; + unsigned char expectedEncoded[8] = {0xc2, 0x06, 0x0, 0xa0, 0x9c, 0x01, 0x45, 0x0}; + runExampleTest(data, 7, expectedEncoded, 8); + } + + TEST_P(RleTest, RleV2_direct_repeat_example2) { + int64_t data[9] = {23713, 43806, 57005, 48879, 10000, 10000, 10000, 10000, 10000}; + unsigned char expectedEncoded[13] = {0x5e, 0x03, 0x5c, 0xa1, 0xab, 0x1e, 0xde, 0xad, 0xbe, 0xef, 0x0a, 0x27, 0x10}; + runExampleTest(data, 9, expectedEncoded, 13); + } + + INSTANTIATE_TEST_CASE_P(OrcTest, RleTest, Values(true, false)); +} http://git-wip-us.apache.org/repos/asf/orc/blob/f31c80bd/c++/test/TestWriter.cc ---------------------------------------------------------------------- diff --git a/c++/test/TestWriter.cc b/c++/test/TestWriter.cc index 627c9c4..c3788d2 100644 --- a/c++/test/TestWriter.cc +++ b/c++/test/TestWriter.cc @@ -29,8 +29,15 @@ #include <ctime> #include <sstream> +#ifdef __clang__ + DIAGNOSTIC_IGNORE("-Wmissing-variable-declarations") +#endif + namespace orc { + using ::testing::TestWithParam; + using ::testing::Values; + const int DEFAULT_MEM_STREAM_SIZE = 100 * 1024 * 1024; // 100M std::unique_ptr<Writer> createWriter( @@ -40,7 +47,7 @@ namespace orc { const Type& type, MemoryPool* memoryPool, OutputStream* stream, - FileVersion version = FileVersion(0, 11)){ + FileVersion version){ WriterOptions options; options.setStripeSize(stripeSize); options.setCompressionBlockSize(compresionblockSize); @@ -64,7 +71,21 @@ namespace orc { return reader->createRowReader(rowReaderOpts); } - TEST(Writer, writeEmptyFile) { + class WriterTest : public TestWithParam<FileVersion> { + // You can implement all the usual fixture class members here. + // To access the test parameter, call GetParam() from class + // TestWithParam<T>. + virtual void SetUp(); + + protected: + FileVersion fileVersion = FileVersion::v_0_11(); + }; + + void WriterTest::SetUp() { + fileVersion = GetParam(); + } + + TEST_P(WriterTest, writeEmptyFile) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool* pool = getDefaultPool(); ORC_UNIQUE_PTR<Type> type(Type::buildTypeFromString("struct<col1:int>")); @@ -78,7 +99,8 @@ namespace orc { CompressionKind_ZLIB, *type, pool, - &memStream); + &memStream, + fileVersion); writer->close(); std::unique_ptr<InputStream> inStream( @@ -87,8 +109,7 @@ namespace orc { pool, std::move(inStream)); std::unique_ptr<RowReader> rowReader = createRowReader(reader.get()); - EXPECT_EQ(FileVersion(0, 11), reader->getFormatVersion()); - EXPECT_EQ("0.11", reader->getFormatVersion().toString()); + EXPECT_EQ(fileVersion, reader->getFormatVersion()); EXPECT_EQ(WriterVersion_ORC_135, reader->getWriterVersion()); EXPECT_EQ(0, reader->getNumberOfRows()); @@ -100,7 +121,7 @@ namespace orc { EXPECT_FALSE(rowReader->next(*batch)); } - TEST(Writer, writeIntFileOneStripe) { + TEST_P(WriterTest, writeIntFileOneStripe) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool* pool = getDefaultPool(); ORC_UNIQUE_PTR<Type> type(Type::buildTypeFromString("struct<col1:int>")); @@ -115,7 +136,7 @@ namespace orc { *type, pool, &memStream, - FileVersion(0, 11)); + fileVersion); std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(1024); StructVectorBatch* structBatch = dynamic_cast<StructVectorBatch *>(batch.get()); @@ -159,7 +180,7 @@ namespace orc { } } - TEST(Writer, writeIntFileMultipleStripes) { + TEST_P(WriterTest, writeIntFileMultipleStripes) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool* pool = getDefaultPool(); ORC_UNIQUE_PTR<Type> type(Type::buildTypeFromString("struct<col1:int>")); @@ -173,7 +194,8 @@ namespace orc { CompressionKind_ZLIB, *type, pool, - &memStream); + &memStream, + fileVersion); std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(65535); StructVectorBatch* structBatch = dynamic_cast<StructVectorBatch*>(batch.get()); @@ -214,7 +236,7 @@ namespace orc { EXPECT_FALSE(rowReader->next(*batch)); } - TEST(Writer, writeStringAndBinaryColumn) { + TEST_P(WriterTest, writeStringAndBinaryColumn) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool * pool = getDefaultPool(); std::unique_ptr<Type> type(Type::buildTypeFromString( @@ -231,7 +253,8 @@ namespace orc { CompressionKind_ZLIB, *type, pool, - &memStream); + &memStream, + fileVersion); std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(65535); StructVectorBatch * structBatch = dynamic_cast<StructVectorBatch *>(batch.get()); @@ -286,7 +309,7 @@ namespace orc { EXPECT_FALSE(rowReader->next(*batch)); } - TEST(Writer, writeFloatAndDoubleColumn) { + TEST_P(WriterTest, writeFloatAndDoubleColumn) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool * pool = getDefaultPool(); std::unique_ptr<Type> type(Type::buildTypeFromString( @@ -306,7 +329,8 @@ namespace orc { CompressionKind_ZLIB, *type, pool, - &memStream); + &memStream, + fileVersion); std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(rowCount); StructVectorBatch * structBatch = dynamic_cast<StructVectorBatch *>(batch.get()); @@ -348,7 +372,7 @@ namespace orc { EXPECT_FALSE(rowReader->next(*batch)); } - TEST(Writer, writeShortIntLong) { + TEST_P(WriterTest, writeShortIntLong) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool * pool = getDefaultPool(); std::unique_ptr<Type> type(Type::buildTypeFromString( @@ -363,7 +387,8 @@ namespace orc { CompressionKind_ZLIB, *type, pool, - &memStream); + &memStream, + fileVersion); std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(rowCount); StructVectorBatch * structBatch = dynamic_cast<StructVectorBatch *>(batch.get()); @@ -407,7 +432,7 @@ namespace orc { } } - TEST(Writer, writeTinyint) { + TEST_P(WriterTest, writeTinyint) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool * pool = getDefaultPool(); std::unique_ptr<Type> type(Type::buildTypeFromString( @@ -422,7 +447,8 @@ namespace orc { CompressionKind_ZLIB, *type, pool, - &memStream); + &memStream, + fileVersion); std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(rowCount); StructVectorBatch * structBatch = dynamic_cast<StructVectorBatch *>(batch.get()); @@ -454,7 +480,7 @@ namespace orc { } } - TEST(Writer, writeBooleanColumn) { + TEST_P(WriterTest, writeBooleanColumn) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool* pool = getDefaultPool(); std::unique_ptr<Type> type(Type::buildTypeFromString("struct<col1:boolean>")); @@ -468,7 +494,8 @@ namespace orc { CompressionKind_ZLIB, *type, pool, - &memStream); + &memStream, + fileVersion); std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(rowCount); StructVectorBatch * structBatch = dynamic_cast<StructVectorBatch *>(batch.get()); @@ -500,7 +527,7 @@ namespace orc { } } - TEST(Writer, writeDate) { + TEST_P(WriterTest, writeDate) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool* pool = getDefaultPool(); std::unique_ptr<Type> type(Type::buildTypeFromString("struct<col1:date>")); @@ -514,7 +541,8 @@ namespace orc { CompressionKind_ZLIB, *type, pool, - &memStream); + &memStream, + fileVersion); std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(rowCount); StructVectorBatch * structBatch = @@ -547,7 +575,7 @@ namespace orc { } } - TEST(Writer, writeTimestamp) { + TEST_P(WriterTest, writeTimestamp) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool* pool = getDefaultPool(); std::unique_ptr<Type> type(Type::buildTypeFromString("struct<col1:timestamp>")); @@ -561,7 +589,8 @@ namespace orc { CompressionKind_ZLIB, *type, pool, - &memStream); + &memStream, + fileVersion); std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(rowCount); StructVectorBatch * structBatch = dynamic_cast<StructVectorBatch *>(batch.get()); @@ -598,7 +627,7 @@ namespace orc { } } - TEST(Writer, writeCharAndVarcharColumn) { + TEST_P(WriterTest, writeCharAndVarcharColumn) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool * pool = getDefaultPool(); std::unique_ptr<Type> type(Type::buildTypeFromString( @@ -616,7 +645,8 @@ namespace orc { CompressionKind_ZLIB, *type, pool, - &memStream); + &memStream, + fileVersion); std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(rowCount); StructVectorBatch * structBatch = @@ -688,7 +718,7 @@ namespace orc { EXPECT_FALSE(rowReader->next(*batch)); } - TEST(Writer, writeDecimal64Column) { + TEST_P(WriterTest, writeDecimal64Column) { const uint64_t maxPrecision = 18; MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool* pool = getDefaultPool(); @@ -704,7 +734,8 @@ namespace orc { CompressionKind_ZLIB, *type, pool, - &memStream); + &memStream, + fileVersion); std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(rowCount); StructVectorBatch * structBatch = dynamic_cast<StructVectorBatch *>(batch.get()); @@ -771,7 +802,7 @@ namespace orc { } } - TEST(Writer, writeDecimal128Column) { + TEST_P(WriterTest, writeDecimal128Column) { const uint64_t maxPrecision = 38; MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool* pool = getDefaultPool(); @@ -787,7 +818,8 @@ namespace orc { CompressionKind_ZLIB, *type, pool, - &memStream); + &memStream, + fileVersion); std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(rowCount); StructVectorBatch * structBatch = dynamic_cast<StructVectorBatch *>(batch.get()); @@ -864,7 +896,7 @@ namespace orc { } } - TEST(Writer, writeListColumn) { + TEST_P(WriterTest, writeListColumn) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool * pool = getDefaultPool(); @@ -882,7 +914,8 @@ namespace orc { CompressionKind_ZLIB, *type, pool, - &memStream); + &memStream, + fileVersion); std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(rowCount * maxListLength); @@ -935,7 +968,7 @@ namespace orc { } } - TEST(Writer, writeMapColumn) { + TEST_P(WriterTest, writeMapColumn) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool * pool = getDefaultPool(); std::unique_ptr<Type> type( @@ -950,7 +983,8 @@ namespace orc { CompressionKind_ZLIB, *type, pool, - &memStream); + &memStream, + fileVersion); std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(rowCount * maxListLength); StructVectorBatch * structBatch = @@ -1027,7 +1061,7 @@ namespace orc { } } - TEST(Writer, writeUnionColumn) { + TEST_P(WriterTest, writeUnionColumn) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool * pool = getDefaultPool(); std::unique_ptr<Type> type(Type::buildTypeFromString( @@ -1042,7 +1076,8 @@ namespace orc { CompressionKind_ZLIB, *type, pool, - &memStream); + &memStream, + fileVersion); std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(rowCount); StructVectorBatch * structBatch = dynamic_cast<StructVectorBatch *>(batch.get()); @@ -1134,4 +1169,6 @@ namespace orc { } } } + + INSTANTIATE_TEST_CASE_P(OrcTest, WriterTest, Values(FileVersion::v_0_11(), FileVersion::v_0_12())); }
