This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-1.8
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-1.8 by this push:
new 7533a89f7 ORC-1525: Fix bad read in `RleDecoderV2::readByte`
7533a89f7 is described below
commit 7533a89f7d306a3d89b7eee9ed2f05932e67cadd
Author: hoffermei <[email protected]>
AuthorDate: Thu Nov 2 14:14:20 2023 -0700
ORC-1525: Fix bad read in `RleDecoderV2::readByte`
### What changes were proposed in this pull request?
This PR aims to fix #1640 by resetting `BooleanRleEncoderImpl::current` and
`BooleanRleEncoderImpl::bitsRemained` when the encoder's `suppress()` is called.
### Why are the changes needed?
As described in #1640, suppressing a present stream that has no nulls leaves stale data in
BooleanRleEncoderImpl::current and BooleanRleEncoderImpl::bitsRemained, which
will be flushed to the next stripe's present stream if that stripe has some null values.
### How was this patch tested?
I have added a test, testSuppressPresentStreamInPreStripe, which
constructs an ORC file with two stripes: the first stripe has no null values and
the second stripe has some null values. While writing this file, the writer carries
stale data in BooleanRleEncoderImpl for the present stream. The test
checks that the file can be read back and that the values read match the values written.
Closes #1640 .
Closes #1645 from wgtmac/branch-1.8.
Authored-by: hoffermei <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
c++/src/ByteRLE.cc | 8 ++++++
c++/test/TestWriter.cc | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 86 insertions(+)
diff --git a/c++/src/ByteRLE.cc b/c++/src/ByteRLE.cc
index cdaed5586..2512f9041 100644
--- a/c++/src/ByteRLE.cc
+++ b/c++/src/ByteRLE.cc
@@ -248,6 +248,8 @@ namespace orc {
virtual void recordPosition(PositionRecorder* recorder) const override;
+ virtual void suppress() override;
+
private:
int bitsRemained;
char current;
@@ -304,6 +306,12 @@ namespace orc {
recorder->add(static_cast<uint64_t>(8 - bitsRemained));
}
+ void BooleanRleEncoderImpl::suppress() {  // reset this encoder's bit-packing state when the stream is suppressed
+ ByteRleEncoderImpl::suppress();  // delegate to the base-class suppress() first
+ bitsRemained = 8;  // recordPosition() reports 8 - bitsRemained, so 8 restores position 0
+ current = static_cast<char>(0);  // clear the pending byte; per the commit message, stale bits would otherwise be flushed into the next stripe
+ }
+
std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder
(std::unique_ptr<BufferedOutputStream> output)
{
BooleanRleEncoderImpl* encoder =
diff --git a/c++/test/TestWriter.cc b/c++/test/TestWriter.cc
index ee8270805..3554c9047 100644
--- a/c++/test/TestWriter.cc
+++ b/c++/test/TestWriter.cc
@@ -2044,5 +2044,83 @@ namespace orc {
testSuppressPresentStream(CompressionKind_SNAPPY);
}
+ // first stripe has no null value and second stripe has null value.
+ // make sure stripes do not have dirty data in the present streams.
+ TEST_P(WriterTest, testSuppressPresentStreamInPreStripe) {
+ MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
+ MemoryPool* pool = getDefaultPool();
+
+ // rows [1, 998000): not null, value is equal to the 1-based row index
+ // rows [998000, 999000): null
+ // rows [999000, 1000000]: not null, value is equal to the 1-based row index
+ size_t rowCount = 1000000;
+ size_t nullBeginCount = 998000;
+ size_t nullEndCount = 999000;
+ size_t batchSize = 5;
+ {
+ auto type = std::unique_ptr<Type>(Type::buildTypeFromString("struct<col1:int>"));
+ WriterOptions options;
+ options.setStripeSize(16 * 1024)
+ .setCompressionBlockSize(1024)
+ .setCompression(CompressionKind_NONE)
+ .setMemoryPool(pool)
+ .setRowIndexStride(1000);
+
+ auto writer = createWriter(*type, &memStream, options);
+
+ uint64_t batchCount = rowCount / batchSize;
+ size_t rowsWrite = 0;
+ for (uint64_t batchIdx = 0; batchIdx < batchCount; batchIdx++) {
+ auto batch = writer->createRowBatch(batchSize);  // new batch per pass: notNull entries are only cleared below, never set back
+ auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch);
+ auto& longBatch = dynamic_cast<LongVectorBatch&>(*structBatch.fields[0]);
+ structBatch.numElements = batchSize;
+ longBatch.numElements = batchSize;
+ longBatch.hasNulls = false;
+ for (uint64_t row = 0; row < batchSize; ++row) {
+ size_t rowIndex = rowsWrite + row + 1;  // 1-based index of the row across the whole file
+ if (rowIndex < nullBeginCount || rowIndex >= nullEndCount) {
+ longBatch.data[row] = static_cast<int64_t>(rowIndex);
+ } else {
+ longBatch.notNull[row] = 0;
+ longBatch.hasNulls = true;
+ }
+ }
+
+ writer->add(*batch);
+ rowsWrite += batch->numElements;
+ }
+ writer->close();
+ }
+ // read the file back & check that every column value is correct
+ {
+ std::unique_ptr<MemoryInputStream> inStream(new MemoryInputStream(
+ memStream.getData(), memStream.getLength()));
+ ReaderOptions readerOptions;
+ readerOptions.setMemoryPool(*pool);
+ std::unique_ptr<Reader> reader = createReader(std::move(inStream), readerOptions);
+ EXPECT_EQ(reader->getNumberOfStripes(), 2);
+ EXPECT_EQ(rowCount, reader->getNumberOfRows());
+ std::unique_ptr<RowReader> rowReader = createRowReader(reader.get());
+ size_t rowsRead = 0;
+ auto batch = rowReader->createRowBatch(1000);  // allocated once and refilled by every next() call
+ while (rowsRead < rowCount) {
+ ASSERT_TRUE(rowReader->next(*batch));  // ASSERT, not EXPECT: a failed next() may add no rows, so the loop would not terminate
+ auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch);
+ auto& longBatch = dynamic_cast<LongVectorBatch&>(*structBatch.fields[0]);
+ for (size_t i = 0; i < batch->numElements; ++i) {
+ size_t rowIndex = rowsRead + i + 1;  // same 1-based numbering the writer loop used
+ if (rowIndex < nullBeginCount || rowIndex >= nullEndCount) {
+ EXPECT_TRUE(longBatch.notNull[i]);
+ EXPECT_EQ(longBatch.data[i], static_cast<int64_t>(rowIndex));
+ } else {
+ EXPECT_FALSE(longBatch.notNull[i]);
+ }
+ }
+ rowsRead += batch->numElements;
+ }
+ }
+ }
+
INSTANTIATE_TEST_CASE_P(OrcTest, WriterTest, Values(FileVersion::v_0_11(),
FileVersion::v_0_12(), FileVersion::UNSTABLE_PRE_2_0()));
}