This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new 070c1a5dc ORC-1399: [C++] Fix boolean type with useTightNumericVector
enabled
070c1a5dc is described below
commit 070c1a5dce3fd7f6b6b8938cc3d872a7b67b1b28
Author: ffacs <[email protected]>
AuthorDate: Mon Mar 27 23:02:12 2023 +0800
ORC-1399: [C++] Fix boolean type with useTightNumericVector enabled
### What changes were proposed in this pull request?
Fix using ByteVectorBatch for BOOLEAN type with useTightNumericVector
enabled.
### Why are the changes needed?
If a ByteVectorBatch is returned, BooleanColumnReader/Writer would crash
because they only accept LongVectorBatch.
This closes #1449
---
c++/src/ColumnReader.cc | 33 +++++++++++++++++++++++----------
c++/src/ColumnWriter.cc | 42 +++++++++++++++++++++++++++++-------------
c++/src/TypeImpl.cc | 6 +++++-
c++/test/TestWriter.cc | 12 ++++++++++--
4 files changed, 67 insertions(+), 26 deletions(-)
diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc
index 179f86ca8..2a72c80ee 100644
--- a/c++/src/ColumnReader.cc
+++ b/c++/src/ColumnReader.cc
@@ -133,6 +133,7 @@ namespace orc {
}
}
+ template <typename BatchType>
class BooleanColumnReader : public ColumnReader {
private:
std::unique_ptr<orc::ByteRleDecoder> rle;
@@ -148,7 +149,8 @@ namespace orc {
void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>&
positions) override;
};
- BooleanColumnReader::BooleanColumnReader(const Type& type, StripeStreams&
stripe)
+ template <typename BatchType>
+ BooleanColumnReader<BatchType>::BooleanColumnReader(const Type& type,
StripeStreams& stripe)
: ColumnReader(type, stripe) {
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
@@ -156,27 +158,33 @@ namespace orc {
rle = createBooleanRleDecoder(std::move(stream), metrics);
}
- BooleanColumnReader::~BooleanColumnReader() {
+ template <typename BatchType>
+ BooleanColumnReader<BatchType>::~BooleanColumnReader() {
// PASS
}
- uint64_t BooleanColumnReader::skip(uint64_t numValues) {
+ template <typename BatchType>
+ uint64_t BooleanColumnReader<BatchType>::skip(uint64_t numValues) {
numValues = ColumnReader::skip(numValues);
rle->skip(numValues);
return numValues;
}
- void BooleanColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t
numValues, char* notNull) {
+ template <typename BatchType>
+ void BooleanColumnReader<BatchType>::next(ColumnVectorBatch& rowBatch,
uint64_t numValues,
+ char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
- // Since the byte rle places the output in a char* instead of long*,
- // we cheat here and use the long* and then expand it in a second pass.
- int64_t* ptr = dynamic_cast<LongVectorBatch&>(rowBatch).data.data();
+ // Since the byte rle places the output in a char* and BatchType here may
be
+ // LongVectorBatch with long*. We cheat here in that case and use the long*
+ // and then expand it in a second pass..
+ auto* ptr = dynamic_cast<BatchType&>(rowBatch).data.data();
rle->next(reinterpret_cast<char*>(ptr), numValues,
rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
expandBytesToIntegers(ptr, numValues);
}
- void BooleanColumnReader::seekToRowGroup(
+ template <typename BatchType>
+ void BooleanColumnReader<BatchType>::seekToRowGroup(
std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
rle->seek(positions.at(columnId));
@@ -1721,8 +1729,13 @@ namespace orc {
throw NotImplementedYet("buildReader unhandled string encoding");
}
- case BOOLEAN:
- return std::make_unique<BooleanColumnReader>(type, stripe);
+ case BOOLEAN: {
+ if (useTightNumericVector) {
+ return std::make_unique<BooleanColumnReader<ByteVectorBatch>>(type,
stripe);
+ } else {
+ return std::make_unique<BooleanColumnReader<LongVectorBatch>>(type,
stripe);
+ }
+ }
case BYTE:
if (useTightNumericVector) {
diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index 2565cbe1c..0c72e1ccf 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -635,6 +635,7 @@ namespace orc {
byteRleEncoder->recordPosition(rowIndexPosition.get());
}
+ template <typename BatchType>
class BooleanColumnWriter : public ColumnWriter {
public:
BooleanColumnWriter(const Type& type, const StreamsFactory& factory,
@@ -655,8 +656,10 @@ namespace orc {
std::unique_ptr<ByteRleEncoder> rleEncoder;
};
- BooleanColumnWriter::BooleanColumnWriter(const Type& type, const
StreamsFactory& factory,
- const WriterOptions& options)
+ template <typename BatchType>
+ BooleanColumnWriter<BatchType>::BooleanColumnWriter(const Type& type,
+ const StreamsFactory&
factory,
+ const WriterOptions&
options)
: ColumnWriter(type, factory, options) {
std::unique_ptr<BufferedOutputStream> dataStream =
factory.createStream(proto::Stream_Kind_DATA);
@@ -667,11 +670,14 @@ namespace orc {
}
}
- void BooleanColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset,
uint64_t numValues,
- const char* incomingMask) {
- LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch);
+ template <typename BatchType>
+ void BooleanColumnWriter<BatchType>::add(ColumnVectorBatch& rowBatch,
uint64_t offset,
+ uint64_t numValues, const char*
incomingMask) {
+ BatchType* byteBatch = dynamic_cast<BatchType*>(&rowBatch);
if (byteBatch == nullptr) {
- throw InvalidArgument("Failed to cast to LongVectorBatch");
+ std::stringstream ss;
+ ss << "Failed to cast to " << typeid(BatchType).name();
+ throw InvalidArgument(ss.str());
}
BooleanColumnStatisticsImpl* boolStats =
dynamic_cast<BooleanColumnStatisticsImpl*>(colIndexStatistics.get());
@@ -681,7 +687,7 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
- int64_t* data = byteBatch->data.data() + offset;
+ auto* data = byteBatch->data.data() + offset;
const char* notNull = byteBatch->hasNulls ? byteBatch->notNull.data() +
offset : nullptr;
char* byteData = reinterpret_cast<char*>(data);
@@ -706,7 +712,8 @@ namespace orc {
}
}
- void BooleanColumnWriter::flush(std::vector<proto::Stream>& streams) {
+ template <typename BatchType>
+ void BooleanColumnWriter<BatchType>::flush(std::vector<proto::Stream>&
streams) {
ColumnWriter::flush(streams);
proto::Stream stream;
@@ -716,13 +723,16 @@ namespace orc {
streams.push_back(stream);
}
- uint64_t BooleanColumnWriter::getEstimatedSize() const {
+ template <typename BatchType>
+ uint64_t BooleanColumnWriter<BatchType>::getEstimatedSize() const {
uint64_t size = ColumnWriter::getEstimatedSize();
size += rleEncoder->getBufferSize();
return size;
}
- void
BooleanColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>&
encodings) const {
+ template <typename BatchType>
+ void BooleanColumnWriter<BatchType>::getColumnEncoding(
+ std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
encoding.set_dictionarysize(0);
@@ -732,7 +742,8 @@ namespace orc {
encodings.push_back(encoding);
}
- void BooleanColumnWriter::recordPosition() const {
+ template <typename BatchType>
+ void BooleanColumnWriter<BatchType>::recordPosition() const {
ColumnWriter::recordPosition();
rleEncoder->recordPosition(rowIndexPosition.get());
}
@@ -2824,8 +2835,13 @@ namespace orc {
return std::make_unique<ByteColumnWriter<ByteVectorBatch>>(type,
factory, options);
}
return std::make_unique<ByteColumnWriter<LongVectorBatch>>(type,
factory, options);
- case BOOLEAN:
- return std::make_unique<BooleanColumnWriter>(type, factory, options);
+ case BOOLEAN: {
+ if (options.getUseTightNumericVector()) {
+ return std::make_unique<BooleanColumnWriter<ByteVectorBatch>>(type,
factory, options);
+ } else {
+ return std::make_unique<BooleanColumnWriter<LongVectorBatch>>(type,
factory, options);
+ }
+ }
case DOUBLE:
return std::make_unique<FloatingColumnWriter<double,
DoubleVectorBatch>>(type, factory,
options, false);
diff --git a/c++/src/TypeImpl.cc b/c++/src/TypeImpl.cc
index 4e0f7f268..0075d0478 100644
--- a/c++/src/TypeImpl.cc
+++ b/c++/src/TypeImpl.cc
@@ -286,7 +286,11 @@ namespace orc {
MemoryPool&
memoryPool, bool encoded,
bool
useTightNumericVector) const {
switch (static_cast<int64_t>(kind)) {
- case BOOLEAN:
+ case BOOLEAN: {
+ if (useTightNumericVector) {
+ return std::make_unique<ByteVectorBatch>(capacity, memoryPool);
+ }
+ }
case BYTE: {
if (useTightNumericVector) {
return std::make_unique<ByteVectorBatch>(capacity, memoryPool);
diff --git a/c++/test/TestWriter.cc b/c++/test/TestWriter.cc
index b2bee3ba1..c8c3ca139 100644
--- a/c++/test/TestWriter.cc
+++ b/c++/test/TestWriter.cc
@@ -1897,8 +1897,9 @@ namespace orc {
TEST_P(WriterTest, testWriteFixedWidthNumericVectorBatch) {
MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
MemoryPool* pool = getDefaultPool();
- std::unique_ptr<Type> type(Type::buildTypeFromString(
-
"struct<col1:double,col2:float,col3:int,col4:smallint,col5:tinyint,col6:bigint>"));
+ std::unique_ptr<Type> type(
+
Type::buildTypeFromString("struct<col1:double,col2:float,col3:int,col4:smallint,col5:"
+ "tinyint,col6:bigint,col7:boolean>"));
uint64_t stripeSize = 16 * 1024;
uint64_t compressionBlockSize = 1024;
@@ -1921,6 +1922,7 @@ namespace orc {
ShortVectorBatch* shortBatch =
dynamic_cast<ShortVectorBatch*>(structBatch->fields[3]);
ByteVectorBatch* byteBatch =
dynamic_cast<ByteVectorBatch*>(structBatch->fields[4]);
LongVectorBatch* longBatch =
dynamic_cast<LongVectorBatch*>(structBatch->fields[5]);
+ ByteVectorBatch* boolBatch =
dynamic_cast<ByteVectorBatch*>(structBatch->fields[6]);
structBatch->resize(rowCount);
doubleBatch->resize(rowCount);
floatBatch->resize(rowCount);
@@ -1928,6 +1930,7 @@ namespace orc {
shortBatch->resize(rowCount);
byteBatch->resize(rowCount);
longBatch->resize(rowCount);
+ boolBatch->resize(rowCount);
for (uint64_t i = 0; i < rowCount; ++i) {
structBatch->notNull[i] = 1;
@@ -1937,6 +1940,7 @@ namespace orc {
shortBatch->notNull[i] = 1;
byteBatch->notNull[i] = 1;
longBatch->notNull[i] = 1;
+ boolBatch->notNull[i] = 1;
doubleBatch->data[i] = data[i];
floatBatch->data[i] = static_cast<float>(data[i]);
@@ -1944,6 +1948,7 @@ namespace orc {
shortBatch->data[i] = static_cast<int16_t>(i);
byteBatch->data[i] = static_cast<int8_t>(i);
longBatch->data[i] = static_cast<int64_t>(i);
+ boolBatch->data[i] = static_cast<bool>((i % 17) % 2);
}
structBatch->numElements = rowCount;
@@ -1953,6 +1958,7 @@ namespace orc {
shortBatch->numElements = rowCount;
byteBatch->numElements = rowCount;
longBatch->numElements = rowCount;
+ boolBatch->numElements = rowCount;
writer->add(*batch);
writer->close();
@@ -1974,6 +1980,7 @@ namespace orc {
shortBatch = dynamic_cast<ShortVectorBatch*>(structBatch->fields[3]);
byteBatch = dynamic_cast<ByteVectorBatch*>(structBatch->fields[4]);
longBatch = dynamic_cast<LongVectorBatch*>(structBatch->fields[5]);
+ boolBatch = dynamic_cast<ByteVectorBatch*>(structBatch->fields[6]);
for (uint64_t i = 0; i < rowCount; ++i) {
EXPECT_TRUE(std::abs(data[i] - doubleBatch->data[i]) < 0.000001);
EXPECT_TRUE(std::abs(static_cast<float>(data[i]) -
static_cast<float>(floatBatch->data[i])) <
@@ -1982,6 +1989,7 @@ namespace orc {
EXPECT_EQ(shortBatch->data[i], static_cast<int16_t>(i));
EXPECT_EQ(byteBatch->data[i], static_cast<int8_t>(i));
EXPECT_EQ(longBatch->data[i], static_cast<int64_t>(i));
+ EXPECT_EQ(boolBatch->data[i], static_cast<bool>((i % 17) % 2));
}
EXPECT_FALSE(rowReader->next(*batch));
}