This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new 070c1a5dc ORC-1399: [C++] Fix boolean type with useTightNumericVector 
enabled
070c1a5dc is described below

commit 070c1a5dce3fd7f6b6b8938cc3d872a7b67b1b28
Author: ffacs <[email protected]>
AuthorDate: Mon Mar 27 23:02:12 2023 +0800

    ORC-1399: [C++] Fix boolean type with useTightNumericVector enabled
    
    ### What changes were proposed in this pull request?
    Fix using ByteVectorBatch for BOOLEAN type with useTightNumericVector 
enabled.
    
    ### Why are the changes needed?
    If a ByteVectorBatch is returned, BooleanColumnReader/Writer would crash 
because they only accept LongVectorBatch.
    
    This closes #1449
---
 c++/src/ColumnReader.cc | 33 +++++++++++++++++++++++----------
 c++/src/ColumnWriter.cc | 42 +++++++++++++++++++++++++++++-------------
 c++/src/TypeImpl.cc     |  6 +++++-
 c++/test/TestWriter.cc  | 12 ++++++++++--
 4 files changed, 67 insertions(+), 26 deletions(-)

diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc
index 179f86ca8..2a72c80ee 100644
--- a/c++/src/ColumnReader.cc
+++ b/c++/src/ColumnReader.cc
@@ -133,6 +133,7 @@ namespace orc {
     }
   }
 
+  template <typename BatchType>
   class BooleanColumnReader : public ColumnReader {
    private:
     std::unique_ptr<orc::ByteRleDecoder> rle;
@@ -148,7 +149,8 @@ namespace orc {
     void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& 
positions) override;
   };
 
-  BooleanColumnReader::BooleanColumnReader(const Type& type, StripeStreams& 
stripe)
+  template <typename BatchType>
+  BooleanColumnReader<BatchType>::BooleanColumnReader(const Type& type, 
StripeStreams& stripe)
       : ColumnReader(type, stripe) {
     std::unique_ptr<SeekableInputStream> stream =
         stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
@@ -156,27 +158,33 @@ namespace orc {
     rle = createBooleanRleDecoder(std::move(stream), metrics);
   }
 
-  BooleanColumnReader::~BooleanColumnReader() {
+  template <typename BatchType>
+  BooleanColumnReader<BatchType>::~BooleanColumnReader() {
     // PASS
   }
 
-  uint64_t BooleanColumnReader::skip(uint64_t numValues) {
+  template <typename BatchType>
+  uint64_t BooleanColumnReader<BatchType>::skip(uint64_t numValues) {
     numValues = ColumnReader::skip(numValues);
     rle->skip(numValues);
     return numValues;
   }
 
-  void BooleanColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t 
numValues, char* notNull) {
+  template <typename BatchType>
+  void BooleanColumnReader<BatchType>::next(ColumnVectorBatch& rowBatch, 
uint64_t numValues,
+                                            char* notNull) {
     ColumnReader::next(rowBatch, numValues, notNull);
-    // Since the byte rle places the output in a char* instead of long*,
-    // we cheat here and use the long* and then expand it in a second pass.
-    int64_t* ptr = dynamic_cast<LongVectorBatch&>(rowBatch).data.data();
+    // Since the byte rle places the output in a char* and BatchType here may 
be
+    // LongVectorBatch with long*. We cheat here in that case and use the long*
+    // and then expand it in a second pass..
+    auto* ptr = dynamic_cast<BatchType&>(rowBatch).data.data();
     rle->next(reinterpret_cast<char*>(ptr), numValues,
               rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
     expandBytesToIntegers(ptr, numValues);
   }
 
-  void BooleanColumnReader::seekToRowGroup(
+  template <typename BatchType>
+  void BooleanColumnReader<BatchType>::seekToRowGroup(
       std::unordered_map<uint64_t, PositionProvider>& positions) {
     ColumnReader::seekToRowGroup(positions);
     rle->seek(positions.at(columnId));
@@ -1721,8 +1729,13 @@ namespace orc {
             throw NotImplementedYet("buildReader unhandled string encoding");
         }
 
-      case BOOLEAN:
-        return std::make_unique<BooleanColumnReader>(type, stripe);
+      case BOOLEAN: {
+        if (useTightNumericVector) {
+          return std::make_unique<BooleanColumnReader<ByteVectorBatch>>(type, 
stripe);
+        } else {
+          return std::make_unique<BooleanColumnReader<LongVectorBatch>>(type, 
stripe);
+        }
+      }
 
       case BYTE:
         if (useTightNumericVector) {
diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index 2565cbe1c..0c72e1ccf 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -635,6 +635,7 @@ namespace orc {
     byteRleEncoder->recordPosition(rowIndexPosition.get());
   }
 
+  template <typename BatchType>
   class BooleanColumnWriter : public ColumnWriter {
    public:
     BooleanColumnWriter(const Type& type, const StreamsFactory& factory,
@@ -655,8 +656,10 @@ namespace orc {
     std::unique_ptr<ByteRleEncoder> rleEncoder;
   };
 
-  BooleanColumnWriter::BooleanColumnWriter(const Type& type, const 
StreamsFactory& factory,
-                                           const WriterOptions& options)
+  template <typename BatchType>
+  BooleanColumnWriter<BatchType>::BooleanColumnWriter(const Type& type,
+                                                      const StreamsFactory& 
factory,
+                                                      const WriterOptions& 
options)
       : ColumnWriter(type, factory, options) {
     std::unique_ptr<BufferedOutputStream> dataStream =
         factory.createStream(proto::Stream_Kind_DATA);
@@ -667,11 +670,14 @@ namespace orc {
     }
   }
 
-  void BooleanColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, 
uint64_t numValues,
-                                const char* incomingMask) {
-    LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch);
+  template <typename BatchType>
+  void BooleanColumnWriter<BatchType>::add(ColumnVectorBatch& rowBatch, 
uint64_t offset,
+                                           uint64_t numValues, const char* 
incomingMask) {
+    BatchType* byteBatch = dynamic_cast<BatchType*>(&rowBatch);
     if (byteBatch == nullptr) {
-      throw InvalidArgument("Failed to cast to LongVectorBatch");
+      std::stringstream ss;
+      ss << "Failed to cast to " << typeid(BatchType).name();
+      throw InvalidArgument(ss.str());
     }
     BooleanColumnStatisticsImpl* boolStats =
         dynamic_cast<BooleanColumnStatisticsImpl*>(colIndexStatistics.get());
@@ -681,7 +687,7 @@ namespace orc {
 
     ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
 
-    int64_t* data = byteBatch->data.data() + offset;
+    auto* data = byteBatch->data.data() + offset;
     const char* notNull = byteBatch->hasNulls ? byteBatch->notNull.data() + 
offset : nullptr;
 
     char* byteData = reinterpret_cast<char*>(data);
@@ -706,7 +712,8 @@ namespace orc {
     }
   }
 
-  void BooleanColumnWriter::flush(std::vector<proto::Stream>& streams) {
+  template <typename BatchType>
+  void BooleanColumnWriter<BatchType>::flush(std::vector<proto::Stream>& 
streams) {
     ColumnWriter::flush(streams);
 
     proto::Stream stream;
@@ -716,13 +723,16 @@ namespace orc {
     streams.push_back(stream);
   }
 
-  uint64_t BooleanColumnWriter::getEstimatedSize() const {
+  template <typename BatchType>
+  uint64_t BooleanColumnWriter<BatchType>::getEstimatedSize() const {
     uint64_t size = ColumnWriter::getEstimatedSize();
     size += rleEncoder->getBufferSize();
     return size;
   }
 
-  void 
BooleanColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& 
encodings) const {
+  template <typename BatchType>
+  void BooleanColumnWriter<BatchType>::getColumnEncoding(
+      std::vector<proto::ColumnEncoding>& encodings) const {
     proto::ColumnEncoding encoding;
     encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
     encoding.set_dictionarysize(0);
@@ -732,7 +742,8 @@ namespace orc {
     encodings.push_back(encoding);
   }
 
-  void BooleanColumnWriter::recordPosition() const {
+  template <typename BatchType>
+  void BooleanColumnWriter<BatchType>::recordPosition() const {
     ColumnWriter::recordPosition();
     rleEncoder->recordPosition(rowIndexPosition.get());
   }
@@ -2824,8 +2835,13 @@ namespace orc {
           return std::make_unique<ByteColumnWriter<ByteVectorBatch>>(type, 
factory, options);
         }
         return std::make_unique<ByteColumnWriter<LongVectorBatch>>(type, 
factory, options);
-      case BOOLEAN:
-        return std::make_unique<BooleanColumnWriter>(type, factory, options);
+      case BOOLEAN: {
+        if (options.getUseTightNumericVector()) {
+          return std::make_unique<BooleanColumnWriter<ByteVectorBatch>>(type, 
factory, options);
+        } else {
+          return std::make_unique<BooleanColumnWriter<LongVectorBatch>>(type, 
factory, options);
+        }
+      }
       case DOUBLE:
         return std::make_unique<FloatingColumnWriter<double, 
DoubleVectorBatch>>(type, factory,
                                                                                
  options, false);
diff --git a/c++/src/TypeImpl.cc b/c++/src/TypeImpl.cc
index 4e0f7f268..0075d0478 100644
--- a/c++/src/TypeImpl.cc
+++ b/c++/src/TypeImpl.cc
@@ -286,7 +286,11 @@ namespace orc {
                                                               MemoryPool& 
memoryPool, bool encoded,
                                                               bool 
useTightNumericVector) const {
     switch (static_cast<int64_t>(kind)) {
-      case BOOLEAN:
+      case BOOLEAN: {
+        if (useTightNumericVector) {
+          return std::make_unique<ByteVectorBatch>(capacity, memoryPool);
+        }
+      }
       case BYTE: {
         if (useTightNumericVector) {
           return std::make_unique<ByteVectorBatch>(capacity, memoryPool);
diff --git a/c++/test/TestWriter.cc b/c++/test/TestWriter.cc
index b2bee3ba1..c8c3ca139 100644
--- a/c++/test/TestWriter.cc
+++ b/c++/test/TestWriter.cc
@@ -1897,8 +1897,9 @@ namespace orc {
   TEST_P(WriterTest, testWriteFixedWidthNumericVectorBatch) {
     MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
     MemoryPool* pool = getDefaultPool();
-    std::unique_ptr<Type> type(Type::buildTypeFromString(
-        
"struct<col1:double,col2:float,col3:int,col4:smallint,col5:tinyint,col6:bigint>"));
+    std::unique_ptr<Type> type(
+        
Type::buildTypeFromString("struct<col1:double,col2:float,col3:int,col4:smallint,col5:"
+                                  "tinyint,col6:bigint,col7:boolean>"));
 
     uint64_t stripeSize = 16 * 1024;
     uint64_t compressionBlockSize = 1024;
@@ -1921,6 +1922,7 @@ namespace orc {
     ShortVectorBatch* shortBatch = 
dynamic_cast<ShortVectorBatch*>(structBatch->fields[3]);
     ByteVectorBatch* byteBatch = 
dynamic_cast<ByteVectorBatch*>(structBatch->fields[4]);
     LongVectorBatch* longBatch = 
dynamic_cast<LongVectorBatch*>(structBatch->fields[5]);
+    ByteVectorBatch* boolBatch = 
dynamic_cast<ByteVectorBatch*>(structBatch->fields[6]);
     structBatch->resize(rowCount);
     doubleBatch->resize(rowCount);
     floatBatch->resize(rowCount);
@@ -1928,6 +1930,7 @@ namespace orc {
     shortBatch->resize(rowCount);
     byteBatch->resize(rowCount);
     longBatch->resize(rowCount);
+    boolBatch->resize(rowCount);
 
     for (uint64_t i = 0; i < rowCount; ++i) {
       structBatch->notNull[i] = 1;
@@ -1937,6 +1940,7 @@ namespace orc {
       shortBatch->notNull[i] = 1;
       byteBatch->notNull[i] = 1;
       longBatch->notNull[i] = 1;
+      boolBatch->notNull[i] = 1;
 
       doubleBatch->data[i] = data[i];
       floatBatch->data[i] = static_cast<float>(data[i]);
@@ -1944,6 +1948,7 @@ namespace orc {
       shortBatch->data[i] = static_cast<int16_t>(i);
       byteBatch->data[i] = static_cast<int8_t>(i);
       longBatch->data[i] = static_cast<int64_t>(i);
+      boolBatch->data[i] = static_cast<bool>((i % 17) % 2);
     }
 
     structBatch->numElements = rowCount;
@@ -1953,6 +1958,7 @@ namespace orc {
     shortBatch->numElements = rowCount;
     byteBatch->numElements = rowCount;
     longBatch->numElements = rowCount;
+    boolBatch->numElements = rowCount;
 
     writer->add(*batch);
     writer->close();
@@ -1974,6 +1980,7 @@ namespace orc {
     shortBatch = dynamic_cast<ShortVectorBatch*>(structBatch->fields[3]);
     byteBatch = dynamic_cast<ByteVectorBatch*>(structBatch->fields[4]);
     longBatch = dynamic_cast<LongVectorBatch*>(structBatch->fields[5]);
+    boolBatch = dynamic_cast<ByteVectorBatch*>(structBatch->fields[6]);
     for (uint64_t i = 0; i < rowCount; ++i) {
       EXPECT_TRUE(std::abs(data[i] - doubleBatch->data[i]) < 0.000001);
       EXPECT_TRUE(std::abs(static_cast<float>(data[i]) - 
static_cast<float>(floatBatch->data[i])) <
@@ -1982,6 +1989,7 @@ namespace orc {
       EXPECT_EQ(shortBatch->data[i], static_cast<int16_t>(i));
       EXPECT_EQ(byteBatch->data[i], static_cast<int8_t>(i));
       EXPECT_EQ(longBatch->data[i], static_cast<int64_t>(i));
+      EXPECT_EQ(boolBatch->data[i], static_cast<bool>((i % 17) % 2));
     }
     EXPECT_FALSE(rowReader->next(*batch));
   }

Reply via email to