ffacs commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1221855888


##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,327 @@ namespace orc {
     }
   };
 
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& 
fileType,
+                                       StripeStreams& stripe, bool 
_throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) 
override;
+
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t 
numValues) = 0;
+
+   protected:
+    std::vector<std::string> strBuffer;
+  };
+
+  void ConvertToStringVariantColumnReader::next(ColumnVectorBatch& rowBatch, 
uint64_t numValues,
+                                                char* notNull) {
+    ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+    // cache converted string in the buffer
+    auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+    // concatenate string values into the blob buffer of the vector batch
+    auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+    dstBatch.blob.resize(totalLength);
+    char* blob = dstBatch.blob.data();
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        const auto size = strBuffer[i].size();
+        ::memcpy(blob, strBuffer[i].c_str(), size);
+        dstBatch.data[i] = blob;
+        dstBatch.length[i] = static_cast<int32_t>(size);
+        blob += size;
+      }
+    }
+    strBuffer.clear();
+  }
+
+  class BooleanToStringVariantColumnReader : public 
ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& 
fileType,
+                                       StripeStreams& stripe, bool 
_throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, 
_throwOnOverflow) {
+      trueValue = "TRUE";
+      falseValue = "FALSE";
+      if (readType.getKind() != STRING) {
+        if (readType.getMaximumLength() < 5) {
+          throw SchemaEvolutionError("Invalid maximum length for boolean type: " +
+                                     std::to_string(readType.getMaximumLength()));
+        }
+        if (readType.getKind() == CHAR) {
+          trueValue.resize(readType.getMaximumLength(), ' ');
+          falseValue.resize(readType.getMaximumLength(), ' ');
+        }
+      }
+    }
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) 
override;
+
+    private:
+      std::string trueValue;
+      std::string falseValue;
+  };
+
+  size_t 
BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& 
rowBatch,
+                                                                uint64_t 
numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const 
BooleanVectorBatch*>(data.get());
+    // cast the bool value to string and truncate to the max length
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+        strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue);
+        size += strBuffer[i].size();
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch>
+  class NumericToStringVariantColumnReader : public 
ConvertToStringVariantColumnReader {
+   public:
+    NumericToStringVariantColumnReader(const Type& _readType, const Type& 
fileType,
+                                       StripeStreams& stripe, bool 
_throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, 
_throwOnOverflow) {}
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) 
override;
+  };
+
+  template <typename FileTypeBatch>
+  size_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+      ColumnVectorBatch& rowBatch, uint64_t numValues) {
+    size_t size = 0;
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+    if (readType.getKind() == STRING) {
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          size += strBuffer[i].size();
+        }
+      }
+    } else if (readType.getKind() == VARCHAR) {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            handleOverflow<decltype(srcBatch.data[i]), std::string>(rowBatch, 
i, throwOnOverflow);
+          } else {
+            size += strBuffer[i].size();
+          }
+        }
+      }
+    } else {
+      const auto maxLength = readType.getMaximumLength();
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          strBuffer[i] = std::to_string(srcBatch.data[i]);
+          if (strBuffer[i].size() > maxLength) {
+            handleOverflow<decltype(srcBatch.data[i]), std::string>(rowBatch, 
i, throwOnOverflow);
+          } else {
+            strBuffer[i].resize(maxLength, ' ');
+            size += strBuffer[i].size();
+          }
+        }
+      }
+    }
+    return size;
+  }
+
+  template <typename FileTypeBatch, typename ReadTypeBatch, bool 
isFLoatingFileType>
+  class NumericToDecimalColumnReader : public ConvertColumnReader {
+   public:
+    NumericToDecimalColumnReader(const Type& _readType, const Type& fileType, 
StripeStreams& stripe,
+                                 bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      precision = static_cast<int32_t>(readType.getPrecision());
+      scale = static_cast<int32_t>(readType.getScale());
+      scaleMultiplier = 1;
+      bool overflow = false;
+      upperBound = scaleUpInt128ByPowerOfTen(1, precision, overflow);
+      for (int i = 0; i < scale; i++) {
+        scaleMultiplier *= 10;
+      }
+    }
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) 
override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      const auto& srcBatch = *SafeCastBatchTo<const 
FileTypeBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+      dstBatch.precision = precision;
+      dstBatch.scale = scale;
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          if constexpr (isFLoatingFileType) {

Review Comment:
   done



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to