timestamp

via GitHub Mon, 29 May 2023 09:49:41 -0700


ffacs commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1209461269



##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,289 @@ namespace orc {
     }
   };
 
+  // Common base for readers whose read type is a string variant.
+  // Subclasses render each non-null row into strBuffer via convertToStrBuffer();
+  // next() then packs those strings back to back into the destination blob.
+  class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      // Render the converted values into strBuffer; the return value is the
+      // number of bytes needed to hold all of them.
+      const auto blobSize = convertToStrBuffer(rowBatch, numValues);
+
+      // Concatenate the rendered strings into the blob of the destination batch
+      // and point every non-null row at its slice.
+      auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+      dstBatch.blob.resize(blobSize);
+      char* cursor = dstBatch.blob.data();
+      for (uint64_t row = 0; row < numValues; ++row) {
+        if (rowBatch.hasNulls && !rowBatch.notNull[row]) {
+          continue;  // null rows own no bytes in the blob
+        }
+        const std::string& text = strBuffer[row];
+        const auto len = text.size();
+        ::memcpy(cursor, text.c_str(), len);
+        dstBatch.data[row] = cursor;
+        dstBatch.length[row] = static_cast<int32_t>(len);
+        cursor += len;
+      }
+      strBuffer.clear();
+    }
+
+    // Fills strBuffer with the string form of the first numValues rows and
+    // returns the summed size of the strings written.
+    virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+   protected:
+    // Scratch storage for the converted strings; emptied at the end of next().
+    std::vector<std::string> strBuffer;
+  };
+
+  // Reader that exposes a boolean file column as a string-variant column.
+  // The conversion itself lives in the out-of-class convertToStrBuffer().
+  class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  // Writes "TRUE"/"FALSE" for every non-null row into strBuffer and returns the
+  // total number of characters written. For CHAR the two literals are resized
+  // to the maximum length (space padded); for VARCHAR they are cut to it.
+  size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                uint64_t numValues) {
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+
+    // Prepare both literals once so the row loop only has to pick one.
+    std::string trueText = "TRUE";
+    std::string falseText = "FALSE";
+    const auto kind = readType.getKind();
+    if (kind == CHAR) {
+      trueText.resize(readType.getMaximumLength(), ' ');
+      falseText.resize(readType.getMaximumLength(), ' ');
+    } else if (kind == VARCHAR) {
+      const auto limit = readType.getMaximumLength();
+      if (trueText.size() > limit) {
+        trueText.resize(limit);
+      }
+      if (falseText.size() > limit) {
+        falseText.resize(limit);
+      }
+    }
+
+    // Copy the matching literal into each non-null slot.
+    size_t total = 0;
+    for (uint64_t row = 0; row < numValues; ++row) {
+      if (rowBatch.hasNulls && !rowBatch.notNull[row]) {
+        continue;
+      }
+      const std::string& chosen = srcBatch.data[row] ? trueText : falseText;
+      strBuffer[row] = chosen;
+      total += chosen.size();
+    }
+    return total;
+  }
+
+  // Reader that exposes a numeric file column (stored in FileTypeBatch) as a
+  // string-variant column. The conversion is defined out of class below.
+  template <typename FileTypeBatch>
+  class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+   public:
+    NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                       StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+  };
+
+  // Renders every non-null numeric value with std::to_string into strBuffer and
+  // returns the total number of characters produced.
+  //   STRING  - the text is kept as is
+  //   VARCHAR - the text is cut to the maximum length when longer
+  //   other   - the text is cut or space padded to exactly the maximum length
+  template <typename FileTypeBatch>
+  size_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+      ColumnVectorBatch& rowBatch, uint64_t numValues) {
+    strBuffer.resize(numValues);
+    const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+
+    const auto kind = readType.getKind();
+    const bool unbounded = (kind == STRING);
+    const bool fixedWidth = !unbounded && kind != VARCHAR;
+    // The maximum length is only consulted for the bounded kinds.
+    const uint64_t maxLength = unbounded ? 0 : readType.getMaximumLength();
+
+    size_t total = 0;
+    for (uint64_t row = 0; row < numValues; ++row) {
+      if (rowBatch.hasNulls && !rowBatch.notNull[row]) {
+        continue;
+      }
+      std::string& text = strBuffer[row];
+      text = std::to_string(srcBatch.data[row]);
+      if (fixedWidth) {
+        // resize() both truncates longer text and pads shorter text with spaces
+        text.resize(maxLength, ' ');
+      } else if (!unbounded && text.size() > maxLength) {
+        text.resize(maxLength);
+      }
+      total += text.size();
+    }
+    return total;
+  }
+  // Reads a numeric file column (FileTypeBatch) and exposes it as a decimal
+  // batch (ReadTypeBatch) carrying the precision and scale of the read type.
+  // isFileTypeDouble selects the floating-point conversion path.
+  template <typename FileTypeBatch, typename ReadTypeBatch, bool isFileTypeDouble>
+  class NumericToDecimalColumnReader : public ConvertColumnReader {
+   public:
+    NumericToDecimalColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                                 bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+      precision = static_cast<int32_t>(readType.getPrecision());
+      scale = static_cast<int32_t>(readType.getScale());
+    }
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+      const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+      dstBatch.precision = precision;
+      dstBatch.scale = scale;
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          if constexpr (isFileTypeDouble) {
+            convertDoubleToDecimal(dstBatch, i, srcBatch.data[i]);
+          } else {
+            convertIntegerToDecimal(dstBatch, i, srcBatch.data[i]);
+          }
+        }
+      }
+    }
+
+   private:
+    // Stores the outcome of convertDecimal() at dstBatch.values[idx].
+    // result.first set means the conversion overflowed; a Decimal64 target
+    // additionally overflows when the value does not fit in a long. Both cases
+    // are routed to handleOverflow().
+    template <typename srcType, typename ResultType>
+    void storeDecimal(ReadTypeBatch& dstBatch, uint64_t idx, ResultType result) {
+      if (result.first) {
+        handleOverflow<srcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
+        return;
+      }
+      if constexpr (std::is_same<ReadTypeBatch, Decimal64VectorBatch>::value) {
+        if (!result.second.fitsInLong()) {
+          handleOverflow<srcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
+        } else {
+          dstBatch.values[idx] = result.second.toLong();
+        }
+      } else {
+        dstBatch.values[idx] = result.second;
+      }
+    }
+
+    // Converts a floating-point value by parsing its std::to_string() text
+    // into an unscaled integer plus a digit count and a scale.
+    template <typename srcType>
+    void convertDoubleToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, srcType value) {
+      const std::string strValue = std::to_string(value);
+      // A leading '-' is a sign, not a digit: skip it and accumulate negative
+      // digits so the unscaled value carries the sign.
+      const bool isNegative = !strValue.empty() && strValue[0] == '-';
+      int32_t fromScale = 0;
+      int32_t fromPrecision = 0;
+      Int128 i128 = 0;
+      for (size_t i = isNegative ? 1 : 0; i < strValue.length(); ++i) {
+        const char c = strValue[i];
+        if (c == '.') {
+          fromScale = static_cast<int32_t>(strValue.length() - i - 1);
+          continue;
+        }
+        if (c < '0' || c > '9') {
+          // Text such as "nan" or "inf" has no decimal representation.
+          handleOverflow<srcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
+          return;
+        }
+        const int digit = c - '0';
+        i128 *= 10;
+        i128 += isNegative ? -digit : digit;
+        ++fromPrecision;
+      }
+      // NOTE(review): very large magnitudes produce more digits than Int128 can
+      // hold; confirm that convertDecimal's precision check rejects them.
+      storeDecimal<srcType>(dstBatch, idx,
+                            convertDecimal(i128, fromPrecision, fromScale, precision, scale));
+    }
+
+    // Converts an integer value; its source scale is 0 and its source
+    // precision is the number of decimal digits in the value.
+    template <typename srcType>
+    void convertIntegerToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, srcType value) {
+      int fromScale = 0;
+      int fromPrecision = 1;
+      // Count digits: one more for every division by 10 that leaves a non-zero value.
+      for (srcType tmp = value; tmp /= 10; ++fromPrecision)
+        ;
+      storeDecimal<srcType>(dstBatch, idx,
+                            convertDecimal(value, fromPrecision, fromScale, precision, scale));
+    }
+
+    // Precision and scale of the read (target) decimal type.
+    int32_t precision;
+    int32_t scale;
+  };
+
+  // Base class for conversions whose read type is a timestamp. It picks the
+  // timezone used for the result: GMT when the read type is TIMESTAMP_INSTANT,
+  // otherwise the reader timezone supplied by the stripe.
+  class ConvertToTimestampColumnReader : public ConvertColumnReader {
+   public:
+    ConvertToTimestampColumnReader(const Type& _readType, const Type& fileType,
+                                   StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow),
+          readerTimezone(readType.getKind() != TIMESTAMP_INSTANT ? stripe.getReaderTimezone()
+                                                                 : getTimezoneByName("GMT")),
+          needConvertTimezone(&getTimezoneByName("GMT") != &readerTimezone) {}
+
+   protected:
+    // Timezone the produced timestamps are expressed in.
+    const orc::Timezone& readerTimezone;
+    // True when readerTimezone is not the object returned by getTimezoneByName("GMT").
+    const bool needConvertTimezone;
+  };
+
+  // Reader that exposes a numeric file column (FileTypeBatch) as a timestamp
+  // column, converting each non-null value with convertToTimestamp().
+  template <typename FileTypeBatch>
+  class NumericToTimestampColumnReader : public ConvertToTimestampColumnReader {
+   public:
+    NumericToTimestampColumnReader(const Type& _readType, const Type& fileType,
+                                   StripeStreams& stripe, bool _throwOnOverflow)
+        : ConvertToTimestampColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+      ConvertToTimestampColumnReader::next(rowBatch, numValues, notNull);
+
+      const auto& source = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+      auto& target = *SafeCastBatchTo<TimestampVectorBatch*>(&rowBatch);
+      for (uint64_t row = 0; row < numValues; ++row) {
+        if (rowBatch.hasNulls && !rowBatch.notNull[row]) {
+          continue;  // leave null rows untouched
+        }
+        convertToTimestamp(target, row, source.data[row]);
+      }
+    }
+
+   private:
+    // Writes the timestamp for `value` into row `idx` of dstBatch; defined out of class.
+    template <typename FileType>
+    void convertToTimestamp(TimestampVectorBatch& dstBatch, uint64_t idx, FileType value);
+  };
+
+  template <typename FileTypeBatch>
+  template <typename FileType>
+  void NumericToTimestampColumnReader<FileTypeBatch>::convertToTimestamp(
+      TimestampVectorBatch& dstBatch, uint64_t idx, FileType value) {
+    if constexpr (std::is_floating_point<FileType>::value) {
+      if (value > static_cast<FileType>(std::numeric_limits<int64_t>::max()) ||
+          value < static_cast<FileType>(std::numeric_limits<int64_t>::min())) {
+        handleOverflow<FileType, int64_t>(dstBatch, idx, throwOnOverflow);
+        return;
+      }
+      dstBatch.data[idx] = static_cast<int64_t>(value);
+      dstBatch.nanoseconds[idx] = static_cast<int32_t>(
+          static_cast<double>(value - 
static_cast<FileType>(dstBatch.data[idx])) * 1e9);
+      if (dstBatch.nanoseconds[idx] < 0) {
+        dstBatch.data[idx] -= 1;
+        dstBatch.nanoseconds[idx] += static_cast<int32_t>(1e9);
+      }
+    } else {
+      dstBatch.data[idx] = value;
+      dstBatch.nanoseconds[idx] = 0;
+    }
+    if (needConvertTimezone) {
+      dstBatch.data[idx] = readerTimezone.convertFromUTC(dstBatch.data[idx]);

Review Comment:
   > Does this follow the Java impl as well?
   
   
https://github.com/apache/orc/blob/ec2ea9c6aff8b8515452df651f08695639c18cbb/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java#L1518-L1524
   
   please take a look



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [orc] ffacs commented on a diff in pull request #1500: ORC-1386: [C++] Support schema evolution from numeric to string group/decimal/timestamp

Reply via email to