ffacs commented on code in PR #1949:
URL: https://github.com/apache/orc/pull/1949#discussion_r1664221116


##########
c++/src/ConvertColumnReader.cc:
##########
@@ -801,6 +821,196 @@ namespace orc {
     }
   };
 
+  class StringVariantToTimestampColumnReader : public 
ConvertToTimestampColumnReader {
+   public:
+    StringVariantToTimestampColumnReader(const Type& readType, const Type& 
fileType,
+                                         StripeStreams& stripe, bool 
throwOnOverflow)
+        : ConvertToTimestampColumnReader(readType, fileType, stripe, 
throwOnOverflow) {}
+
+    void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) 
override {
+      ConvertToTimestampColumnReader::next(rowBatch, numValues, notNull);
+
+      const auto& srcBatch = *SafeCastBatchTo<const 
StringVectorBatch*>(data.get());
+      auto& dstBatch = *SafeCastBatchTo<TimestampVectorBatch*>(&rowBatch);
+
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+          convertToTimestamp(dstBatch, i, std::string(srcBatch.data[i], 
srcBatch.length[i]));
+        }
+      }
+    }
+
+   private:
+    // Algorithm: http://howardhinnant.github.io/date_algorithms.html
+    int64_t days_from_epoch(int32_t y, int32_t m, int32_t d) {
+      y -= m <= 2;
+      int32_t era = y / 400;
+      int32_t yoe = y - era * 400;                                   // [0, 
399]
+      int32_t doy = (153 * (m + (m > 2 ? -3 : 9)) + 2) / 5 + d - 1;  // [0, 
365]
+      int32_t doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;           // [0, 
146096]
+      return 1ll * era * 146097 + doe - 719468;
+    }
+
+    std::optional<std::pair<int64_t, int64_t>> tryBestToParseFromString(
+        const std::string& timeStr) {
+      // timestamp_instant: yyyy-mm-dd hh:mm:ss[.xxx] timezone
+      // timestamp        : yyyy-mm-dd hh:mm:ss[.xxx]
+      int32_t year, month, day, hour, min, sec, nanos = 0;
+      int32_t matched = std::sscanf(timeStr.c_str(), "%4d-%2d-%2d 
%2d:%2d:%2d.%d", &year, &month,
+                                    &day, &hour, &min, &sec, &nanos);
+      if (matched != 6 && matched != 7) {
+        return std::nullopt;
+      }
+      if (nanos) {
+        if (nanos < 0 || nanos >= 1e9) {
+          return std::nullopt;
+        }
+        while (nanos < static_cast<int64_t>(1e8)) {
+          nanos *= 10;
+        }
+      }
+      int64_t daysSinceEpoch = days_from_epoch(year, month, day);
+      int64_t secondSinceEpoch = 60ll * (60 * (24L * daysSinceEpoch + hour) + 
min) + sec;
+      return std::make_optional(std::pair<int64_t, int64_t>{secondSinceEpoch, 
nanos});
+    }
+
+    void convertToTimestamp(TimestampVectorBatch& dstBatch, uint64_t idx,
+                            const std::string& timeStr) {
+      auto timestamp = tryBestToParseFromString(timeStr);
+      if (!timestamp.has_value()) {
+        handleParseFromStringError(dstBatch, idx, throwOnOverflow, 
"Timestamp", timeStr);
+        return;
+      }
+      auto& [second, nanos] = timestamp.value();
+
+      if (isInstant) {
+        size_t pos = 0;  // get the name of timezone

Review Comment:
   Timestamp is a date and time without a time zone, which does not change 
based on the time zone of the reader. An extra timezone field won't change the 
result. That is, `2014-12-12 6:00:00 any/timezone` equals `2014-12-12 6:00:00`, 
because they are the same wall-clock time.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to