ffacs commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1209461269
##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,289 @@ namespace orc {
}
};
+ // Common base for readers whose output batch is a StringVectorBatch.
+ // next() asks the subclass to render one std::string per row into strBuffer
+ // and then packs those strings back to back into the output batch's blob.
+ class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+  public:
+   ConvertToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                      StripeStreams& stripe, bool _throwOnOverflow)
+       : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+   // Delegates to ConvertColumnReader::next first, then fills data/length/blob
+   // of rowBatch (viewed as a StringVectorBatch) for every non-null row.
+   void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+     ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+     // Stage the converted strings; the return value is their combined size.
+     const auto blobSize = convertToStrBuffer(rowBatch, numValues);
+
+     // Concatenate the staged strings into the blob owned by the output batch.
+     auto& stringBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+     stringBatch.blob.resize(blobSize);
+     char* cursor = stringBatch.blob.data();
+     for (uint64_t row = 0; row < numValues; ++row) {
+       if (rowBatch.hasNulls && !rowBatch.notNull[row]) {
+         continue;  // null rows contribute no bytes and keep data/length untouched
+       }
+       const std::string& text = strBuffer[row];
+       ::memcpy(cursor, text.c_str(), text.size());
+       stringBatch.data[row] = cursor;
+       stringBatch.length[row] = static_cast<int32_t>(text.size());
+       cursor += text.size();
+     }
+     strBuffer.clear();
+   }
+
+   // Implemented by subclasses: store the string for each non-null row i in
+   // strBuffer[i] and return the summed size of those strings. next() sizes
+   // the blob with the returned value.
+   virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) = 0;
+
+  protected:
+   // Per-row staging area; emptied at the end of every next() call.
+   std::vector<std::string> strBuffer;
+ };
+
+ // Reader that converts a boolean column into string values.
+ class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+  public:
+   BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                      StripeStreams& stripe, bool _throwOnOverflow)
+       : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+   // Stages the text for every non-null row in strBuffer and returns the
+   // total size of the staged strings.
+   size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+ };
+
+ // Writes "TRUE" or "FALSE" for each non-null row into strBuffer, adapted to
+ // the read type: CHAR resizes both literals to exactly getMaximumLength()
+ // characters (space padded or cut), VARCHAR cuts them to at most
+ // getMaximumLength() characters, any other kind keeps them unchanged.
+ // Returns the total size of the strings written.
+ size_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                               uint64_t numValues) {
+   strBuffer.resize(numValues);
+   const auto& boolBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+
+   // Prepare both literals once; every row then only picks one of them.
+   std::string trueText = "TRUE";
+   std::string falseText = "FALSE";
+   const auto kind = readType.getKind();
+   if (kind == CHAR) {
+     const auto width = readType.getMaximumLength();
+     trueText.resize(width, ' ');
+     falseText.resize(width, ' ');
+   } else if (kind == VARCHAR) {
+     const auto limit = readType.getMaximumLength();
+     if (trueText.size() > limit) {
+       trueText.resize(limit);
+     }
+     if (falseText.size() > limit) {
+       falseText.resize(limit);
+     }
+   }
+
+   size_t totalBytes = 0;
+   for (uint64_t row = 0; row < numValues; ++row) {
+     if (rowBatch.hasNulls && !rowBatch.notNull[row]) {
+       continue;
+     }
+     const std::string& text = boolBatch.data[row] ? trueText : falseText;
+     strBuffer[row] = text;
+     totalBytes += text.size();
+   }
+   return totalBytes;
+ }
+
+ // Reader that converts a numeric column into string values.
+ //   FileTypeBatch - batch type holding the numeric values read from the file.
+ template <typename FileTypeBatch>
+ class NumericToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+  public:
+   NumericToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                      StripeStreams& stripe, bool _throwOnOverflow)
+       : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+   // Stages the text for every non-null row in strBuffer and returns the
+   // total size of the staged strings.
+   size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+ };
+
+ // Renders each non-null value with std::to_string and fits it to the read type:
+ //   STRING  - kept as produced
+ //   VARCHAR - cut to at most getMaximumLength() characters
+ //   other   - cut or space-padded to exactly getMaximumLength() characters
+ // Returns the total size of the strings written to strBuffer.
+ template <typename FileTypeBatch>
+ size_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+     ColumnVectorBatch& rowBatch, uint64_t numValues) {
+   strBuffer.resize(numValues);
+   const auto& numericBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+
+   const auto kind = readType.getKind();
+   const bool unbounded = (kind == STRING);
+   const bool truncateOnly = (kind == VARCHAR);
+   // The length limit is only looked up for the bounded kinds.
+   const uint64_t limit = unbounded ? 0 : readType.getMaximumLength();
+
+   size_t totalBytes = 0;
+   for (uint64_t row = 0; row < numValues; ++row) {
+     if (rowBatch.hasNulls && !rowBatch.notNull[row]) {
+       continue;
+     }
+     std::string& text = strBuffer[row];
+     text = std::to_string(numericBatch.data[row]);
+     if (truncateOnly) {
+       if (text.size() > limit) {
+         text.resize(limit);
+       }
+     } else if (!unbounded) {
+       // resize() both cuts an over-long value and pads a short one with spaces.
+       text.resize(limit, ' ');
+     }
+     totalBytes += text.size();
+   }
+   return totalBytes;
+ }
+ // Reader that converts a numeric column into a decimal batch.
+ //   FileTypeBatch    - batch type holding the values read from the file
+ //   ReadTypeBatch    - decimal batch type that receives the converted values
+ //   isFileTypeDouble - true selects the floating-point conversion path,
+ //                      false the integer one
+ template <typename FileTypeBatch, typename ReadTypeBatch, bool isFileTypeDouble>
+ class NumericToDecimalColumnReader : public ConvertColumnReader {
+  public:
+   NumericToDecimalColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                                bool _throwOnOverflow)
+       : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+     precision = static_cast<int32_t>(readType.getPrecision());
+     scale = static_cast<int32_t>(readType.getScale());
+   }
+
+   // Delegates to ConvertColumnReader::next, stamps the target precision and
+   // scale on the output batch and converts every non-null row.
+   void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+     ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+     const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+     auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+     dstBatch.precision = precision;
+     dstBatch.scale = scale;
+     for (uint64_t i = 0; i < numValues; ++i) {
+       if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+         if constexpr (isFileTypeDouble) {
+           convertDoubleToDecimal(dstBatch, i, srcBatch.data[i]);
+         } else {
+           convertIntegerToDecimal(dstBatch, i, srcBatch.data[i]);
+         }
+       }
+     }
+   }
+
+  private:
+   // Stores the outcome of convertDecimal() in row idx. result.first flags an
+   // overflow; result.second carries the converted value. For a
+   // Decimal64VectorBatch the value must additionally fit into a long.
+   // Overflows are reported through handleOverflow (defined elsewhere).
+   template <typename srcType, typename ConvertResult>
+   void storeConvertedDecimal(ReadTypeBatch& dstBatch, uint64_t idx, ConvertResult result) {
+     if (result.first) {
+       handleOverflow<srcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
+       return;
+     }
+     if constexpr (std::is_same<ReadTypeBatch, Decimal64VectorBatch>::value) {
+       if (!result.second.fitsInLong()) {
+         handleOverflow<srcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
+       } else {
+         dstBatch.values[idx] = result.second.toLong();
+       }
+     } else {
+       dstBatch.values[idx] = result.second;
+     }
+   }
+
+   // Converts a floating-point value by parsing the text produced by
+   // std::to_string: the digits are accumulated into an Int128 and the number
+   // of digits after '.' becomes the source scale.
+   template <typename srcType>
+   void convertDoubleToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, srcType value) {
+     const std::string strValue = std::to_string(value);
+     // A leading '-' is a sign, not a digit: skip it, keep it out of the
+     // precision count and accumulate negated digits instead.
+     const bool isNegative = !strValue.empty() && strValue[0] == '-';
+     const size_t firstDigit = isNegative ? 1 : 0;
+     const int digitSign = isNegative ? -1 : 1;
+     int32_t fromScale = 0;
+     int32_t fromPrecision = static_cast<int32_t>(strValue.length() - firstDigit);
+     Int128 i128 = 0;
+     for (size_t i = firstDigit; i < strValue.length(); ++i) {
+       const char c = strValue[i];
+       if (c == '.') {
+         fromScale = static_cast<int32_t>(strValue.length() - i - 1);
+         fromPrecision -= 1;
+         continue;
+       }
+       if (c < '0' || c > '9') {
+         // Text such as "nan" or "inf" has no decimal representation; report
+         // it the same way as a value that does not fit the target type.
+         handleOverflow<srcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
+         return;
+       }
+       i128 *= 10;
+       i128 += digitSign * (c - '0');
+     }
+     // NOTE(review): very large magnitudes yield more digits than an Int128
+     // can hold, so the accumulation above can wrap - confirm whether such
+     // inputs should be rejected before calling convertDecimal.
+     auto result = convertDecimal(i128, fromPrecision, fromScale, precision, scale);
+     storeConvertedDecimal<srcType>(dstBatch, idx, result);
+   }
+
+   // Converts an integer value: the source scale is 0 and the source
+   // precision is the number of decimal digits of the value.
+   template <typename srcType>
+   void convertIntegerToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, srcType value) {
+     int fromScale = 0;
+     int fromPrecision = 1;
+     // Count the digits: one more for every division by 10 that leaves a non-zero value.
+     for (srcType tmp = value; tmp /= 10; ++fromPrecision)
+       ;
+     auto result = convertDecimal(value, fromPrecision, fromScale, precision, scale);
+     storeConvertedDecimal<srcType>(dstBatch, idx, result);
+   }
+
+   // Target precision and scale, taken from the read type in the constructor.
+   int32_t precision;
+   int32_t scale;
+ };
+
+ // Common base for readers that convert a column into a timestamp batch.
+ // It records which timezone applies to the read type and whether that
+ // timezone differs from GMT.
+ class ConvertToTimestampColumnReader : public ConvertColumnReader {
+  public:
+   ConvertToTimestampColumnReader(const Type& _readType, const Type& fileType,
+                                  StripeStreams& stripe, bool _throwOnOverflow)
+       : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow),
+         // TIMESTAMP_INSTANT always uses GMT; every other kind uses the
+         // reader timezone supplied by the stripe.
+         readerTimezone(readType.getKind() != TIMESTAMP_INSTANT ? stripe.getReaderTimezone()
+                                                                : getTimezoneByName("GMT")),
+         // Compared by address against the object returned for "GMT".
+         needConvertTimezone(&getTimezoneByName("GMT") != &readerTimezone) {}
+
+  protected:
+   // Timezone selected in the constructor.
+   const orc::Timezone& readerTimezone;
+   // True when readerTimezone is not the GMT timezone object.
+   const bool needConvertTimezone;
+ };
+
+ // Reader that converts a numeric column into a timestamp batch.
+ //   FileTypeBatch - batch type holding the numeric values read from the file.
+ template <typename FileTypeBatch>
+ class NumericToTimestampColumnReader : public ConvertToTimestampColumnReader {
+  public:
+   NumericToTimestampColumnReader(const Type& _readType, const Type& fileType,
+                                  StripeStreams& stripe, bool _throwOnOverflow)
+       : ConvertToTimestampColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+   // Delegates to the base class next(), then converts every non-null row of
+   // the source batch into rowBatch, viewed as a TimestampVectorBatch.
+   void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+     ConvertToTimestampColumnReader::next(rowBatch, numValues, notNull);
+
+     const auto& numericBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+     auto& timestampBatch = *SafeCastBatchTo<TimestampVectorBatch*>(&rowBatch);
+     for (uint64_t row = 0; row < numValues; ++row) {
+       if (rowBatch.hasNulls && !rowBatch.notNull[row]) {
+         continue;
+       }
+       convertToTimestamp(timestampBatch, row, numericBatch.data[row]);
+     }
+   }
+
+  private:
+   // Writes the timestamp for a single value into row idx of dstBatch.
+   template <typename FileType>
+   void convertToTimestamp(TimestampVectorBatch& dstBatch, uint64_t idx, FileType value);
+ };
+
+ template <typename FileTypeBatch>
+ template <typename FileType>
+ void NumericToTimestampColumnReader<FileTypeBatch>::convertToTimestamp(
+ TimestampVectorBatch& dstBatch, uint64_t idx, FileType value) {
+ if constexpr (std::is_floating_point<FileType>::value) {
+ if (value > static_cast<FileType>(std::numeric_limits<int64_t>::max()) ||
+ value < static_cast<FileType>(std::numeric_limits<int64_t>::min())) {
+ handleOverflow<FileType, int64_t>(dstBatch, idx, throwOnOverflow);
+ return;
+ }
+ dstBatch.data[idx] = static_cast<int64_t>(value);
+ dstBatch.nanoseconds[idx] = static_cast<int32_t>(
+ static_cast<double>(value -
static_cast<FileType>(dstBatch.data[idx])) * 1e9);
+ if (dstBatch.nanoseconds[idx] < 0) {
+ dstBatch.data[idx] -= 1;
+ dstBatch.nanoseconds[idx] += static_cast<int32_t>(1e9);
+ }
+ } else {
+ dstBatch.data[idx] = value;
+ dstBatch.nanoseconds[idx] = 0;
+ }
+ if (needConvertTimezone) {
+ dstBatch.data[idx] = readerTimezone.convertFromUTC(dstBatch.data[idx]);
Review Comment:
> Does this follow the Java impl as well?
https://github.com/apache/orc/blob/ec2ea9c6aff8b8515452df651f08695639c18cbb/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java#L1518-L1524
Please take a look at the Java implementation linked above.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]