ffacs commented on code in PR #1500:
URL: https://github.com/apache/orc/pull/1500#discussion_r1216879773
##########
c++/src/ConvertColumnReader.cc:
##########
@@ -186,10 +186,289 @@ namespace orc {
}
};
+ class ConvertToStringVariantColumnReader : public ConvertColumnReader {
+ public:
+ ConvertToStringVariantColumnReader(const Type& _readType, const Type&
fileType,
+ StripeStreams& stripe, bool
_throwOnOverflow)
+ : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull)
override {
+ ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+ // cache converted string in the buffer
+ auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+ // contact string values to blob buffer of vector batch
+ auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+ dstBatch.blob.resize(totalLength);
+ char* blob = dstBatch.blob.data();
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+ const auto size = strBuffer[i].size();
+ ::memcpy(blob, strBuffer[i].c_str(), size);
+ dstBatch.data[i] = blob;
+ dstBatch.length[i] = static_cast<int32_t>(size);
+ blob += size;
+ }
+ }
+ strBuffer.clear();
+ }
+
+ virtual size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t
numValues) = 0;
+
+ protected:
+ std::vector<std::string> strBuffer;
+ };
+
+ class BooleanToStringVariantColumnReader : public
ConvertToStringVariantColumnReader {
+ public:
+ BooleanToStringVariantColumnReader(const Type& _readType, const Type&
fileType,
+ StripeStreams& stripe, bool
_throwOnOverflow)
+ : ConvertToStringVariantColumnReader(_readType, fileType, stripe,
_throwOnOverflow) {}
+
+ size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues)
override;
+ };
+
+ size_t
BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch&
rowBatch,
+ uint64_t
numValues) {
+ size_t size = 0;
+ strBuffer.resize(numValues);
+ const auto& srcBatch = *SafeCastBatchTo<const
BooleanVectorBatch*>(data.get());
+ std::string trueValue = "TRUE";
+ std::string falseValue = "FALSE";
+ if (readType.getKind() == CHAR) {
+ trueValue.resize(readType.getMaximumLength(), ' ');
+ falseValue.resize(readType.getMaximumLength(), ' ');
+ } else if (readType.getKind() == VARCHAR) {
+ trueValue = trueValue.substr(0, std::min(static_cast<uint64_t>(4),
readType.getMaximumLength()));
+ falseValue = falseValue.substr(0, std::min(static_cast<uint64_t>(5),
readType.getMaximumLength()));
+ }
+ // cast the bool value to string and truncate to the max length
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+ strBuffer[i] = (srcBatch.data[i] ? trueValue : falseValue);
+ size += strBuffer[i].size();
+ }
+ }
+ return size;
+ }
+
+ template <typename FileTypeBatch>
+ class NumericToStringVariantColumnReader : public
ConvertToStringVariantColumnReader {
+ public:
+ NumericToStringVariantColumnReader(const Type& _readType, const Type&
fileType,
+ StripeStreams& stripe, bool
_throwOnOverflow)
+ : ConvertToStringVariantColumnReader(_readType, fileType, stripe,
_throwOnOverflow) {}
+ size_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues)
override;
+ };
+
+ template <typename FileTypeBatch>
+ size_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+ ColumnVectorBatch& rowBatch, uint64_t numValues) {
+ size_t size = 0;
+ strBuffer.resize(numValues);
+ const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+ if (readType.getKind() == STRING) {
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+ strBuffer[i] = std::to_string(srcBatch.data[i]);
+ size += strBuffer[i].size();
+ }
+ }
+ } else if (readType.getKind() == VARCHAR) {
+ const auto maxLength = readType.getMaximumLength();
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+ strBuffer[i] = std::to_string(srcBatch.data[i]);
+ if (strBuffer[i].size() > maxLength) {
+ strBuffer[i].resize(maxLength);
+ }
+ size += strBuffer[i].size();
+ }
+ }
+ } else {
+ const auto maxLength = readType.getMaximumLength();
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+ strBuffer[i] = std::to_string(srcBatch.data[i]);
+ if (strBuffer[i].size() > maxLength) {
+ strBuffer[i].resize(maxLength);
+ } else {
+ strBuffer[i].resize(maxLength, ' ');
+ }
+ size += strBuffer[i].size();
+ }
+ }
+ }
+ return size;
+ }
+ template <typename FileTypeBatch, typename ReadTypeBatch, bool
isFileTypeDouble>
+ class NumericToDecimalColumnReader : public ConvertColumnReader {
+ public:
+ NumericToDecimalColumnReader(const Type& _readType, const Type& fileType,
StripeStreams& stripe,
+ bool _throwOnOverflow)
+ : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+ precision = static_cast<int32_t>(readType.getPrecision());
+ scale = static_cast<int32_t>(readType.getScale());
+ }
+
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull)
override {
+ ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+ const auto& srcBatch = *SafeCastBatchTo<const
FileTypeBatch*>(data.get());
+ auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+ dstBatch.precision = precision;
+ dstBatch.scale = scale;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+ if constexpr (isFileTypeDouble) {
+ convertDoubleToDecimal(dstBatch, i, srcBatch.data[i]);
+ } else {
+ convertIntegerToDecimal(dstBatch, i, srcBatch.data[i]);
+ }
+ }
+ }
+ }
+
+ private:
+ template <typename srcType>
+ void convertDoubleToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, srcType
value) {
+ std::string strValue = std::to_string(value);
Review Comment:
> String conversion would be slow. Can we simply convert it into an integer
and reuse the integer code path?
The integer portion of a double may be larger than
std::numeric_limits<uint64_t>::max(), so I convert it to Int128 first.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]