This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-1.9
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-1.9 by this push:
new 7994c46b0 ORC-1386: [C++] Support schema evolution from numeric to
string group/decimal/timestamp
7994c46b0 is described below
commit 7994c46b0a073d8b5ec2ccf1fcea0532d334c9f5
Author: ffacs <[email protected]>
AuthorDate: Mon Jun 19 00:01:03 2023 +0800
ORC-1386: [C++] Support schema evolution from numeric to string
group/decimal/timestamp
### What changes were proposed in this pull request?
Support conversion from numeric types to the string group (string/char/varchar), decimal, and timestamp.
### Why are the changes needed?
To support schema evolution in C++.
### How was this patch tested?
Unit tests passed.
Closes #1500 from ffacs/ORC-1386.
Lead-authored-by: ffacs <[email protected]>
Co-authored-by: ffacs <[email protected]>
Signed-off-by: Gang Wu <[email protected]>
(cherry picked from commit b6d595b4c827a1d2834ab0ea2531e811eb4c8fb2)
Signed-off-by: Dongjoon Hyun <[email protected]>
---
c++/include/orc/Int128.hh | 30 ++
c++/src/ConvertColumnReader.cc | 517 +++++++++++++++++++++++++++------
c++/src/Int128.cc | 97 +++++++
c++/src/SchemaEvolution.cc | 18 +-
c++/src/Timezone.cc | 6 +
c++/src/Timezone.hh | 5 +
c++/test/TestConvertColumnReader.cc | 380 +++++++++++++++++++++++-
c++/test/TestInt128.cc | 558 ++++++++++++++++++++++++++++++++++++
c++/test/TestSchemaEvolution.cc | 24 ++
c++/test/TestTimezone.cc | 34 +++
10 files changed, 1582 insertions(+), 87 deletions(-)
diff --git a/c++/include/orc/Int128.hh b/c++/include/orc/Int128.hh
index 52a718237..81cee0538 100644
--- a/c++/include/orc/Int128.hh
+++ b/c++/include/orc/Int128.hh
@@ -366,5 +366,35 @@ namespace orc {
* @return the scaled value
*/
Int128 scaleDownInt128ByPowerOfTen(Int128 value, int32_t power);
+
+ /**
+ * Converts an unscaled decimal value to a different precision/scale.
+ * @param value the unscaled Int128 value to convert
+ * @param fromScale the scale of the input value
+ * @param toPrecision the precision to convert to
+ * @param toScale the scale to convert to
+ * @param round whether to round (half away from zero) or truncate the digits
+ *              that are dropped when the scale is reduced
+ * @return a pair whose first member is true if the conversion overflows, and
+ *         whose second member is the converted value when it does not overflow
+ * @throws std::invalid_argument if the precision/scale arguments are invalid
+ */
+ std::pair<bool, Int128> convertDecimal(Int128 value, int32_t fromScale,
int32_t toPrecision,
+ int32_t toScale, bool round = true);
+
+ /**
+ * Converts a floating-point value to a decimal.
+ * @param value the float or double value to convert
+ * @param precision the precision of the decimal
+ * @param scale the scale of the decimal
+ * @return a pair whose first member is true if the conversion overflows, and
+ *         whose second member is the converted value when it does not overflow
+ */
+ template <typename T>
+ std::enable_if_t<std::is_floating_point_v<T>, std::pair<bool, Int128>>
convertDecimal(
+ T value, int32_t precision, int32_t scale);
+
+ extern template std::pair<bool, Int128> convertDecimal<float>(float value,
int32_t precision,
+ int32_t scale);
+
+ extern template std::pair<bool, Int128> convertDecimal<double>(double value,
int32_t precision,
+ int32_t
scale);
+
} // namespace orc
#endif
diff --git a/c++/src/ConvertColumnReader.cc b/c++/src/ConvertColumnReader.cc
index c929b69f1..6718fa1cd 100644
--- a/c++/src/ConvertColumnReader.cc
+++ b/c++/src/ConvertColumnReader.cc
@@ -186,10 +186,302 @@ namespace orc {
}
};
+ class ConvertToStringVariantColumnReader : public ConvertColumnReader {  // common base of the readers that produce a StringVectorBatch
+ public:
+ ConvertToStringVariantColumnReader(const Type& _readType, const Type&
fileType,
+ StripeStreams& stripe, bool
_throwOnOverflow)
+ : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {}
+
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull)
override;  // calls convertToStrBuffer() and copies the cached strings into the batch blob
+
+ virtual uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t
numValues) = 0;  // fills strBuffer; the returned length is used by next() to size the blob
+
+ protected:
+ std::vector<std::string> strBuffer;  // one converted string per row; cleared at the end of next()
+ };
+
+ void ConvertToStringVariantColumnReader::next(ColumnVectorBatch& rowBatch,
uint64_t numValues,
+ char* notNull) {
+ ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+ // cache the converted strings in strBuffer and get their total length
+ auto totalLength = convertToStrBuffer(rowBatch, numValues);
+
+ // concatenate the string values into the blob buffer of the vector batch
+ auto& dstBatch = *SafeCastBatchTo<StringVectorBatch*>(&rowBatch);
+ dstBatch.blob.resize(totalLength);
+ char* blob = dstBatch.blob.data();
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!rowBatch.hasNulls || rowBatch.notNull[i]) {  // null rows are skipped
+ const auto size = strBuffer[i].size();
+ ::memcpy(blob, strBuffer[i].c_str(), size);
+ dstBatch.data[i] = blob;  // the row points into the shared blob
+ dstBatch.length[i] = static_cast<int32_t>(size);
+ blob += size;  // advance to the start of the next value
+ }
+ }
+ strBuffer.clear();
+ }
+
+ // Reads a boolean column as STRING/CHAR/VARCHAR, producing "TRUE" or "FALSE".
+ // CHAR and VARCHAR read types must allow at least 5 characters; for CHAR both
+ // literals are space-padded to the maximum length once, at construction.
+ class BooleanToStringVariantColumnReader : public ConvertToStringVariantColumnReader {
+  public:
+   BooleanToStringVariantColumnReader(const Type& _readType, const Type& fileType,
+                                      StripeStreams& stripe, bool _throwOnOverflow)
+       : ConvertToStringVariantColumnReader(_readType, fileType, stripe, _throwOnOverflow),
+         trueValue("TRUE"),
+         falseValue("FALSE") {
+     const auto kind = readType.getKind();
+     if (kind != CHAR && kind != VARCHAR) {
+       return;  // no length limit to enforce
+     }
+     const auto maxLength = readType.getMaximumLength();
+     if (maxLength < 5) {
+       throw SchemaEvolutionError("Invalid maximum length for boolean type: " +
+                                  std::to_string(maxLength));
+     }
+     if (kind == CHAR) {
+       trueValue.resize(maxLength, ' ');
+       falseValue.resize(maxLength, ' ');
+     }
+   }
+
+   uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t numValues) override;
+
+  private:
+   std::string trueValue;   // text emitted for true values
+   std::string falseValue;  // text emitted for false values
+ };
+
+ // Stores trueValue/falseValue in strBuffer for every non-null row of the
+ // source boolean batch and returns the total length of the stored strings.
+ uint64_t BooleanToStringVariantColumnReader::convertToStrBuffer(ColumnVectorBatch& rowBatch,
+                                                                 uint64_t numValues) {
+   strBuffer.resize(numValues);
+   const auto& boolBatch = *SafeCastBatchTo<const BooleanVectorBatch*>(data.get());
+   uint64_t totalLength = 0;
+   for (uint64_t row = 0; row < numValues; ++row) {
+     if (rowBatch.hasNulls && !rowBatch.notNull[row]) {
+       continue;  // null row: nothing to convert
+     }
+     const std::string& text = boolBatch.data[row] ? trueValue : falseValue;
+     strBuffer[row] = text;
+     totalLength += text.size();
+   }
+   return totalLength;
+ }
+
+ template <typename FileTypeBatch>
+ class NumericToStringVariantColumnReader : public
ConvertToStringVariantColumnReader {  // FileTypeBatch is the vector batch type the file values are read into
+ public:
+ NumericToStringVariantColumnReader(const Type& _readType, const Type&
fileType,
+ StripeStreams& stripe, bool
_throwOnOverflow)
+ : ConvertToStringVariantColumnReader(_readType, fileType, stripe,
_throwOnOverflow) {}
+ uint64_t convertToStrBuffer(ColumnVectorBatch& rowBatch, uint64_t
numValues) override;  // formats each non-null value with std::to_string
+ };
+
+ // Formats every non-null source value with std::to_string and caches it in
+ // strBuffer. For VARCHAR and CHAR, a value longer than the maximum length is
+ // reported through handleOverflow and not counted; CHAR values are
+ // space-padded to the maximum length. Returns the total length of the counted
+ // strings. Throws SchemaEvolutionError for any other read type.
+ template <typename FileTypeBatch>
+ uint64_t NumericToStringVariantColumnReader<FileTypeBatch>::convertToStrBuffer(
+     ColumnVectorBatch& rowBatch, uint64_t numValues) {
+   strBuffer.resize(numValues);
+   const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+
+   const auto kind = readType.getKind();
+   if (kind != STRING && kind != VARCHAR && kind != CHAR) {
+     throw SchemaEvolutionError("Invalid type for numeric to string conversion: " +
+                                readType.toString());
+   }
+
+   // Only VARCHAR and CHAR carry a length limit.
+   const bool bounded = (kind != STRING);
+   decltype(readType.getMaximumLength()) maxLength = 0;
+   if (bounded) {
+     maxLength = readType.getMaximumLength();
+   }
+
+   uint64_t totalLength = 0;
+   for (uint64_t i = 0; i < numValues; ++i) {
+     if (rowBatch.hasNulls && !rowBatch.notNull[i]) {
+       continue;  // null row
+     }
+     std::string& text = strBuffer[i];
+     text = std::to_string(srcBatch.data[i]);
+     if (bounded && text.size() > maxLength) {
+       handleOverflow<decltype(srcBatch.data[i]), std::string>(rowBatch, i, throwOnOverflow);
+       continue;
+     }
+     if (kind == CHAR) {
+       text.resize(maxLength, ' ');
+     }
+     totalLength += text.size();
+   }
+   return totalLength;
+ }
+
+ // Reads a numeric column (FileTypeBatch) as a decimal column (ReadTypeBatch is
+ // Decimal64VectorBatch or Decimal128VectorBatch). isFloatingFileType selects
+ // the floating-point or the integer overload of convertDecimal(). Values that
+ // do not fit the target decimal are reported through handleOverflow().
+ template <typename FileTypeBatch, typename ReadTypeBatch, bool isFloatingFileType>
+ class NumericToDecimalColumnReader : public ConvertColumnReader {
+  public:
+   NumericToDecimalColumnReader(const Type& _readType, const Type& fileType, StripeStreams& stripe,
+                                bool _throwOnOverflow)
+       : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow) {
+     precision = static_cast<int32_t>(readType.getPrecision());
+     scale = static_cast<int32_t>(readType.getScale());
+   }
+
+   void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override {
+     ConvertColumnReader::next(rowBatch, numValues, notNull);
+
+     const auto& srcBatch = *SafeCastBatchTo<const FileTypeBatch*>(data.get());
+     auto& dstBatch = *SafeCastBatchTo<ReadTypeBatch*>(&rowBatch);
+     dstBatch.precision = precision;
+     dstBatch.scale = scale;
+     for (uint64_t i = 0; i < numValues; ++i) {
+       if (!rowBatch.hasNulls || rowBatch.notNull[i]) {
+         if constexpr (isFloatingFileType) {
+           convertDoubleToDecimal(dstBatch, i, srcBatch.data[i]);
+         } else {
+           convertIntegerToDecimal(dstBatch, i, srcBatch.data[i]);
+         }
+       }
+     }
+   }
+
+  private:
+   // Converts a floating-point value and stores it at row idx.
+   template <typename SrcType>
+   void convertDoubleToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, SrcType value) {
+     storeResult<SrcType>(dstBatch, idx, convertDecimal(value, precision, scale));
+   }
+
+   // Converts an integer value (treated as a decimal of scale 0) and stores it
+   // at row idx.
+   template <typename SrcType>
+   void convertIntegerToDecimal(ReadTypeBatch& dstBatch, uint64_t idx, SrcType value) {
+     const int fromScale = 0;
+     storeResult<SrcType>(dstBatch, idx, convertDecimal(value, fromScale, precision, scale));
+   }
+
+   // Writes a convertDecimal() result into row idx. An overflowing conversion,
+   // or a value that does not fit in 64 bits for a Decimal64VectorBatch, is
+   // handed to handleOverflow() instead of being stored.
+   template <typename SrcType>
+   void storeResult(ReadTypeBatch& dstBatch, uint64_t idx, std::pair<bool, Int128> result) {
+     if (result.first) {
+       handleOverflow<SrcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
+       return;
+     }
+     if constexpr (std::is_same<ReadTypeBatch, Decimal64VectorBatch>::value) {
+       if (!result.second.fitsInLong()) {
+         handleOverflow<SrcType, decltype(dstBatch.values[idx])>(dstBatch, idx, throwOnOverflow);
+       } else {
+         dstBatch.values[idx] = result.second.toLong();
+       }
+     } else {
+       dstBatch.values[idx] = result.second;
+     }
+   }
+
+   int32_t precision;  // precision of the read (target) decimal type
+   int32_t scale;      // scale of the read (target) decimal type
+ };
+
+ class ConvertToTimestampColumnReader : public ConvertColumnReader {  // base of the readers whose read type is TIMESTAMP or TIMESTAMP_INSTANT
+ public:
+ ConvertToTimestampColumnReader(const Type& _readType, const Type& fileType,
+ StripeStreams& stripe, bool
_throwOnOverflow)
+ : ConvertColumnReader(_readType, fileType, stripe, _throwOnOverflow),
+ readerTimezone(readType.getKind() == TIMESTAMP_INSTANT ?
getTimezoneByName("GMT")
+ :
stripe.getReaderTimezone()),
+ needConvertTimezone(&readerTimezone != &getTimezoneByName("GMT")) {}  // compares timezone object addresses, not names
+
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull)
override;
+
+ protected:
+ const orc::Timezone& readerTimezone;  // GMT for TIMESTAMP_INSTANT, otherwise the stripe's reader timezone
+ const bool needConvertTimezone;  // true when readerTimezone is not the GMT timezone object
+ };
+
+ // Defined out of line to avoid emitting the vtable in every translation unit; it only forwards to ConvertColumnReader::next().
+ void ConvertToTimestampColumnReader::next(ColumnVectorBatch& rowBatch,
uint64_t numValues,
+ char* notNull) {
+ ConvertColumnReader::next(rowBatch, numValues, notNull);
+ }
+
+ template <typename FileTypeBatch>
+ class NumericToTimestampColumnReader : public ConvertToTimestampColumnReader
{  // reads a numeric column (FileTypeBatch) into a TimestampVectorBatch
+ public:
+ NumericToTimestampColumnReader(const Type& _readType, const Type& fileType,
+ StripeStreams& stripe, bool
_throwOnOverflow)
+ : ConvertToTimestampColumnReader(_readType, fileType, stripe,
_throwOnOverflow) {}
+
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull)
override {
+ ConvertToTimestampColumnReader::next(rowBatch, numValues, notNull);
+
+ const auto& srcBatch = *SafeCastBatchTo<const
FileTypeBatch*>(data.get());
+ auto& dstBatch = *SafeCastBatchTo<TimestampVectorBatch*>(&rowBatch);
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (!rowBatch.hasNulls || rowBatch.notNull[i]) {  // only non-null rows are converted
+ convertToTimestamp(dstBatch, i, srcBatch.data[i]);
+ }
+ }
+ }
+
+ private:
+ template <typename FileType>
+ void convertToTimestamp(TimestampVectorBatch& dstBatch, uint64_t idx,
FileType value);  // converts one value and writes data[idx] / nanoseconds[idx]
+ };
+
+ // Converts one numeric value to a timestamp at row idx. The integral part
+ // becomes dstBatch.data[idx] and, for floating-point input, the fractional
+ // part times 1e9 becomes dstBatch.nanoseconds[idx] (normalised to be
+ // non-negative). Floating-point values that are NaN or outside the int64_t
+ // range are reported through handleOverflow(). When needConvertTimezone is
+ // set, the result is passed through readerTimezone.convertFromUTC().
+ template <typename FileTypeBatch>
+ template <typename FileType>
+ void NumericToTimestampColumnReader<FileTypeBatch>::convertToTimestamp(
+     TimestampVectorBatch& dstBatch, uint64_t idx, FileType value) {
+   if constexpr (std::is_floating_point<FileType>::value) {
+     // -2^63 is exactly representable, so its negation is exactly 2^63, the
+     // first value that no longer fits in int64_t. Writing the test as a
+     // negated "in range" check also rejects NaN, for which every comparison
+     // is false; casting either to int64_t would be undefined behaviour.
+     constexpr FileType lowerBound = static_cast<FileType>(std::numeric_limits<int64_t>::min());
+     if (!(value >= lowerBound && value < -lowerBound)) {
+       handleOverflow<FileType, int64_t>(dstBatch, idx, throwOnOverflow);
+       return;
+     }
+     dstBatch.data[idx] = static_cast<int64_t>(value);
+     dstBatch.nanoseconds[idx] = static_cast<int32_t>(
+         static_cast<double>(value - static_cast<FileType>(dstBatch.data[idx])) * 1e9);
+     if (dstBatch.nanoseconds[idx] < 0) {
+       // The cast truncates toward zero; borrow one second so that the
+       // nanosecond part is non-negative.
+       dstBatch.data[idx] -= 1;
+       dstBatch.nanoseconds[idx] += static_cast<int32_t>(1e9);
+     }
+   } else {
+     dstBatch.data[idx] = value;
+     dstBatch.nanoseconds[idx] = 0;
+   }
+   if (needConvertTimezone) {
+     dstBatch.data[idx] = readerTimezone.convertFromUTC(dstBatch.data[idx]);
+   }
+ }
+
#define DEFINE_NUMERIC_CONVERT_READER(FROM, TO, TYPE) \
using FROM##To##TO##ColumnReader = \
NumericConvertColumnReader<FROM##VectorBatch, TO##VectorBatch, TYPE>;
+#define DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(FROM, TO) \
+ using FROM##To##TO##ColumnReader =
NumericToStringVariantColumnReader<FROM##VectorBatch>;
+
+#define DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(FROM, IS_FROM_FLOATING)
\
+ using FROM##To##Decimal64##ColumnReader =
\
+ NumericToDecimalColumnReader<FROM##VectorBatch, Decimal64VectorBatch,
IS_FROM_FLOATING>; \
+ using FROM##To##Decimal128##ColumnReader =
\
+ NumericToDecimalColumnReader<FROM##VectorBatch, Decimal128VectorBatch,
IS_FROM_FLOATING>;
+
+#define DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(FROM) \
+ using FROM##ToTimestampColumnReader =
NumericToTimestampColumnReader<FROM##VectorBatch>;
+
DEFINE_NUMERIC_CONVERT_READER(Boolean, Byte, int8_t)
DEFINE_NUMERIC_CONVERT_READER(Boolean, Short, int16_t)
DEFINE_NUMERIC_CONVERT_READER(Boolean, Int, int32_t)
@@ -235,10 +527,64 @@ namespace orc {
DEFINE_NUMERIC_CONVERT_READER(Int, Double, double)
DEFINE_NUMERIC_CONVERT_READER(Long, Double, double)
+ // Numeric to String/Char
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, String)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, String)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, String)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, String)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, String)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, String)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, Char)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, Char)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, Char)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, Char)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, Char)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, Char)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Byte, Varchar)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Short, Varchar)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Int, Varchar)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Long, Varchar)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Float, Varchar)
+ DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER(Double, Varchar)
+ using BooleanToStringColumnReader = BooleanToStringVariantColumnReader;
+ using BooleanToCharColumnReader = BooleanToStringVariantColumnReader;
+ using BooleanToVarcharColumnReader = BooleanToStringVariantColumnReader;
+
+ // Numeric to Decimal
+ DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Boolean, false)
+ DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Byte, false)
+ DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Short, false)
+ DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Int, false)
+ DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Long, false)
+ DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Float, true)
+ DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER(Double, true)
+
+ // Numeric to Timestamp
+ DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Boolean)
+ DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Byte)
+ DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Short)
+ DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Int)
+ DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Long)
+ DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Float)
+ DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER(Double)
+
#define CASE_CREATE_READER(TYPE, CONVERT) \
case TYPE: \
return std::make_unique<CONVERT##ColumnReader>(_readType, fileType,
stripe, throwOnOverflow);
+ const static int32_t MAX_PRECISION_64 = 18;
+
+#define CASE_CREATE_DECIMAL_READER(FROM)
\
+ case DECIMAL: {
\
+ if (_readType.getPrecision() > 0 && _readType.getPrecision() <=
MAX_PRECISION_64) { \
+ return std::make_unique<FROM##ToDecimal64ColumnReader>(_readType,
fileType, stripe, \
+ throwOnOverflow);
\
+ } else {
\
+ return std::make_unique<FROM##ToDecimal128ColumnReader>(_readType,
fileType, stripe, \
+
throwOnOverflow); \
+ }
\
+ }
+
#define CASE_EXCEPTION
\
default:
\
throw SchemaEvolutionError("Cannot convert from " + fileType.toString() +
" to " + \
@@ -257,169 +603,169 @@ namespace orc {
switch (fileType.getKind()) {
case BOOLEAN: {
switch (_readType.getKind()) {
- CASE_CREATE_READER(BYTE, BooleanToByte);
- CASE_CREATE_READER(SHORT, BooleanToShort);
- CASE_CREATE_READER(INT, BooleanToInt);
- CASE_CREATE_READER(LONG, BooleanToLong);
- CASE_CREATE_READER(FLOAT, BooleanToFloat);
- CASE_CREATE_READER(DOUBLE, BooleanToDouble);
+ CASE_CREATE_READER(BYTE, BooleanToByte)
+ CASE_CREATE_READER(SHORT, BooleanToShort)
+ CASE_CREATE_READER(INT, BooleanToInt)
+ CASE_CREATE_READER(LONG, BooleanToLong)
+ CASE_CREATE_READER(FLOAT, BooleanToFloat)
+ CASE_CREATE_READER(DOUBLE, BooleanToDouble)
+ CASE_CREATE_READER(STRING, BooleanToString)
+ CASE_CREATE_READER(CHAR, BooleanToChar)
+ CASE_CREATE_READER(VARCHAR, BooleanToVarchar)
+ CASE_CREATE_DECIMAL_READER(Boolean)
+ CASE_CREATE_READER(TIMESTAMP, BooleanToTimestamp)
+ CASE_CREATE_READER(TIMESTAMP_INSTANT, BooleanToTimestamp)
case BOOLEAN:
- case STRING:
case BINARY:
- case TIMESTAMP:
case LIST:
case MAP:
case STRUCT:
case UNION:
- case DECIMAL:
case DATE:
- case VARCHAR:
- case CHAR:
- case TIMESTAMP_INSTANT:
CASE_EXCEPTION
}
}
case BYTE: {
switch (_readType.getKind()) {
- CASE_CREATE_READER(BOOLEAN, ByteToBoolean);
- CASE_CREATE_READER(SHORT, ByteToShort);
- CASE_CREATE_READER(INT, ByteToInt);
- CASE_CREATE_READER(LONG, ByteToLong);
- CASE_CREATE_READER(FLOAT, ByteToFloat);
- CASE_CREATE_READER(DOUBLE, ByteToDouble);
+ CASE_CREATE_READER(BOOLEAN, ByteToBoolean)
+ CASE_CREATE_READER(SHORT, ByteToShort)
+ CASE_CREATE_READER(INT, ByteToInt)
+ CASE_CREATE_READER(LONG, ByteToLong)
+ CASE_CREATE_READER(FLOAT, ByteToFloat)
+ CASE_CREATE_READER(DOUBLE, ByteToDouble)
+ CASE_CREATE_READER(STRING, ByteToString)
+ CASE_CREATE_READER(CHAR, ByteToChar)
+ CASE_CREATE_READER(VARCHAR, ByteToVarchar)
+ CASE_CREATE_DECIMAL_READER(Byte)
+ CASE_CREATE_READER(TIMESTAMP, ByteToTimestamp)
+ CASE_CREATE_READER(TIMESTAMP_INSTANT, ByteToTimestamp)
case BYTE:
- case STRING:
case BINARY:
- case TIMESTAMP:
case LIST:
case MAP:
case STRUCT:
case UNION:
- case DECIMAL:
case DATE:
- case VARCHAR:
- case CHAR:
- case TIMESTAMP_INSTANT:
CASE_EXCEPTION
}
}
case SHORT: {
switch (_readType.getKind()) {
- CASE_CREATE_READER(BOOLEAN, ShortToBoolean);
- CASE_CREATE_READER(BYTE, ShortToByte);
- CASE_CREATE_READER(INT, ShortToInt);
- CASE_CREATE_READER(LONG, ShortToLong);
- CASE_CREATE_READER(FLOAT, ShortToFloat);
- CASE_CREATE_READER(DOUBLE, ShortToDouble);
+ CASE_CREATE_READER(BOOLEAN, ShortToBoolean)
+ CASE_CREATE_READER(BYTE, ShortToByte)
+ CASE_CREATE_READER(INT, ShortToInt)
+ CASE_CREATE_READER(LONG, ShortToLong)
+ CASE_CREATE_READER(FLOAT, ShortToFloat)
+ CASE_CREATE_READER(DOUBLE, ShortToDouble)
+ CASE_CREATE_READER(STRING, ShortToString)
+ CASE_CREATE_READER(CHAR, ShortToChar)
+ CASE_CREATE_READER(VARCHAR, ShortToVarchar)
+ CASE_CREATE_DECIMAL_READER(Short)
+ CASE_CREATE_READER(TIMESTAMP, ShortToTimestamp)
+ CASE_CREATE_READER(TIMESTAMP_INSTANT, ShortToTimestamp)
case SHORT:
- case STRING:
case BINARY:
- case TIMESTAMP:
case LIST:
case MAP:
case STRUCT:
case UNION:
- case DECIMAL:
case DATE:
- case VARCHAR:
- case CHAR:
- case TIMESTAMP_INSTANT:
CASE_EXCEPTION
}
}
case INT: {
switch (_readType.getKind()) {
- CASE_CREATE_READER(BOOLEAN, IntToBoolean);
- CASE_CREATE_READER(BYTE, IntToByte);
- CASE_CREATE_READER(SHORT, IntToShort);
- CASE_CREATE_READER(LONG, IntToLong);
- CASE_CREATE_READER(FLOAT, IntToFloat);
- CASE_CREATE_READER(DOUBLE, IntToDouble);
+ CASE_CREATE_READER(BOOLEAN, IntToBoolean)
+ CASE_CREATE_READER(BYTE, IntToByte)
+ CASE_CREATE_READER(SHORT, IntToShort)
+ CASE_CREATE_READER(LONG, IntToLong)
+ CASE_CREATE_READER(FLOAT, IntToFloat)
+ CASE_CREATE_READER(DOUBLE, IntToDouble)
+ CASE_CREATE_READER(STRING, IntToString)
+ CASE_CREATE_READER(CHAR, IntToChar)
+ CASE_CREATE_READER(VARCHAR, IntToVarchar)
+ CASE_CREATE_DECIMAL_READER(Int)
+ CASE_CREATE_READER(TIMESTAMP, IntToTimestamp)
+ CASE_CREATE_READER(TIMESTAMP_INSTANT, IntToTimestamp)
case INT:
- case STRING:
case BINARY:
- case TIMESTAMP:
case LIST:
case MAP:
case STRUCT:
case UNION:
- case DECIMAL:
case DATE:
- case VARCHAR:
- case CHAR:
- case TIMESTAMP_INSTANT:
CASE_EXCEPTION
}
}
case LONG: {
switch (_readType.getKind()) {
- CASE_CREATE_READER(BOOLEAN, LongToBoolean);
- CASE_CREATE_READER(BYTE, LongToByte);
- CASE_CREATE_READER(SHORT, LongToShort);
- CASE_CREATE_READER(INT, LongToInt);
- CASE_CREATE_READER(FLOAT, LongToFloat);
- CASE_CREATE_READER(DOUBLE, LongToDouble);
+ CASE_CREATE_READER(BOOLEAN, LongToBoolean)
+ CASE_CREATE_READER(BYTE, LongToByte)
+ CASE_CREATE_READER(SHORT, LongToShort)
+ CASE_CREATE_READER(INT, LongToInt)
+ CASE_CREATE_READER(FLOAT, LongToFloat)
+ CASE_CREATE_READER(DOUBLE, LongToDouble)
+ CASE_CREATE_READER(STRING, LongToString)
+ CASE_CREATE_READER(CHAR, LongToChar)
+ CASE_CREATE_READER(VARCHAR, LongToVarchar)
+ CASE_CREATE_DECIMAL_READER(Long)
+ CASE_CREATE_READER(TIMESTAMP, LongToTimestamp)
+ CASE_CREATE_READER(TIMESTAMP_INSTANT, LongToTimestamp)
case LONG:
- case STRING:
case BINARY:
- case TIMESTAMP:
case LIST:
case MAP:
case STRUCT:
case UNION:
- case DECIMAL:
case DATE:
- case VARCHAR:
- case CHAR:
- case TIMESTAMP_INSTANT:
CASE_EXCEPTION
}
}
case FLOAT: {
switch (_readType.getKind()) {
- CASE_CREATE_READER(BOOLEAN, FloatToBoolean);
- CASE_CREATE_READER(BYTE, FloatToByte);
- CASE_CREATE_READER(SHORT, FloatToShort);
- CASE_CREATE_READER(INT, FloatToInt);
- CASE_CREATE_READER(LONG, FloatToLong);
- CASE_CREATE_READER(DOUBLE, FloatToDouble);
+ CASE_CREATE_READER(BOOLEAN, FloatToBoolean)
+ CASE_CREATE_READER(BYTE, FloatToByte)
+ CASE_CREATE_READER(SHORT, FloatToShort)
+ CASE_CREATE_READER(INT, FloatToInt)
+ CASE_CREATE_READER(LONG, FloatToLong)
+ CASE_CREATE_READER(DOUBLE, FloatToDouble)
+ CASE_CREATE_READER(STRING, FloatToString)
+ CASE_CREATE_READER(CHAR, FloatToChar)
+ CASE_CREATE_READER(VARCHAR, FloatToVarchar)
+ CASE_CREATE_DECIMAL_READER(Float)
+ CASE_CREATE_READER(TIMESTAMP, FloatToTimestamp)
+ CASE_CREATE_READER(TIMESTAMP_INSTANT, FloatToTimestamp)
case FLOAT:
- case STRING:
case BINARY:
- case TIMESTAMP:
case LIST:
case MAP:
case STRUCT:
case UNION:
- case DECIMAL:
case DATE:
- case VARCHAR:
- case CHAR:
- case TIMESTAMP_INSTANT:
CASE_EXCEPTION
}
}
case DOUBLE: {
switch (_readType.getKind()) {
- CASE_CREATE_READER(BOOLEAN, DoubleToBoolean);
- CASE_CREATE_READER(BYTE, DoubleToByte);
- CASE_CREATE_READER(SHORT, DoubleToShort);
- CASE_CREATE_READER(INT, DoubleToInt);
- CASE_CREATE_READER(LONG, DoubleToLong);
- CASE_CREATE_READER(FLOAT, DoubleToFloat);
+ CASE_CREATE_READER(BOOLEAN, DoubleToBoolean)
+ CASE_CREATE_READER(BYTE, DoubleToByte)
+ CASE_CREATE_READER(SHORT, DoubleToShort)
+ CASE_CREATE_READER(INT, DoubleToInt)
+ CASE_CREATE_READER(LONG, DoubleToLong)
+ CASE_CREATE_READER(FLOAT, DoubleToFloat)
+ CASE_CREATE_READER(STRING, DoubleToString)
+ CASE_CREATE_READER(CHAR, DoubleToChar)
+ CASE_CREATE_READER(VARCHAR, DoubleToVarchar)
+ CASE_CREATE_DECIMAL_READER(Double)
+ CASE_CREATE_READER(TIMESTAMP, DoubleToTimestamp)
+ CASE_CREATE_READER(TIMESTAMP_INSTANT, DoubleToTimestamp)
case DOUBLE:
- case STRING:
case BINARY:
- case TIMESTAMP:
case LIST:
case MAP:
case STRUCT:
case UNION:
- case DECIMAL:
case DATE:
- case VARCHAR:
- case CHAR:
- case TIMESTAMP_INSTANT:
CASE_EXCEPTION
}
}
@@ -440,6 +786,9 @@ namespace orc {
}
#undef DEFINE_NUMERIC_CONVERT_READER
+#undef DEFINE_NUMERIC_CONVERT_TO_STRING_VARINT_READER
+#undef DEFINE_NUMERIC_CONVERT_TO_DECIMAL_READER
+#undef DEFINE_NUMERIC_CONVERT_TO_TIMESTAMP_READER
#undef CASE_CREATE_READER
#undef CASE_EXCEPTION
diff --git a/c++/src/Int128.cc b/c++/src/Int128.cc
index 0a36fe669..327930975 100644
--- a/c++/src/Int128.cc
+++ b/c++/src/Int128.cc
@@ -436,6 +436,7 @@ namespace orc {
}
const static int32_t MAX_PRECISION_64 = 18;
+ const static int32_t MAX_PRECISION_128 = 38;
const static int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1] = {1,
10,
100,
@@ -488,4 +489,100 @@ namespace orc {
return value;
}
+ // Converts the unscaled decimal `value` of scale `fromScale` to the given
+ // precision/scale. Returns {overflow, converted}; `converted` is meaningful
+ // only when `overflow` is false. When the scale is reduced, the dropped digits
+ // are rounded half away from zero if `round` is set and truncated otherwise.
+ // Throws std::invalid_argument for an invalid precision/scale combination.
+ std::pair<bool, Int128> convertDecimal(Int128 value, int32_t fromScale, int32_t toPrecision,
+                                        int32_t toScale, bool round) {
+   if (toPrecision > MAX_PRECISION_128 || toPrecision < 1 || toScale < 0 ||
+       toScale > toPrecision || fromScale < 0 ||
+       std::abs(fromScale - toScale) > MAX_PRECISION_128) {
+     std::stringstream buf;
+     buf << "Invalid argument: fromScale=" << fromScale << ", toPrecision=" << toPrecision
+         << ", toScale=" << toScale;
+     throw std::invalid_argument(buf.str());
+   }
+   std::pair<bool, Int128> result;
+   bool negative = value < 0;
+   // Work on the magnitude; the sign is restored at the end.
+   result.second = value.abs();
+   result.first = false;
+
+   // Exclusive bound: a decimal of precision p holds magnitudes below 10^p.
+   Int128 upperBound = scaleUpInt128ByPowerOfTen(1, toPrecision, result.first);
+   int32_t deltaScale = fromScale - toScale;
+
+   if (deltaScale > 0) {
+     // Scale is reduced: drop deltaScale digits, rounding if requested.
+     Int128 scale = scaleUpInt128ByPowerOfTen(1, deltaScale, result.first), remainder;
+     result.second = result.second.divide(scale, remainder);
+     remainder *= 2;
+     if (round && remainder >= scale) {
+       result.second += 1;
+     }
+   } else if (deltaScale < 0) {
+     // Scale is increased: a magnitude already at the bound can only grow, so
+     // bail out before the multiplication.
+     if (result.second >= upperBound) {
+       result.first = true;
+       return result;
+     }
+     result.second = scaleUpInt128ByPowerOfTen(result.second, -deltaScale, result.first);
+   }
+
+   // 10^toPrecision itself needs toPrecision + 1 digits, so it overflows too.
+   if (result.second >= upperBound) {
+     result.first = true;
+     return result;
+   }
+
+   if (negative) {
+     result.second *= -1;
+   }
+   return result;
+ }
+
+ // Converts the floating-point `value` to an unscaled decimal of the given
+ // precision/scale. Returns {overflow, converted}; `overflow` is set for
+ // invalid precision/scale, NaN, magnitudes of 2^127 or more, and values that
+ // need more than `precision` digits. The fraction is rounded to `scale` digits
+ // with std::round.
+ template <typename T>
+ std::enable_if_t<std::is_floating_point_v<T>, std::pair<bool, Int128>> convertDecimal(
+     T value, int32_t precision, int32_t scale) {
+   const static T upperbound = std::ldexp(static_cast<T>(1), 127);
+   const static T lowerbound = -upperbound;
+
+   std::pair<bool, Int128> result = {false, 0};
+   if (precision > MAX_PRECISION_128 || precision < 1 || scale > precision || scale < 0) {
+     result.first = true;
+     return result;
+   }
+
+   if (std::isnan(value) || value <= lowerbound || value >= upperbound) {
+     result.first = true;
+     return result;
+   }
+
+   // Truncates a non-negative magnitude below 2^127 to an Int128. Magnitudes of
+   // 2^64 and above are split into a high and a low 64-bit word.
+   const auto toInt128 = [](T magnitude) {
+     if (magnitude >= std::ldexp(static_cast<T>(1.0), 64)) {
+       int64_t hi = static_cast<int64_t>(std::ldexp(magnitude, -64));
+       uint64_t lo = static_cast<uint64_t>(magnitude - std::ldexp(static_cast<T>(hi), 64));
+       return Int128(hi, lo);
+     }
+     return Int128(0, static_cast<uint64_t>(magnitude));
+   };
+
+   const bool isNegative = (value < 0);
+   value = std::fabs(value);
+   Int128 i128 = toInt128(value);
+   // Keep only the fractional part for the second step.
+   value = value - std::floor(value);
+
+   // Exclusive bound: a decimal of precision p holds magnitudes below 10^p.
+   bool boundOverflow = false;
+   const Int128 precisionBound = scaleUpInt128ByPowerOfTen(1, precision, boundOverflow);
+
+   bool overflow = false;
+   i128 = scaleUpInt128ByPowerOfTen(i128, scale, overflow);
+   if (overflow || i128 >= precisionBound) {
+     result.first = true;
+     return result;
+   }
+
+   // The scaled fraction can be as large as 10^scale, which exceeds int64_t
+   // for scale > 18, so it is converted through toInt128 as well.
+   value = value * static_cast<T>(pow(10, scale));
+   i128 += toInt128(std::round(value));
+   // Rounding the fraction may carry into a new leading digit (e.g. 9.96 at
+   // scale 1 becomes 100), so the precision has to be checked again.
+   if (i128 >= precisionBound) {
+     result.first = true;
+     return result;
+   }
+   if (isNegative) {
+     i128 = i128.negate();
+   }
+   result.second = i128;
+   return result;
+ }
+
+ template std::pair<bool, Int128> convertDecimal(float value, int32_t
precision, int32_t scale);
+
+ template std::pair<bool, Int128> convertDecimal(double value, int32_t
precision, int32_t scale);
+
} // namespace orc
diff --git a/c++/src/SchemaEvolution.cc b/c++/src/SchemaEvolution.cc
index d694a49d4..cb6f0d002 100644
--- a/c++/src/SchemaEvolution.cc
+++ b/c++/src/SchemaEvolution.cc
@@ -55,6 +55,21 @@ namespace orc {
kind == FLOAT || kind == DOUBLE;
}
+ // True when the type is STRING, CHAR or VARCHAR.
+ bool isStringVariant(const Type& type) {
+   const auto kind = type.getKind();
+   if (kind == STRING) {
+     return true;
+   }
+   return kind == CHAR || kind == VARCHAR;
+ }
+
+ // True when the type is DECIMAL.
+ bool isDecimal(const Type& type) {
+   return type.getKind() == DECIMAL;
+ }
+
+ // True when the type is TIMESTAMP or TIMESTAMP_INSTANT.
+ bool isTimestamp(const Type& type) {
+   const auto kind = type.getKind();
+   if (kind == TIMESTAMP) {
+     return true;
+   }
+   return kind == TIMESTAMP_INSTANT;
+ }
+
struct ConversionCheckResult {
bool isValid;
bool needConvert;
@@ -79,7 +94,8 @@ namespace orc {
case LONG:
case FLOAT:
case DOUBLE: {
- ret.isValid = ret.needConvert = isNumeric(readType);
+ ret.isValid = ret.needConvert = isNumeric(readType) ||
isStringVariant(readType) ||
+ isDecimal(readType) ||
isTimestamp(readType);
break;
}
case DECIMAL:
diff --git a/c++/src/Timezone.cc b/c++/src/Timezone.cc
index 5b410a298..6628eba83 100644
--- a/c++/src/Timezone.cc
+++ b/c++/src/Timezone.cc
@@ -587,6 +587,12 @@ namespace orc {
return clk + getVariant(clk).gmtOffset;
}
+ int64_t convertFromUTC(int64_t clk) const override {
+ int64_t adjustedTime = clk - getVariant(clk).gmtOffset;  // first estimate, using the variant looked up at clk
+ const auto& adjustedReader = getVariant(adjustedTime);  // look the variant up again at the adjusted time
+ return clk - adjustedReader.gmtOffset;  // apply the offset of the re-resolved variant to the original clk
+ }
+
private:
void parseTimeVariants(const unsigned char* ptr, uint64_t variantOffset,
uint64_t variantCount,
uint64_t nameOffset, uint64_t nameCount);
diff --git a/c++/src/Timezone.hh b/c++/src/Timezone.hh
index bc32727f3..0716c5a3f 100644
--- a/c++/src/Timezone.hh
+++ b/c++/src/Timezone.hh
@@ -85,6 +85,11 @@ namespace orc {
* Convert wall clock time of current timezone to UTC timezone
*/
virtual int64_t convertToUTC(int64_t clk) const = 0;
+
+ /**
+ * Convert a UTC time to the wall clock time of the current timezone
+ */
+ virtual int64_t convertFromUTC(int64_t clk) const = 0;
};
/**
diff --git a/c++/test/TestConvertColumnReader.cc
b/c++/test/TestConvertColumnReader.cc
index c756845cf..77b000dfb 100644
--- a/c++/test/TestConvertColumnReader.cc
+++ b/c++/test/TestConvertColumnReader.cc
@@ -29,6 +29,8 @@
namespace orc {
+ using BooleanVectorBatch = ByteVectorBatch;
+
static std::unique_ptr<Reader> createReader(MemoryPool& memoryPool,
std::unique_ptr<InputStream>
stream) {
ReaderOptions options;
@@ -37,7 +39,7 @@ namespace orc {
}
TEST(ConvertColumnReader, betweenNumericWithoutOverflows) {
- constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024;
+ constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024;
constexpr int TEST_CASES = 1024;
MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
std::unique_ptr<Type> fileType(
@@ -97,7 +99,7 @@ namespace orc {
}
TEST(ConvertColumnReader, betweenNumricOverflows) {
- constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024;
+ constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024;
MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
std::unique_ptr<Type> fileType(
Type::buildTypeFromString("struct<t1:double,t2:bigint,t3:bigint>"));
@@ -146,4 +148,378 @@ namespace orc {
readBatch = rowReader->createRowBatch(2);
EXPECT_THROW(rowReader->next(*readBatch), SchemaEvolutionError);
}
+
+  // Test for converting from boolean to string/char/varchar
+  // Create a file with schema struct<t1:boolean,t2:boolean,t3:boolean,t4:boolean,t5:boolean> and
+  // write 1024 rows where t1..t5 are false on row indexes divisible by 2, 3, 5, 7 and 11.
+  // Read it with schema struct<t1:string,t2:char(6),t3:char(7),t4:varchar(5),t5:varchar(7)> and
+  // verify that the values are "TRUE" and "FALSE", space-padded to full width for char columns.
+  TEST(ConvertColumnReader, booleanToString) {
+    constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024;
+    MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
+    std::unique_ptr<Type> fileType(Type::buildTypeFromString(
+        "struct<t1:boolean,t2:boolean,t3:boolean,t4:boolean,t5:boolean>"));
+    std::shared_ptr<Type> readType(Type::buildTypeFromString(
+        "struct<t1:string,t2:char(6),t3:char(7),t4:varchar(5),t5:varchar(7)>"));
+    WriterOptions options;
+    options.setUseTightNumericVector(true);
+    auto writer = createWriter(*fileType, &memStream, options);
+    auto batch = writer->createRowBatch(1024);
+    auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch);
+    auto& c0 = dynamic_cast<BooleanVectorBatch&>(*structBatch.fields[0]);
+    auto& c1 = dynamic_cast<BooleanVectorBatch&>(*structBatch.fields[1]);
+    auto& c2 = dynamic_cast<BooleanVectorBatch&>(*structBatch.fields[2]);
+    auto& c3 = dynamic_cast<BooleanVectorBatch&>(*structBatch.fields[3]);
+    auto& c4 = dynamic_cast<BooleanVectorBatch&>(*structBatch.fields[4]);
+
+    // Each column is false on a different period so the columns cannot be confused.
+    for (size_t i = 0; i < 1024; i++) {
+      c0.data[i] = static_cast<char>(i % 2 == 0 ? 0 : 1);
+      c1.data[i] = static_cast<char>(i % 3 == 0 ? 0 : 1);
+      c2.data[i] = static_cast<char>(i % 5 == 0 ? 0 : 1);
+      c3.data[i] = static_cast<char>(i % 7 == 0 ? 0 : 1);
+      c4.data[i] = static_cast<char>(i % 11 == 0 ? 0 : 1);
+    }
+
+    structBatch.numElements = c0.numElements = c1.numElements = c2.numElements = c3.numElements =
+        c4.numElements = 1024;
+    writer->add(*batch);
+    writer->close();
+
+    auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
+    auto pool = getDefaultPool();
+    auto reader = createReader(*pool, std::move(inStream));
+    RowReaderOptions rowReaderOpts;
+    rowReaderOpts.setReadType(readType);
+    rowReaderOpts.setUseTightNumericVector(true);
+    auto rowReader = reader->createRowReader(rowReaderOpts);
+    auto readBatch = rowReader->createRowBatch(1024);
+    EXPECT_EQ(true, rowReader->next(*readBatch));
+    auto& readStructBatch = dynamic_cast<StructVectorBatch&>(*readBatch);
+    auto& readC0 = dynamic_cast<StringVectorBatch&>(*readStructBatch.fields[0]);
+    auto& readC1 = dynamic_cast<StringVectorBatch&>(*readStructBatch.fields[1]);
+    auto& readC2 = dynamic_cast<StringVectorBatch&>(*readStructBatch.fields[2]);
+    auto& readC3 = dynamic_cast<StringVectorBatch&>(*readStructBatch.fields[3]);
+    auto& readC4 = dynamic_cast<StringVectorBatch&>(*readStructBatch.fields[4]);
+
+    // char(n) values are expected space-padded to exactly n characters (6 for t2, 7 for t3);
+    // string and varchar values are expected unpadded.
+    for (size_t i = 0; i < 1024; i++) {
+      EXPECT_EQ(std::string(readC0.data[i], static_cast<size_t>(readC0.length[i])),
+                i % 2 == 0 ? "FALSE" : "TRUE");
+      EXPECT_EQ(std::string(readC1.data[i], static_cast<size_t>(readC1.length[i])),
+                i % 3 == 0 ? "FALSE " : "TRUE  ");
+      EXPECT_EQ(std::string(readC2.data[i], static_cast<size_t>(readC2.length[i])),
+                i % 5 == 0 ? "FALSE  " : "TRUE   ");
+      EXPECT_EQ(std::string(readC3.data[i], static_cast<size_t>(readC3.length[i])),
+                i % 7 == 0 ? "FALSE" : "TRUE");
+      EXPECT_EQ(std::string(readC4.data[i], static_cast<size_t>(readC4.length[i])),
+                i % 11 == 0 ? "FALSE" : "TRUE");
+    }
+  }
+
+  // Test conversion from numeric to string/char/varchar.
+  // The same six numeric columns (tinyint, smallint, int, bigint, float, double) are written
+  // three times: group 0 is read back as string, group 1 as char(1)..char(6) and group 2 as
+  // varchar(1)..varchar(6). A value whose text is longer than the target length is expected to
+  // read as null; shorter char values are expected space-padded to the declared length.
+  TEST(ConvertColumnReader, TestConvertNumericToStringVariant) {
+    // Create a memory buffer to hold the ORC file data
+    constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024;
+    constexpr int64_t TEST_CASES = 1024;
+    constexpr size_t GROUPS = 3;
+    constexpr size_t COLUMNS_PER_GROUP = 6;
+    constexpr size_t COLUMNS = GROUPS * COLUMNS_PER_GROUP;
+    MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
+    std::unique_ptr<Type> fileType(Type::buildTypeFromString(
+        "struct<t1:tinyint,t2:smallint,t3:int,t4:bigint,t5:float,t6:double,"
+        "t7:tinyint,t8:smallint,t9:int,t10:bigint,t11:float,t12:double,"
+        "t13:tinyint,t14:smallint,t15:int,t16:bigint,t17:float,t18:double"
+        ">"));
+    std::shared_ptr<Type> readType(Type::buildTypeFromString(
+        "struct<t1:string,t2:string,t3:string,t4:string,t5:string,t6:string,"
+        "t7:char(1),t8:char(2),t9:char(3),t10:char(4),t11:char(5),t12:char(6),"
+        "t13:varchar(1),t14:varchar(2),t15:varchar(3),t16:varchar(4),t17:varchar(5),t18:varchar(6)"
+        ">"));
+    WriterOptions options;
+    options.setUseTightNumericVector(true);
+    auto writer = createWriter(*fileType, &memStream, options);
+    auto batch = writer->createRowBatch(TEST_CASES);
+    auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch);
+
+    for (size_t i = 0; i < GROUPS; i++) {
+      auto& col0 = dynamic_cast<ByteVectorBatch&>(*structBatch.fields[i * 6]);
+      auto& col1 = dynamic_cast<ShortVectorBatch&>(*structBatch.fields[i * 6 + 1]);
+      auto& col2 = dynamic_cast<IntVectorBatch&>(*structBatch.fields[i * 6 + 2]);
+      auto& col3 = dynamic_cast<LongVectorBatch&>(*structBatch.fields[i * 6 + 3]);
+      auto& col4 = dynamic_cast<FloatVectorBatch&>(*structBatch.fields[i * 6 + 4]);
+      auto& col5 = dynamic_cast<DoubleVectorBatch&>(*structBatch.fields[i * 6 + 5]);
+      for (int j = 0; j < TEST_CASES; j++) {
+        int flag = j % 2 == 0 ? -1 : 1;
+        uint64_t idx = static_cast<uint64_t>(j);
+        col0.data[idx] = static_cast<char>(flag * (j % 128));
+        col1.data[idx] = static_cast<short>(flag * (j % 32768));
+        col2.data[idx] = flag * j;
+        col3.data[idx] = flag * j;
+        col4.data[idx] = static_cast<float>(flag * j) * 1.234f;
+        col5.data[idx] = static_cast<double>(flag * j) * 1.234;
+      }
+      col0.numElements = col1.numElements = col2.numElements = col3.numElements =
+          col4.numElements = col5.numElements = TEST_CASES;
+    }
+
+    structBatch.numElements = TEST_CASES;
+    writer->add(*batch);
+    writer->close();
+
+    auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
+    auto pool = getDefaultPool();
+    auto reader = createReader(*pool, std::move(inStream));
+    RowReaderOptions rowReaderOpts;
+    rowReaderOpts.setReadType(readType);
+    rowReaderOpts.setUseTightNumericVector(true);
+    auto rowReader = reader->createRowReader(rowReaderOpts);
+    auto readBatch = rowReader->createRowBatch(TEST_CASES);
+    EXPECT_EQ(true, rowReader->next(*readBatch));
+    auto& readStructBatch = dynamic_cast<StructVectorBatch&>(*readBatch);
+
+    // Every read column is a string batch, so they can be checked uniformly by index.
+    std::vector<StringVectorBatch*> readColumns;
+    readColumns.reserve(COLUMNS);
+    for (size_t col = 0; col < COLUMNS; col++) {
+      readColumns.push_back(&dynamic_cast<StringVectorBatch&>(*readStructBatch.fields[col]));
+    }
+
+    for (int j = 0; j < TEST_CASES; j++) {
+      const int flag = j % 2 == 0 ? -1 : 1;
+      const size_t row = static_cast<size_t>(j);
+      // Text form of the six values written to each group for this row.
+      const std::string written[COLUMNS_PER_GROUP] = {
+          std::to_string(static_cast<char>(flag * (j % 128))),
+          std::to_string(static_cast<short>(flag * (j % 32768))),
+          std::to_string(flag * j),
+          std::to_string(flag * j),
+          std::to_string(static_cast<float>(flag * j) * 1.234f),
+          std::to_string(static_cast<double>(flag * j) * 1.234)};
+
+      for (size_t col = 0; col < COLUMNS; col++) {
+        const size_t group = col / COLUMNS_PER_GROUP;
+        // char(n)/varchar(n) columns are declared with n = 1..6 within their group.
+        const size_t maxLength = col % COLUMNS_PER_GROUP + 1;
+        std::string expected = written[col % COLUMNS_PER_GROUP];
+        // Group 0 is plain string and has no length limit.
+        const bool isNull = group != 0 && expected.size() > maxLength;
+        if (group == 1 && !isNull) {
+          expected.resize(maxLength, ' ');
+        }
+
+        const StringVectorBatch& readColumn = *readColumns[col];
+        if (isNull) {
+          EXPECT_EQ(false, readColumn.notNull[row])
+              << "t" << (col + 1) << " " << row << " " << expected << " "
+              << std::string(readColumn.data[row], static_cast<size_t>(readColumn.length[row]));
+        } else {
+          EXPECT_EQ(true, readColumn.notNull[row]) << "t" << (col + 1) << " " << row;
+          EXPECT_EQ(expected, std::string(readColumn.data[row],
+                                          static_cast<size_t>(readColumn.length[row])))
+              << "t" << (col + 1) << " " << row;
+        }
+      }
+    }
+  }
+
+  // Test conversion from numeric to decimal64/decimal128
+  // The first half of the rows holds values that fit every target type. In the second half the
+  // values written to c1 and c2 are expected to overflow decimal(10,*) and read back as null,
+  // while c3 and c4 (decimal(20,3)) are still expected to be readable.
+  TEST(ConvertColumnReader, TestConvertNumericToDecimal) {
+    constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024;
+    constexpr int TEST_CASES = 1024;
+    MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
+    std::unique_ptr<Type> fileType(
+        Type::buildTypeFromString("struct<c1:bigint,c2:double,c3:bigint,c4:double>"));
+    std::shared_ptr<Type> readType(Type::buildTypeFromString(
+        "struct<c1:decimal(10,2),c2:decimal(10,4),c3:decimal(20,3),c4:decimal(20,3)>"));
+    WriterOptions options;
+    options.setUseTightNumericVector(true);
+    auto writer = createWriter(*fileType, &memStream, options);
+    auto batch = writer->createRowBatch(TEST_CASES);
+    // Reference casts throw std::bad_cast on a type mismatch instead of yielding a null pointer.
+    auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch);
+    auto& c1 = dynamic_cast<LongVectorBatch&>(*structBatch.fields[0]);
+    auto& c2 = dynamic_cast<DoubleVectorBatch&>(*structBatch.fields[1]);
+    auto& c3 = dynamic_cast<LongVectorBatch&>(*structBatch.fields[2]);
+    auto& c4 = dynamic_cast<DoubleVectorBatch&>(*structBatch.fields[3]);
+
+    for (int32_t i = 0; i < TEST_CASES / 2; i++) {
+      size_t idx = static_cast<size_t>(i);
+      c1.data[idx] = i * 12;
+      c3.data[idx] = i * 16;
+      c2.data[idx] = (i % 2 ? -1 : 1) * (i * 1000 + 0.111111 * (i % 9));
+      c4.data[idx] = (i % 2 ? -1 : 1) * (i * 1000 + 0.111111 * (i % 9));
+    }
+    for (int32_t i = TEST_CASES / 2; i < TEST_CASES; i++) {
+      size_t idx = static_cast<size_t>(i);
+      c1.data[idx] = 12345678910LL * i;
+      c2.data[idx] = 12345678910.1234 * i;
+      c3.data[idx] = 12345678910LL * i;
+      c4.data[idx] = (123456.0 * i + (0.1111 * (i % 9))) * (i % 2 ? -1 : 1);
+    }
+    structBatch.numElements = c1.numElements = c2.numElements = c3.numElements = c4.numElements =
+        TEST_CASES;
+    writer->add(*batch);
+    writer->close();
+
+    auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
+    auto pool = getDefaultPool();
+    auto reader = createReader(*pool, std::move(inStream));
+    RowReaderOptions rowReaderOptions;
+    rowReaderOptions.setUseTightNumericVector(true);
+    rowReaderOptions.setReadType(readType);
+    auto rowReader = reader->createRowReader(rowReaderOptions);
+    auto readBatch = rowReader->createRowBatch(TEST_CASES);
+    EXPECT_EQ(true, rowReader->next(*readBatch));
+
+    auto& readStructBatch = dynamic_cast<StructVectorBatch&>(*readBatch);
+    auto& readC1 = dynamic_cast<Decimal64VectorBatch&>(*readStructBatch.fields[0]);
+    auto& readC2 = dynamic_cast<Decimal64VectorBatch&>(*readStructBatch.fields[1]);
+    auto& readC3 = dynamic_cast<Decimal128VectorBatch&>(*readStructBatch.fields[2]);
+    auto& readC4 = dynamic_cast<Decimal128VectorBatch&>(*readStructBatch.fields[3]);
+    EXPECT_EQ(TEST_CASES, readBatch->numElements);
+    for (int i = 0; i < TEST_CASES / 2; i++) {
+      size_t idx = static_cast<size_t>(i);
+      EXPECT_TRUE(readC1.notNull[idx]) << i;
+      EXPECT_TRUE(readC2.notNull[idx]) << i;
+      EXPECT_TRUE(readC3.notNull[idx]) << i;
+      EXPECT_TRUE(readC4.notNull[idx]) << i;
+
+      EXPECT_EQ(i * 1200, readC1.values[idx]) << i;
+      EXPECT_EQ(i * 16000, readC3.values[idx].toLong()) << i;
+      // The (i % 9 > 4) term adds one unit where the truncated fraction rounds up.
+      EXPECT_EQ((1LL * i * 10000000 + (1111 * (i % 9) + (i % 9 > 4))) * (i % 2 ? -1 : 1),
+                readC2.values[idx])
+          << i;
+      EXPECT_EQ((1LL * i * 1000000 + (111 * (i % 9) + (i % 9 > 4))) * (i % 2 ? -1 : 1),
+                readC4.values[idx].toLong())
+          << i;
+    }
+    for (int i = TEST_CASES / 2; i < TEST_CASES; i++) {
+      size_t idx = static_cast<size_t>(i);
+      EXPECT_FALSE(readC1.notNull[idx]) << i;
+      EXPECT_FALSE(readC2.notNull[idx]) << i;
+      EXPECT_TRUE(readC3.notNull[idx]) << i;
+      EXPECT_TRUE(readC4.notNull[idx]) << i;
+
+      EXPECT_EQ(12345678910000LL * i, readC3.values[idx].toLong()) << i;
+      EXPECT_EQ((123456000LL * i + (111 * (i % 9) + (i % 9 > 4))) * (i % 2 ? -1 : 1),
+                readC4.values[idx].toLong())
+          << i;
+    }
+  }
+
+  // Test conversion from numeric to timestamp/timestamp with local timezone
+  // The reader timezone is Asia/Shanghai. Plain timestamp columns (c1, c2) are expected to be
+  // shifted by 8 hours, while "timestamp with local time zone" columns (c3, c4) are expected to
+  // keep the written number of seconds.
+  TEST(ConvertColumnReader, TestConvertNumericToTimestamp) {
+    constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024;
+    constexpr int TEST_CASES = 1024;
+    MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
+    std::unique_ptr<Type> fileType(
+        Type::buildTypeFromString("struct<c1:bigint,c2:double,c3:bigint,c4:double>"));
+    std::shared_ptr<Type> readType(
+        Type::buildTypeFromString("struct<c1:timestamp,c2:timestamp,c3:timestamp with local time "
+                                  "zone,c4:timestamp with local time zone>"));
+    WriterOptions options;
+    options.setUseTightNumericVector(true);
+    auto writer = createWriter(*fileType, &memStream, options);
+    auto batch = writer->createRowBatch(TEST_CASES);
+    // Reference casts throw std::bad_cast on a type mismatch instead of yielding a null pointer.
+    auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch);
+    auto& c1 = dynamic_cast<LongVectorBatch&>(*structBatch.fields[0]);
+    auto& c2 = dynamic_cast<DoubleVectorBatch&>(*structBatch.fields[1]);
+    auto& c3 = dynamic_cast<LongVectorBatch&>(*structBatch.fields[2]);
+    auto& c4 = dynamic_cast<DoubleVectorBatch&>(*structBatch.fields[3]);
+
+    // Values straddle zero so both negative and positive inputs are converted.
+    for (int i = 0; i < TEST_CASES; i++) {
+      size_t idx = static_cast<size_t>(i);
+      c1.data[idx] = (i - TEST_CASES / 2) * 3600 + i;
+      c2.data[idx] = (i - TEST_CASES / 2) * 3600 + i;
+      c3.data[idx] = (i - TEST_CASES / 2) * 3600 + i * i;
+      c4.data[idx] = (i - TEST_CASES / 2) * 3600 + i * i;
+      if (i % 2) {
+        c2.data[idx] += 0.55555;
+        c4.data[idx] += 0.777;
+      } else {
+        c2.data[idx] += 0.11111;
+        c4.data[idx] += 0.333;
+      }
+    }
+
+    structBatch.numElements = c1.numElements = c2.numElements = c3.numElements = c4.numElements =
+        TEST_CASES;
+    writer->add(*batch);
+    writer->close();
+
+    auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
+    auto pool = getDefaultPool();
+    auto reader = createReader(*pool, std::move(inStream));
+    RowReaderOptions rowReaderOptions;
+    rowReaderOptions.setTimezoneName("Asia/Shanghai");
+    rowReaderOptions.setUseTightNumericVector(true);
+    rowReaderOptions.setReadType(readType);
+    auto rowReader = reader->createRowReader(rowReaderOptions);
+    auto readBatch = rowReader->createRowBatch(TEST_CASES);
+    EXPECT_EQ(true, rowReader->next(*readBatch));
+
+    auto& readStructBatch = dynamic_cast<StructVectorBatch&>(*readBatch);
+    auto& readC1 = dynamic_cast<TimestampVectorBatch&>(*readStructBatch.fields[0]);
+    auto& readC2 = dynamic_cast<TimestampVectorBatch&>(*readStructBatch.fields[1]);
+    auto& readC3 = dynamic_cast<TimestampVectorBatch&>(*readStructBatch.fields[2]);
+    auto& readC4 = dynamic_cast<TimestampVectorBatch&>(*readStructBatch.fields[3]);
+    EXPECT_EQ(TEST_CASES, readBatch->numElements);
+    for (int i = 0; i < TEST_CASES; i++) {
+      size_t idx = static_cast<size_t>(i);
+      EXPECT_TRUE(readC1.notNull[idx]) << i;
+      EXPECT_TRUE(readC2.notNull[idx]) << i;
+      EXPECT_TRUE(readC3.notNull[idx]) << i;
+      EXPECT_TRUE(readC4.notNull[idx]) << i;
+
+      EXPECT_EQ((i - TEST_CASES / 2) * 3600 + i - 8 * 3600, readC1.data[idx]) << i;
+      EXPECT_EQ((i - TEST_CASES / 2) * 3600 + i - 8 * 3600, readC2.data[idx]) << i;
+      EXPECT_EQ((i - TEST_CASES / 2) * 3600 + i * i, readC3.data[idx]) << i;
+      EXPECT_EQ((i - TEST_CASES / 2) * 3600 + i * i, readC4.data[idx]) << i;
+      EXPECT_EQ(0, readC1.nanoseconds[idx]) << i;
+      EXPECT_EQ(0, readC3.nanoseconds[idx]) << i;
+      // The fractional part went through a double, so allow one nanosecond of error.
+      if (i % 2) {
+        EXPECT_TRUE(std::abs(555550000 - readC2.nanoseconds[idx]) <= 1) << i;
+        EXPECT_TRUE(std::abs(777000000 - readC4.nanoseconds[idx]) <= 1) << i;
+      } else {
+        EXPECT_TRUE(std::abs(111110000 - readC2.nanoseconds[idx]) <= 1) << i;
+        EXPECT_TRUE(std::abs(333000000 - readC4.nanoseconds[idx]) <= 1) << i;
+      }
+    }
+  }
+
} // namespace orc
diff --git a/c++/test/TestInt128.cc b/c++/test/TestInt128.cc
index ab307662f..ca67522d8 100644
--- a/c++/test/TestInt128.cc
+++ b/c++/test/TestInt128.cc
@@ -643,4 +643,562 @@ namespace orc {
EXPECT_TRUE(Int128(-12340000).toDecimalString(8, true) == "-0.1234");
}
+  // Test convert decimal to different precision/scale
+  TEST(Int128, testConvertDecimalToDifferentPrecisionScale) {
+    // Every case converts the same unscaled value; at scale 5 it reads as 12345.67890.
+    const Int128 num = Int128(1234567890);
+
+    struct Conversion {
+      int fromScale;
+      int toPrecision;
+      int toScale;
+      bool overflow;
+      int64_t expected;  // unscaled result; not checked when overflow is expected
+    };
+    const Conversion conversions[] = {
+        {5, 9, 5, true, 0},             {5, 9, 4, false, 123456789},
+        {5, 9, 3, false, 12345679},     {5, 10, 0, false, 12346},
+        {5, 10, 2, false, 1234568},     {5, 10, 5, false, 1234567890},
+        {5, 10, 6, true, 0},            {5, 11, 0, false, 12346},
+        {5, 11, 3, false, 12345679},    {5, 11, 6, false, 12345678900},
+        {5, 11, 7, true, 0},            {5, 12, 5, false, 1234567890},
+        {5, 12, 6, false, 12345678900}, {5, 12, 8, true, 0},
+    };
+    for (const auto& c : conversions) {
+      auto pair = convertDecimal(num, c.fromScale, c.toPrecision, c.toScale);
+      EXPECT_EQ(pair.first, c.overflow) << pair.second.toString();
+      if (!c.overflow) {
+        EXPECT_EQ(pair.second, Int128(c.expected)) << pair.second.toString();
+      }
+    }
+
+    // Argument combinations that are rejected with std::invalid_argument.
+    struct InvalidArguments {
+      int fromScale;
+      int toPrecision;
+      int toScale;
+    };
+    const InvalidArguments rejected[] = {
+        {5, 39, 5}, {5, 0, 5}, {39, 9, -1}, {0, 9, 10}, {-1, 9, 0}, {40, 9, 0}, {-40, 9, 0},
+    };
+    for (const auto& r : rejected) {
+      EXPECT_THROW(convertDecimal(num, r.fromScale, r.toPrecision, r.toScale),
+                   std::invalid_argument);
+    }
+  }
+
+  // Test double to decimal conversion for a table of (value, precision, scale) inputs.
+  // Each entry states whether the conversion is expected to report overflow and, if not, the
+  // decimal text of the unscaled result.
+  TEST(Int128, testConvertDecimaFromDouble) {
+    struct Conversion {
+      double value;
+      int toPrecision;
+      int toScale;
+      bool overflow;
+      std::string expected;  // not checked when overflow is expected
+    };
+    // First `count` characters of std::to_string(value): the sign, if any, and integer digits.
+    const auto leading = [](double value, size_t count) {
+      return std::to_string(value).substr(0, count);
+    };
+    const double inf = std::numeric_limits<double>::infinity();
+
+    const Conversion conversions[] = {
+        {12345.13579, 4, 2, true, ""},
+        {12345.13579, 5, 0, false, "12345"},
+        {12345.13579, 5, 1, true, ""},
+        {12345.13579, 6, 0, false, "12345"},
+        {12345.13579, 6, 1, false, "123451"},
+        {12345.13579, 6, 2, true, ""},
+        {12345.13579, 8, 3, false, "12345136"},
+
+        {-12345.13579, 4, 2, true, ""},
+        {-12345.13579, 5, 0, false, "-12345"},
+        {-12345.13579, 5, 1, true, ""},
+        {-12345.13579, 6, 0, false, "-12345"},
+        {-12345.13579, 6, 1, false, "-123451"},
+        {-12345.13579, 6, 2, true, ""},
+        {-12345.13579, 8, 3, false, "-12345136"},
+
+        // magnitudes around 10^37, 2^126 and 2^127
+        {pow(10, 37), 38, 0, false, leading(pow(10, 37), 37)},
+        {-pow(10, 37), 38, 0, false, leading(-pow(10, 37), 38)},
+        {-std::ldexp(1.0, 126), 38, 0, false, leading(-std::ldexp(1.0, 126), 39)},
+        {-std::ldexp(1.0, 127), 38, 0, true, ""},
+        {-std::ldexp(1.0, 127), 39, 0, true, ""},
+        {std::ldexp(1.0, 127), 38, 0, true, ""},
+        // mirrors the negative precision-39 case (this entry used to repeat precision 38)
+        {std::ldexp(1.0, 127), 39, 0, true, ""},
+        {std::ldexp(1.0, 126), 38, 0, false, leading(std::ldexp(1.0, 126), 38)},
+
+        {9988776655443322880.0, 38, 6, false, "9988776655443322880000000"},
+        {9988776655443322880.0, 38, 10, false, "99887766554433228800000000000"},
+        {-9988776655443322880.0, 38, 6, false, "-9988776655443322880000000"},
+
+        // 1e19
+        {1e19 + 0.5, 38, 0, false, leading(1e19 + 0.5, 20)},
+        {1e19 + 0.5, 38, 3, false, "10000000000000000000000"},
+
+        // smaller than 1<<127 but overflows
+        {1.5e38, 38, 0, true, ""},
+
+        // -2^55
+        {-std::ldexp(1.0, 55) + 0.5, 38, 3, false, "-36028797018963968000"},
+        // -2^50 - 0.5
+        {-std::ldexp(1.0, 50) - 0.5, 38, 3, false, "-1125899906842624500"},
+
+        // non-finite inputs
+        {std::nan("1"), 38, 0, true, ""},
+        {inf, 38, 0, true, ""},
+        {-inf, 38, 0, true, ""},
+
+        // signed zero
+        {+0.0, 38, 5, false, "0"},
+        {-0.0, 38, 5, false, "0"},
+
+        {998244353.998244, 15, 6, false, "998244353998244"},
+        {998244353.998244, 15, 5, false, "99824435399824"},
+        {998244353.998244, 15, 2, false, "99824435400"},
+        {998244353.998244, 15, 1, false, "9982443540"},
+        {998244353.998244, 15, 0, false, "998244354"},
+        {998244353.998244, 14, 6, true, ""},
+
+        {-998244353.998244, 15, 6, false, "-998244353998244"},
+        {-998244353.998244, 15, 2, false, "-99824435400"},
+        {-998244353.998244, 15, 1, false, "-9982443540"},
+        {-998244353.998244, 15, 0, false, "-998244354"},
+    };
+
+    for (const auto& c : conversions) {
+      auto pair = convertDecimal(c.value, c.toPrecision, c.toScale);
+      EXPECT_EQ(pair.first, c.overflow)
+          << c.value << " -> decimal(" << c.toPrecision << "," << c.toScale << ")";
+      if (!c.overflow) {
+        EXPECT_EQ(pair.second.toString(), c.expected)
+            << c.value << " -> decimal(" << c.toPrecision << "," << c.toScale << ")";
+      }
+    }
+  }
+
+  // Float-to-decimal conversion cases, mirroring the double-to-decimal test above.
+  TEST(Int128, testConvertDecimalFromFloat) {
+    std::pair<bool, Int128> pair;  // first: overflow flag, second: converted value
+    float fromFloat;
+    int32_t toPrecision;
+    int32_t toScale;
+
+    fromFloat = +0.0;
+    toPrecision = 38;
+    toScale = 0;
+    pair = convertDecimal(fromFloat, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second.toString(), "0") << pair.second.toString();
+
+    fromFloat = -0.0;  // negative zero is expected to print as plain "0"
+    toPrecision = 38;
+    toScale = 0;
+    pair = convertDecimal(fromFloat, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second.toString(), "0") << pair.second.toString();
+
+    fromFloat = std::numeric_limits<float>::infinity();
+    toPrecision = 38;
+    toScale = 0;
+    pair = convertDecimal(fromFloat, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+
+    fromFloat = -std::numeric_limits<float>::infinity();
+    toPrecision = 38;
+    toScale = 0;
+    pair = convertDecimal(fromFloat, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+
+    fromFloat = std::nanf("1");
+    toPrecision = 38;
+    toScale = 0;
+    pair = convertDecimal(fromFloat, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // NaN is expected to be flagged as overflow
+
+    // 2^126
+    fromFloat = std::ldexp(1.0f, 126);
+    toPrecision = 38;
+    toScale = 0;
+    pair = convertDecimal(fromFloat, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second.toString(), "85070591730234615865843651857942052864")
+        << pair.second.toString();
+
+    // 2^127
+    fromFloat = std::ldexp(1.0f, 127);
+    toPrecision = 38;
+    toScale = 0;
+    pair = convertDecimal(fromFloat, toPrecision, toScale);
+    EXPECT_EQ(pair.first, true);  // overflow
+
+    // 2^70 + 2^69
+    fromFloat = std::ldexp(1.0f, 70) + std::ldexp(1.0f, 69);
+    toPrecision = 38;
+    toScale = 3;
+    pair = convertDecimal(fromFloat, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second.toString(), "1770887431076116955136000") << pair.second.toString();
+
+    fromFloat = std::ldexp(1.0f, 70) + std::ldexp(1.0f, 60);  // 2^70 + 2^60
+    toPrecision = 38;
+    toScale = 3;
+    pair = convertDecimal(fromFloat, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second.toString(), "1181744542222018150400000") << pair.second.toString();
+
+    fromFloat = -(std::ldexp(1.0f, 70) + std::ldexp(1.0f, 50));  // -(2^70 + 2^50)
+    toPrecision = 38;
+    toScale = 3;
+    pair = convertDecimal(fromFloat, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second.toString(), "-1180592746617318146048000") << pair.second.toString();
+
+    fromFloat = std::ldexp(1.0f, 70) - std::ldexp(1.0f, 60);  // 2^70 - 2^60
+    toPrecision = 38;
+    toScale = 3;
+    pair = convertDecimal(fromFloat, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(pair.second.toString(), "1179438699212804456448000") << pair.second.toString();
+
+    fromFloat = 9.998244f;
+    toPrecision = 15;
+    toScale = 6;
+    pair = convertDecimal(fromFloat, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(convertDecimal(fromFloat, toPrecision, toScale).second.toString(), "9998244")
+        << pair.second.toString();
+
+    toScale = 2;  // expected unscaled value 1000, i.e. 10.00 at scale 2
+    pair = convertDecimal(fromFloat, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(convertDecimal(fromFloat, toPrecision, toScale).second.toString(), "1000")
+        << pair.second.toString();
+
+    toScale = 0;  // expected unscaled value 10
+    pair = convertDecimal(fromFloat, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(convertDecimal(fromFloat, toPrecision, toScale).second.toString(), "10")
+        << pair.second.toString();
+
+    fromFloat = -9.998244f;
+    toPrecision = 15;
+    toScale = 6;
+    pair = convertDecimal(fromFloat, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(convertDecimal(fromFloat, toPrecision, toScale).second.toString(), "-9998244")
+        << pair.second.toString();
+
+    toScale = 2;  // expected unscaled value -1000, i.e. -10.00 at scale 2
+    pair = convertDecimal(fromFloat, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(convertDecimal(fromFloat, toPrecision, toScale).second.toString(), "-1000")
+        << pair.second.toString();
+
+    toScale = 0;  // expected unscaled value -10
+    pair = convertDecimal(fromFloat, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow
+    EXPECT_EQ(convertDecimal(fromFloat, toPrecision, toScale).second.toString(), "-10")
+        << pair.second.toString();
+
+    toPrecision = 1;
+    toScale = 0;
+    pair = convertDecimal(fromFloat, toPrecision, toScale);
+    EXPECT_EQ(pair.first, false);  // no overflow, though -10 has 2 digits (review: intended?)
+    EXPECT_EQ(convertDecimal(fromFloat, toPrecision, toScale).second.toString(), "-10")
+        << pair.second.toString();
+  }
+
} // namespace orc
diff --git a/c++/test/TestSchemaEvolution.cc b/c++/test/TestSchemaEvolution.cc
index 9f6f776dc..61169b91d 100644
--- a/c++/test/TestSchemaEvolution.cc
+++ b/c++/test/TestSchemaEvolution.cc
@@ -89,6 +89,30 @@ namespace orc {
}
}
+ // conversion from numeric to string/char/varchar
+ for (size_t i = 0; i <= 6; i++) {
+ for (size_t j = 7; j <= 11; j++) {
+ canConvert[i][j] = true;
+ needConvert[i][j] = true;
+ }
+ }
+
+ // conversion from numeric to decimal
+ for (size_t i = 0; i <= 6; i++) {
+ for (size_t j = 12; j <= 13; j++) {
+ canConvert[i][j] = true;
+ needConvert[i][j] = true;
+ }
+ }
+
+ // conversion from numeric to timestamp
+ for (size_t i = 0; i <= 6; i++) {
+ for (size_t j = 14; j <= 15; j++) {
+ canConvert[i][j] = true;
+ needConvert[i][j] = true;
+ }
+ }
+
for (size_t i = 0; i < typesSize; i++) {
for (size_t j = 0; j < typesSize; j++) {
testConvertReader(types[i], types[j], canConvert[i][j],
needConvert[i][j]);
diff --git a/c++/test/TestTimezone.cc b/c++/test/TestTimezone.cc
index 56e27d532..2386753a4 100644
--- a/c++/test/TestTimezone.cc
+++ b/c++/test/TestTimezone.cc
@@ -367,4 +367,38 @@ namespace orc {
EXPECT_EQ("GMT", getVariantFromZone(*gmt, "1974-01-06 09:59:59"));
EXPECT_EQ("GMT", getVariantFromZone(*gmt, "2015-06-06 12:34:56"));
}
+
+  TEST(TestTimezone, testConvertFromUtc) {  // convertFromUTC for LA, NY, GMT and Shanghai
+    const Timezone* la = &getTimezoneByName("America/Los_Angeles");
+    const Timezone* ny = &getTimezoneByName("America/New_York");
+    const Timezone* gmt = &getTimezoneByName("GMT");
+    const Timezone* sh = &getTimezoneByName("Asia/Shanghai");
+    // 2023-05-29 22:20:00 UTC
+    EXPECT_EQ(1685398800 + 7 * 3600, la->convertFromUTC(1685398800));  // it's PDT
+    EXPECT_EQ(1685398800 + 4 * 3600, ny->convertFromUTC(1685398800));  // it's EDT
+    EXPECT_EQ(1685398800, gmt->convertFromUTC(1685398800));
+    EXPECT_EQ(1685398800 - 8 * 3600, sh->convertFromUTC(1685398800));  // no DST in China
+
+    // DST starts in Los Angeles March 12, 2:00 am
+    // 2023-03-12 03:00:00 UTC
+    EXPECT_EQ(1678590000 + 7 * 3600, la->convertFromUTC(1678590000));
+    // 2023-03-12 01:59:59 UTC
+    EXPECT_EQ(1678590000 + 7 * 3600 - 1, la->convertFromUTC(1678586399));  // input + 8h
+
+    // DST starts in New York March 12, 2:00 am
+    // 2023-03-12 03:00:00 UTC
+    EXPECT_EQ(1678590000 + 4 * 3600, ny->convertFromUTC(1678590000));
+    // 2023-03-12 01:59:59 UTC
+    EXPECT_EQ(1678590000 + 4 * 3600 - 1, ny->convertFromUTC(1678586399));  // input + 5h
+
+    // no DST in China
+    EXPECT_EQ(1678590000 - 8 * 3600, sh->convertFromUTC(1678590000));
+    EXPECT_EQ(1678586399 - 8 * 3600, sh->convertFromUTC(1678586399));
+
+    // DST ends in New York November 5, 2:00 am
+    // 2023-11-05 06:00:00 UTC
+    EXPECT_EQ(1699164000 + 5 * 3600, ny->convertFromUTC(1699164000));
+    // DST ends in Los Angeles November 5, 2:00 am
+    EXPECT_EQ(1699164000 + 8 * 3600, la->convertFromUTC(1699164000));
+  }
} // namespace orc