[
https://issues.apache.org/jira/browse/ORC-443?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16707641#comment-16707641
]
ASF GitHub Bot commented on ORC-443:
------------------------------------
wgtmac closed pull request #344: ORC-443: [C++] Code improvements in
ColumnWriter.
URL: https://github.com/apache/orc/pull/344
This is a PR merged from a forked repository.
Because this is a foreign pull request (from a fork), GitHub hides the
original diff on merge, so it is reproduced below for the sake of
provenance:
diff --git a/c++/src/ByteRLE.cc b/c++/src/ByteRLE.cc
index 2664019f67..4bf8b7a598 100644
--- a/c++/src/ByteRLE.cc
+++ b/c++/src/ByteRLE.cc
@@ -40,7 +40,7 @@ namespace orc {
virtual ~ByteRleEncoderImpl() override;
/**
- * Encode the next batch of values
+ * Encode the next batch of values.
* @param data to be encoded
* @param numValues the number of values to be encoded
* @param notNull If the pointer is null, all values are read. If the
@@ -55,8 +55,8 @@ namespace orc {
virtual uint64_t getBufferSize() const override;
/**
- * Flushing underlying BufferedOutputStream
- */
+ * Flush underlying BufferedOutputStream.
+ */
virtual uint64_t flush() override;
virtual void recordPosition(PositionRecorder* recorder) const override;
@@ -122,7 +122,7 @@ namespace orc {
writeByte(
static_cast<char>(numLiterals - static_cast<int>(MINIMUM_REPEAT)));
writeByte(literals[0]);
- } else {
+ } else {
writeByte(static_cast<char>(-numLiterals));
for (int i = 0; i < numLiterals; ++i) {
writeByte(literals[i]);
diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index c6c30f3833..ef31c45cbd 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -274,13 +274,13 @@ namespace orc {
ColumnVectorBatch& rowBatch,
uint64_t offset,
uint64_t numValues) {
- ColumnWriter::add(rowBatch, offset, numValues);
const StructVectorBatch* structBatch =
dynamic_cast<const StructVectorBatch *>(&rowBatch);
if (structBatch == nullptr) {
throw InvalidArgument("Failed to cast to StructVectorBatch");
}
+ ColumnWriter::add(rowBatch, offset, numValues);
for (uint32_t i = 0; i < children.size(); ++i) {
children[i]->add(*structBatch->fields[i], offset, numValues);
}
@@ -289,15 +289,17 @@ namespace orc {
if (!structBatch->hasNulls) {
colIndexStatistics->increase(numValues);
} else {
+ uint64_t count = 0;
bool hasNull = false;
const char* notNull = structBatch->notNull.data() + offset;
for (uint64_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
- colIndexStatistics->increase(1);
+ ++count;
} else if (!hasNull) {
hasNull = true;
}
}
+ colIndexStatistics->increase(count);
if (hasNull) {
colIndexStatistics->setHasNull(true);
}
@@ -445,13 +447,18 @@ namespace orc {
ColumnVectorBatch& rowBatch,
uint64_t offset,
uint64_t numValues) {
- ColumnWriter::add(rowBatch, offset, numValues);
-
const LongVectorBatch* longBatch =
dynamic_cast<const LongVectorBatch*>(&rowBatch);
if (longBatch == nullptr) {
throw InvalidArgument("Failed to cast to LongVectorBatch");
}
+ IntegerColumnStatisticsImpl* intStats =
+ dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get());
+ if (intStats == nullptr) {
+ throw InvalidArgument("Failed to cast to IntegerColumnStatisticsImpl");
+ }
+
+ ColumnWriter::add(rowBatch, offset, numValues);
const int64_t* data = longBatch->data.data() + offset;
const char* notNull = longBatch->hasNulls ?
@@ -460,21 +467,17 @@ namespace orc {
rleEncoder->add(data, numValues, notNull);
// update stats
- IntegerColumnStatisticsImpl* intStats =
- dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get());
- if (intStats == nullptr) {
- throw InvalidArgument("Failed to cast to IntegerColumnStatisticsImpl");
- }
-
+ uint64_t count = 0;
bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
if (notNull == nullptr || notNull[i]) {
- intStats->increase(1);
+ ++count;
intStats->update(data[i], 1);
} else if (!hasNull) {
hasNull = true;
}
}
+ intStats->increase(count);
if (hasNull) {
intStats->setHasNull(true);
}
@@ -549,12 +552,17 @@ namespace orc {
void ByteColumnWriter::add(ColumnVectorBatch& rowBatch,
uint64_t offset,
uint64_t numValues) {
- ColumnWriter::add(rowBatch, offset, numValues);
-
LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch);
if (byteBatch == nullptr) {
throw InvalidArgument("Failed to cast to LongVectorBatch");
}
+ IntegerColumnStatisticsImpl* intStats =
+ dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get());
+ if (intStats == nullptr) {
+ throw InvalidArgument("Failed to cast to IntegerColumnStatisticsImpl");
+ }
+
+ ColumnWriter::add(rowBatch, offset, numValues);
int64_t* data = byteBatch->data.data() + offset;
const char* notNull = byteBatch->hasNulls ?
@@ -566,20 +574,17 @@ namespace orc {
}
byteRleEncoder->add(byteData, numValues, notNull);
- IntegerColumnStatisticsImpl* intStats =
- dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get());
- if (intStats == nullptr) {
- throw InvalidArgument("Failed to cast to IntegerColumnStatisticsImpl");
- }
+ uint64_t count = 0;
bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
if (notNull == nullptr || notNull[i]) {
- intStats->increase(1);
+ ++count;
intStats->update(static_cast<int64_t>(byteData[i]), 1);
} else if (!hasNull) {
hasNull = true;
}
}
+ intStats->increase(count);
if (hasNull) {
intStats->setHasNull(true);
}
@@ -654,12 +659,18 @@ namespace orc {
void BooleanColumnWriter::add(ColumnVectorBatch& rowBatch,
uint64_t offset,
uint64_t numValues) {
- ColumnWriter::add(rowBatch, offset, numValues);
-
LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch);
if (byteBatch == nullptr) {
throw InvalidArgument("Failed to cast to LongVectorBatch");
}
+ BooleanColumnStatisticsImpl* boolStats =
+ dynamic_cast<BooleanColumnStatisticsImpl*>(colIndexStatistics.get());
+ if (boolStats == nullptr) {
+ throw InvalidArgument("Failed to cast to BooleanColumnStatisticsImpl");
+ }
+
+ ColumnWriter::add(rowBatch, offset, numValues);
+
int64_t* data = byteBatch->data.data() + offset;
const char* notNull = byteBatch->hasNulls ?
byteBatch->notNull.data() + offset : nullptr;
@@ -670,20 +681,17 @@ namespace orc {
}
rleEncoder->add(byteData, numValues, notNull);
- BooleanColumnStatisticsImpl* boolStats =
- dynamic_cast<BooleanColumnStatisticsImpl*>(colIndexStatistics.get());
- if (boolStats == nullptr) {
- throw InvalidArgument("Failed to cast to BooleanColumnStatisticsImpl");
- }
+ uint64_t count = 0;
bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
if (notNull == nullptr || notNull[i]) {
- boolStats->increase(1);
+ ++count;
boolStats->update(byteData[i] != 0, 1);
} else if (!hasNull) {
hasNull = true;
}
}
+ boolStats->increase(count);
if (hasNull) {
boolStats->setHasNull(true);
}
@@ -774,25 +782,26 @@ namespace orc {
void DoubleColumnWriter::add(ColumnVectorBatch& rowBatch,
uint64_t offset,
uint64_t numValues) {
- ColumnWriter::add(rowBatch, offset, numValues);
const DoubleVectorBatch* dblBatch =
dynamic_cast<const DoubleVectorBatch*>(&rowBatch);
if (dblBatch == nullptr) {
throw InvalidArgument("Failed to cast to DoubleVectorBatch");
}
-
- const double* doubleData = dblBatch->data.data() + offset;
- const char* notNull = dblBatch->hasNulls ?
- dblBatch->notNull.data() + offset : nullptr;
-
DoubleColumnStatisticsImpl* doubleStats =
dynamic_cast<DoubleColumnStatisticsImpl*>(colIndexStatistics.get());
if (doubleStats == nullptr) {
throw InvalidArgument("Failed to cast to DoubleColumnStatisticsImpl");
}
+ ColumnWriter::add(rowBatch, offset, numValues);
+
+ const double* doubleData = dblBatch->data.data() + offset;
+ const char* notNull = dblBatch->hasNulls ?
+ dblBatch->notNull.data() + offset : nullptr;
+
size_t bytes = isFloat ? 4 : 8;
char* data = buffer.data();
+ uint64_t count = 0;
bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
@@ -802,13 +811,13 @@ namespace orc {
encodeFloatNum<double, int64_t>(doubleData[i], data);
}
dataStream->write(data, bytes);
-
- doubleStats->increase(1);
+ ++count;
doubleStats->update(doubleData[i]);
} else if (!hasNull) {
hasNull = true;
}
}
+ doubleStats->increase(count);
if (hasNull) {
doubleStats->setHasNull(true);
}
@@ -1071,13 +1080,20 @@ namespace orc {
void StringColumnWriter::add(ColumnVectorBatch& rowBatch,
uint64_t offset,
uint64_t numValues) {
- ColumnWriter::add(rowBatch, offset, numValues);
const StringVectorBatch* stringBatch =
dynamic_cast<const StringVectorBatch*>(&rowBatch);
if (stringBatch == nullptr) {
throw InvalidArgument("Failed to cast to StringVectorBatch");
}
+ StringColumnStatisticsImpl* strStats =
+ dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get());
+ if (strStats == nullptr) {
+ throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl");
+ }
+
+ ColumnWriter::add(rowBatch, offset, numValues);
+
char *const * data = stringBatch->data.data() + offset;
const int64_t* length = stringBatch->length.data() + offset;
const char* notNull = stringBatch->hasNulls ?
@@ -1087,26 +1103,24 @@ namespace orc {
directLengthEncoder->add(length, numValues, notNull);
}
- StringColumnStatisticsImpl* strStats =
- dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get());
- if (strStats == nullptr) {
- throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl");
- }
+ uint64_t count = 0;
bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
+ const size_t len = static_cast<size_t>(length[i]);
if (useDictionary) {
- size_t index = dictionary.insert(data[i], static_cast<size_t>(length[i]));
+ size_t index = dictionary.insert(data[i], len);
dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index));
} else {
- directDataStream->write(data[i], static_cast<size_t>(length[i]));
+ directDataStream->write(data[i], len);
}
- strStats->update(data[i], static_cast<size_t>(length[i]));
- strStats->increase(1);
+ strStats->update(data[i], len);
+ ++count;
} else if (!hasNull) {
hasNull = true;
}
}
+ strStats->increase(count);
if (hasNull) {
strStats->setHasNull(true);
}
@@ -1432,23 +1446,25 @@ namespace orc {
void CharColumnWriter::add(ColumnVectorBatch& rowBatch,
uint64_t offset,
uint64_t numValues) {
- ColumnWriter::add(rowBatch, offset, numValues);
StringVectorBatch* charsBatch =
dynamic_cast<StringVectorBatch*>(&rowBatch);
if (charsBatch == nullptr) {
throw InvalidArgument("Failed to cast to StringVectorBatch");
}
- char** data = charsBatch->data.data() + offset;
- int64_t* length = charsBatch->length.data() + offset;
- const char* notNull = charsBatch->hasNulls ?
- charsBatch->notNull.data() + offset : nullptr;
-
StringColumnStatisticsImpl* strStats =
dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get());
if (strStats == nullptr) {
throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl");
}
+ ColumnWriter::add(rowBatch, offset, numValues);
+
+ char** data = charsBatch->data.data() + offset;
+ int64_t* length = charsBatch->length.data() + offset;
+ const char* notNull = charsBatch->hasNulls ?
+ charsBatch->notNull.data() + offset : nullptr;
+
+ uint64_t count = 0;
bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
@@ -1477,7 +1493,7 @@ namespace orc {
}
strStats->update(charData, static_cast<size_t>(length[i]));
- strStats->increase(1);
+ ++count;
} else if (!hasNull) {
hasNull = true;
}
@@ -1487,6 +1503,7 @@ namespace orc {
directLengthEncoder->add(length, numValues, notNull);
}
+ strStats->increase(count);
if (hasNull) {
strStats->setHasNull(true);
}
@@ -1513,23 +1530,25 @@ namespace orc {
void VarCharColumnWriter::add(ColumnVectorBatch& rowBatch,
uint64_t offset,
uint64_t numValues) {
- ColumnWriter::add(rowBatch, offset, numValues);
StringVectorBatch* charsBatch =
dynamic_cast<StringVectorBatch*>(&rowBatch);
if (charsBatch == nullptr) {
throw InvalidArgument("Failed to cast to StringVectorBatch");
}
- char* const* data = charsBatch->data.data() + offset;
- int64_t* length = charsBatch->length.data() + offset;
- const char* notNull = charsBatch->hasNulls ?
- charsBatch->notNull.data() + offset : nullptr;
-
StringColumnStatisticsImpl* strStats =
dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get());
if (strStats == nullptr) {
throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl");
}
+ ColumnWriter::add(rowBatch, offset, numValues);
+
+ char* const* data = charsBatch->data.data() + offset;
+ int64_t* length = charsBatch->length.data() + offset;
+ const char* notNull = charsBatch->hasNulls ?
+ charsBatch->notNull.data() + offset : nullptr;
+
+ uint64_t count = 0;
bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
@@ -1545,7 +1564,7 @@ namespace orc {
}
strStats->update(data[i], static_cast<size_t>(length[i]));
- strStats->increase(1);
+ ++count;
} else if (!hasNull) {
hasNull = true;
}
@@ -1555,6 +1574,7 @@ namespace orc {
directLengthEncoder->add(length, numValues, notNull);
}
+ strStats->increase(count);
if (hasNull) {
strStats->setHasNull(true);
}
@@ -1577,16 +1597,10 @@ namespace orc {
void BinaryColumnWriter::add(ColumnVectorBatch& rowBatch,
uint64_t offset,
uint64_t numValues) {
- ColumnWriter::add(rowBatch, offset, numValues);
-
StringVectorBatch* binBatch = dynamic_cast<StringVectorBatch*>(&rowBatch);
if (binBatch == nullptr) {
throw InvalidArgument("Failed to cast to StringVectorBatch");
}
- char** data = binBatch->data.data() + offset;
- int64_t* length = binBatch->length.data() + offset;
- const char* notNull = binBatch->hasNulls ?
- binBatch->notNull.data() + offset : nullptr;
BinaryColumnStatisticsImpl* binStats =
dynamic_cast<BinaryColumnStatisticsImpl*>(colIndexStatistics.get());
@@ -1594,6 +1608,14 @@ namespace orc {
throw InvalidArgument("Failed to cast to BinaryColumnStatisticsImpl");
}
+ ColumnWriter::add(rowBatch, offset, numValues);
+
+ char** data = binBatch->data.data() + offset;
+ int64_t* length = binBatch->length.data() + offset;
+ const char* notNull = binBatch->hasNulls ?
+ binBatch->notNull.data() + offset : nullptr;
+
+ uint64_t count = 0;
bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
uint64_t unsignedLength = static_cast<uint64_t>(length[i]);
@@ -1601,12 +1623,13 @@ namespace orc {
directDataStream->write(data[i], unsignedLength);
binStats->update(unsignedLength);
- binStats->increase(1);
+ ++count;
} else if (!hasNull) {
hasNull = true;
}
}
directLengthEncoder->add(length, numValues, notNull);
+ binStats->increase(count);
if (hasNull) {
binStats->setHasNull(true);
}
@@ -1689,29 +1712,32 @@ namespace orc {
void TimestampColumnWriter::add(ColumnVectorBatch& rowBatch,
uint64_t offset,
uint64_t numValues) {
- ColumnWriter::add(rowBatch, offset, numValues);
TimestampVectorBatch* tsBatch =
dynamic_cast<TimestampVectorBatch*>(&rowBatch);
if (tsBatch == nullptr) {
throw InvalidArgument("Failed to cast to TimestampVectorBatch");
}
- const char* notNull = tsBatch->hasNulls ?
- tsBatch->notNull.data() + offset : nullptr;
- int64_t *secs = tsBatch->data.data() + offset;
- int64_t *nanos = tsBatch->nanoseconds.data() + offset;
-
TimestampColumnStatisticsImpl* tsStats =
dynamic_cast<TimestampColumnStatisticsImpl*>(colIndexStatistics.get());
if (tsStats == nullptr) {
throw InvalidArgument("Failed to cast to TimestampColumnStatisticsImpl");
}
+
+ ColumnWriter::add(rowBatch, offset, numValues);
+
+ const char* notNull = tsBatch->hasNulls ?
+ tsBatch->notNull.data() + offset : nullptr;
+ int64_t *secs = tsBatch->data.data() + offset;
+ int64_t *nanos = tsBatch->nanoseconds.data() + offset;
+
+ uint64_t count = 0;
bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
if (notNull == nullptr || notNull[i]) {
// TimestampVectorBatch already stores data in UTC
int64_t millsUTC = secs[i] * 1000 + nanos[i] / 1000000;
- tsStats->increase(1);
+ ++count;
tsStats->update(millsUTC);
if (secs[i] < 0 && nanos[i] != 0) {
@@ -1724,6 +1750,7 @@ namespace orc {
hasNull = true;
}
}
+ tsStats->increase(count);
if (hasNull) {
tsStats->setHasNull(true);
}
@@ -1791,33 +1818,37 @@ namespace orc {
void DateColumnWriter::add(ColumnVectorBatch& rowBatch,
uint64_t offset,
uint64_t numValues) {
- ColumnWriter::add(rowBatch, offset, numValues);
const LongVectorBatch* longBatch =
dynamic_cast<const LongVectorBatch*>(&rowBatch);
if (longBatch == nullptr) {
throw InvalidArgument("Failed to cast to LongVectorBatch");
}
+ DateColumnStatisticsImpl* dateStats =
+ dynamic_cast<DateColumnStatisticsImpl*>(colIndexStatistics.get());
+ if (dateStats == nullptr) {
+ throw InvalidArgument("Failed to cast to DateColumnStatisticsImpl");
+ }
+
+ ColumnWriter::add(rowBatch, offset, numValues);
+
const int64_t* data = longBatch->data.data() + offset;
const char* notNull = longBatch->hasNulls ?
longBatch->notNull.data() + offset : nullptr;
rleEncoder->add(data, numValues, notNull);
- DateColumnStatisticsImpl* dateStats =
- dynamic_cast<DateColumnStatisticsImpl*>(colIndexStatistics.get());
- if (dateStats == nullptr) {
- throw InvalidArgument("Failed to cast to DateColumnStatisticsImpl");
- }
+ uint64_t count = 0;
bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
- dateStats->increase(1);
+ ++count;
dateStats->update(static_cast<int32_t>(data[i]));
} else if (!hasNull) {
hasNull = true;
}
}
+ dateStats->increase(count);
if (hasNull) {
dateStats->setHasNull(true);
}
@@ -1882,22 +1913,25 @@ namespace orc {
void Decimal64ColumnWriter::add(ColumnVectorBatch& rowBatch,
uint64_t offset,
uint64_t numValues) {
- ColumnWriter::add(rowBatch, offset, numValues);
const Decimal64VectorBatch* decBatch =
dynamic_cast<const Decimal64VectorBatch*>(&rowBatch);
if (decBatch == nullptr) {
throw InvalidArgument("Failed to cast to Decimal64VectorBatch");
}
- const char* notNull = decBatch->hasNulls ?
- decBatch->notNull.data() + offset : nullptr;
- const int64_t* values = decBatch->values.data() + offset;
DecimalColumnStatisticsImpl* decStats =
dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get());
if (decStats == nullptr) {
throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl");
}
+ ColumnWriter::add(rowBatch, offset, numValues);
+
+ const char* notNull = decBatch->hasNulls ?
+ decBatch->notNull.data() + offset : nullptr;
+ const int64_t* values = decBatch->values.data() + offset;
+
+ uint64_t count = 0;
bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
@@ -1914,13 +1948,13 @@ namespace orc {
}
}
valueStream->write(buffer, static_cast<size_t>(data - buffer));
-
- decStats->increase(1);
+ ++count;
decStats->update(Decimal(values[i], static_cast<int32_t>(scale)));
} else if (!hasNull) {
hasNull = true;
}
}
+ decStats->increase(count);
if (hasNull) {
decStats->setHasNull(true);
}
@@ -2003,24 +2037,27 @@ namespace orc {
void Decimal128ColumnWriter::add(ColumnVectorBatch& rowBatch,
uint64_t offset,
uint64_t numValues) {
- ColumnWriter::add(rowBatch, offset, numValues);
const Decimal128VectorBatch* decBatch =
dynamic_cast<const Decimal128VectorBatch*>(&rowBatch);
if (decBatch == nullptr) {
throw InvalidArgument("Failed to cast to Decimal128VectorBatch");
}
- const char* notNull = decBatch->hasNulls ?
- decBatch->notNull.data() + offset : nullptr;
- const Int128* values = decBatch->values.data() + offset;
DecimalColumnStatisticsImpl* decStats =
dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get());
if (decStats == nullptr) {
throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl");
}
+ ColumnWriter::add(rowBatch, offset, numValues);
+
+ const char* notNull = decBatch->hasNulls ?
+ decBatch->notNull.data() + offset : nullptr;
+ const Int128* values = decBatch->values.data() + offset;
+
// The current encoding of decimal columns stores the integer representation
// of the value as an unbounded length zigzag encoded base 128 varint.
+ uint64_t count = 0;
bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
@@ -2037,12 +2074,13 @@ namespace orc {
}
valueStream->write(buffer, static_cast<size_t>(data - buffer));
- decStats->increase(1);
+ ++count;
decStats->update(Decimal(values[i], static_cast<int32_t>(scale)));
} else if (!hasNull) {
hasNull = true;
}
}
+ decStats->increase(count);
if (hasNull) {
decStats->setHasNull(true);
}
@@ -2125,13 +2163,13 @@ namespace orc {
void ListColumnWriter::add(ColumnVectorBatch& rowBatch,
uint64_t offset,
uint64_t numValues) {
- ColumnWriter::add(rowBatch, offset, numValues);
-
ListVectorBatch* listBatch = dynamic_cast<ListVectorBatch*>(&rowBatch);
if (listBatch == nullptr) {
throw InvalidArgument("Failed to cast to ListVectorBatch");
}
+ ColumnWriter::add(rowBatch, offset, numValues);
+
int64_t* offsets = listBatch->offsets.data() + offset;
const char* notNull = listBatch->hasNulls ?
listBatch->notNull.data() + offset : nullptr;
@@ -2154,14 +2192,16 @@ namespace orc {
if (!notNull) {
colIndexStatistics->increase(numValues);
} else {
+ uint64_t count = 0;
bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
- colIndexStatistics->increase(1);
+ ++count;
} else if (!hasNull) {
hasNull = true;
}
}
+ colIndexStatistics->increase(count);
if (hasNull) {
colIndexStatistics->setHasNull(true);
}
@@ -2344,13 +2384,13 @@ namespace orc {
void MapColumnWriter::add(ColumnVectorBatch& rowBatch,
uint64_t offset,
uint64_t numValues) {
- ColumnWriter::add(rowBatch, offset, numValues);
-
MapVectorBatch* mapBatch = dynamic_cast<MapVectorBatch*>(&rowBatch);
if (mapBatch == nullptr) {
throw InvalidArgument("Failed to cast to MapVectorBatch");
}
+ ColumnWriter::add(rowBatch, offset, numValues);
+
int64_t* offsets = mapBatch->offsets.data() + offset;
const char* notNull = mapBatch->hasNulls ?
mapBatch->notNull.data() + offset : nullptr;
@@ -2377,14 +2417,16 @@ namespace orc {
if (!notNull) {
colIndexStatistics->increase(numValues);
} else {
+ uint64_t count = 0;
bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
- colIndexStatistics->increase(1);
+ ++count;
} else if (!hasNull) {
hasNull = true;
}
}
+ colIndexStatistics->increase(count);
if (hasNull) {
colIndexStatistics->setHasNull(true);
}
@@ -2595,13 +2637,13 @@ namespace orc {
void UnionColumnWriter::add(ColumnVectorBatch& rowBatch,
uint64_t offset,
uint64_t numValues) {
- ColumnWriter::add(rowBatch, offset, numValues);
-
UnionVectorBatch* unionBatch = dynamic_cast<UnionVectorBatch*>(&rowBatch);
if (unionBatch == nullptr) {
throw InvalidArgument("Failed to cast to UnionVectorBatch");
}
+ ColumnWriter::add(rowBatch, offset, numValues);
+
const char* notNull = unionBatch->hasNulls ?
unionBatch->notNull.data() + offset : nullptr;
unsigned char * tags = unionBatch->tags.data() + offset;
@@ -2632,14 +2674,16 @@ namespace orc {
if (!notNull) {
colIndexStatistics->increase(numValues);
} else {
+ uint64_t count = 0;
bool hasNull = false;
for (uint64_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
- colIndexStatistics->increase(1);
+ ++count;
} else if (!hasNull) {
hasNull = true;
}
}
+ colIndexStatistics->increase(count);
if (hasNull) {
colIndexStatistics->setHasNull(true);
}
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
> [C++] Code improvements in ColumnWriter
> ---------------------------------------
>
> Key: ORC-443
> URL: https://issues.apache.org/jira/browse/ORC-443
> Project: ORC
> Issue Type: Improvement
> Components: C++
> Reporter: Fang Zheng
> Assignee: Fang Zheng
> Priority: Minor
>
> A few changes to ColumnWriter and its derived classes:
> 1. In each add() function, re-order the code so that input parameters are
> verified before any internal state is modified.
> 2. In each add() function, replace the per-value calls to
> colIndexStatistics->increase(1) inside the loop with a single
> increase(count) call after the loop. Many of those are virtual function calls.
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)