[GitHub] orc pull request #149: ORC-224: Implement column writers of primitive types
Github user jamesclampffer commented on a diff in the pull request: https://github.com/apache/orc/pull/149#discussion_r140077825 --- Diff: c++/src/ColumnWriter.cc --- @@ -468,25 +472,1099 @@ namespace orc { rleEncoder->recordPosition(rowIndexPosition.get()); } - std::unique_ptr buildWriter( -const Type& type, -const StreamsFactory& factory, -const WriterOptions& options) { -switch (static_cast(type.getKind())) { - case STRUCT: -return std::unique_ptr( - new StructColumnWriter( - type, - factory, - options)); - case INT: - case LONG: - case SHORT: -return std::unique_ptr( - new IntegerColumnWriter( - type, - factory, - options)); + class ByteColumnWriter : public ColumnWriter { + public: +ByteColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + +virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues) override; + +virtual void flush(std::vector& streams) override; + +virtual uint64_t getEstimatedSize() const override; + +virtual void getColumnEncoding( +std::vector& encodings) const override; + +virtual void recordPosition() const override; + + private: +std::unique_ptr byteRleEncoder; + }; + + ByteColumnWriter::ByteColumnWriter( +const Type& type, +const StreamsFactory& factory, +const WriterOptions& options) : + ColumnWriter(type, factory, options) { +std::unique_ptr dataStream = + factory.createStream(proto::Stream_Kind_DATA); +byteRleEncoder = createByteRleEncoder(std::move(dataStream)); + +if (enableIndex) { + recordPosition(); +} + } + + void ByteColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues) { +ColumnWriter::add(rowBatch, offset, numValues); + +LongVectorBatch& byteBatch = + dynamic_cast(rowBatch); + +int64_t* data = byteBatch.data.data() + offset; +const char* notNull = byteBatch.hasNulls ? 
+ byteBatch.notNull.data() + offset : nullptr; + +char* byteData = reinterpret_cast (data); +for (uint64_t i = 0; i < numValues; ++i) { + byteData[i] = static_cast(data[i]); +} +byteRleEncoder->add(byteData, numValues, notNull); + +IntegerColumnStatisticsImpl* intStats = + dynamic_cast (colIndexStatistics.get()); +bool hasNull = false; +for (uint64_t i = 0; i < numValues; ++i) { + if (notNull == nullptr || notNull[i]) { +intStats->increase(1); +intStats->update(static_cast(byteData[i]), 1); + } else if (!hasNull) { +hasNull = true; + } +} +intStats->setHasNull(hasNull); + } + + void ByteColumnWriter::flush(std::vector& streams) { +ColumnWriter::flush(streams); + +proto::Stream stream; +stream.set_kind(proto::Stream_Kind_DATA); +stream.set_column(static_cast(columnId)); +stream.set_length(byteRleEncoder->flush()); +streams.push_back(stream); + } + + uint64_t ByteColumnWriter::getEstimatedSize() const { +uint64_t size = ColumnWriter::getEstimatedSize(); +size += byteRleEncoder->getBufferSize(); +return size; + } + + void ByteColumnWriter::getColumnEncoding( +std::vector& encodings) const { +proto::ColumnEncoding encoding; +encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); +encoding.set_dictionarysize(0); +encodings.push_back(encoding); + } + + void ByteColumnWriter::recordPosition() const { +ColumnWriter::recordPosition(); +byteRleEncoder->recordPosition(rowIndexPosition.get()); + } + + class BooleanColumnWriter : public ColumnWriter { + public: +BooleanColumnWriter(const Type& type, +const StreamsFactory& factory, +const WriterOptions&
[GitHub] orc pull request #149: ORC-224: Implement column writers of primitive types
Github user jamesclampffer commented on a diff in the pull request: https://github.com/apache/orc/pull/149#discussion_r140067969 --- Diff: c++/src/ColumnWriter.cc --- @@ -468,25 +472,1099 @@ namespace orc { rleEncoder->recordPosition(rowIndexPosition.get()); } - std::unique_ptr buildWriter( -const Type& type, -const StreamsFactory& factory, -const WriterOptions& options) { -switch (static_cast(type.getKind())) { - case STRUCT: -return std::unique_ptr( - new StructColumnWriter( - type, - factory, - options)); - case INT: - case LONG: - case SHORT: -return std::unique_ptr( - new IntegerColumnWriter( - type, - factory, - options)); + class ByteColumnWriter : public ColumnWriter { + public: +ByteColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + +virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues) override; + +virtual void flush(std::vector& streams) override; + +virtual uint64_t getEstimatedSize() const override; + +virtual void getColumnEncoding( +std::vector& encodings) const override; + +virtual void recordPosition() const override; + + private: +std::unique_ptr byteRleEncoder; + }; + + ByteColumnWriter::ByteColumnWriter( +const Type& type, +const StreamsFactory& factory, +const WriterOptions& options) : + ColumnWriter(type, factory, options) { +std::unique_ptr dataStream = + factory.createStream(proto::Stream_Kind_DATA); +byteRleEncoder = createByteRleEncoder(std::move(dataStream)); + +if (enableIndex) { + recordPosition(); +} + } + + void ByteColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues) { +ColumnWriter::add(rowBatch, offset, numValues); + +LongVectorBatch& byteBatch = + dynamic_cast(rowBatch); + +int64_t* data = byteBatch.data.data() + offset; +const char* notNull = byteBatch.hasNulls ? 
+ byteBatch.notNull.data() + offset : nullptr; + +char* byteData = reinterpret_cast (data); +for (uint64_t i = 0; i < numValues; ++i) { + byteData[i] = static_cast(data[i]); +} +byteRleEncoder->add(byteData, numValues, notNull); + +IntegerColumnStatisticsImpl* intStats = + dynamic_cast (colIndexStatistics.get()); +bool hasNull = false; +for (uint64_t i = 0; i < numValues; ++i) { + if (notNull == nullptr || notNull[i]) { +intStats->increase(1); +intStats->update(static_cast(byteData[i]), 1); + } else if (!hasNull) { +hasNull = true; + } +} +intStats->setHasNull(hasNull); + } + + void ByteColumnWriter::flush(std::vector& streams) { +ColumnWriter::flush(streams); + +proto::Stream stream; +stream.set_kind(proto::Stream_Kind_DATA); +stream.set_column(static_cast(columnId)); +stream.set_length(byteRleEncoder->flush()); +streams.push_back(stream); + } + + uint64_t ByteColumnWriter::getEstimatedSize() const { +uint64_t size = ColumnWriter::getEstimatedSize(); +size += byteRleEncoder->getBufferSize(); +return size; + } + + void ByteColumnWriter::getColumnEncoding( +std::vector& encodings) const { +proto::ColumnEncoding encoding; +encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); +encoding.set_dictionarysize(0); +encodings.push_back(encoding); + } + + void ByteColumnWriter::recordPosition() const { +ColumnWriter::recordPosition(); +byteRleEncoder->recordPosition(rowIndexPosition.get()); + } + + class BooleanColumnWriter : public ColumnWriter { + public: +BooleanColumnWriter(const Type& type, +const StreamsFactory& factory, +const WriterOptions&
[GitHub] orc issue #151: ORC-226 Support getWriterId in c++ reader interface
Github user xndai commented on the issue: https://github.com/apache/orc/pull/151 Squash commit. Thanks @ajayyadava @majetideepak ---
[GitHub] orc issue #134: Orc 17
Github user AnatoliShein commented on the issue: https://github.com/apache/orc/pull/134 ORC can now be built without LIBHDFSPP by passing the option to cmake, like this: `cmake -DBUILD_LIBHDFSPP=off ..` ---
[GitHub] orc pull request #161: ORC-210: Add encoding for Double, Float.
Github user sundapeng commented on a diff in the pull request: https://github.com/apache/orc/pull/161#discussion_r139888584 --- Diff: java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java --- @@ -688,17 +700,23 @@ protected void skipRows(long items) throws IOException { } public static class DoubleTreeReader extends TreeReader { -protected InStream stream; -private final SerializationUtils utils; +private DoubleReader reader; --- End diff -- Hi @pudidic , thank you for the patch! I found that `reader` should be `protected` here for Hive integration; see [EncodedTreeReaderFactory](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedTreeReaderFactory.java#L950). Also, since this change would remove a `protected` variable, could we reuse `stream` instead? ---