[GitHub] orc pull request #149: ORC-224: Implement column writers of primitive types

2017-09-20 Thread jamesclampffer
Github user jamesclampffer commented on a diff in the pull request:

https://github.com/apache/orc/pull/149#discussion_r140077825
  
--- Diff: c++/src/ColumnWriter.cc ---
@@ -468,25 +472,1099 @@ namespace orc {
 rleEncoder->recordPosition(rowIndexPosition.get());
   }
 
-  std::unique_ptr buildWriter(
-const Type& type,
-const StreamsFactory& factory,
-const WriterOptions& options) {
-switch (static_cast(type.getKind())) {
-  case STRUCT:
-return std::unique_ptr(
-  new StructColumnWriter(
- type,
- factory,
- options));
-  case INT:
-  case LONG:
-  case SHORT:
-return std::unique_ptr(
-  new IntegerColumnWriter(
-  type,
-  factory,
-  options));
+  class ByteColumnWriter : public ColumnWriter {
+  public:
+ByteColumnWriter(const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options);
+
+virtual void add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues) override;
+
+virtual void flush(std::vector& streams) override;
+
+virtual uint64_t getEstimatedSize() const override;
+
+virtual void getColumnEncoding(
+std::vector& encodings) const override;
+
+virtual void recordPosition() const override;
+
+  private:
+std::unique_ptr byteRleEncoder;
+  };
+
+  ByteColumnWriter::ByteColumnWriter(
+const Type& type,
+const StreamsFactory& factory,
+const WriterOptions& options) :
+ ColumnWriter(type, factory, options) {
+std::unique_ptr dataStream =
+  factory.createStream(proto::Stream_Kind_DATA);
+byteRleEncoder = createByteRleEncoder(std::move(dataStream));
+
+if (enableIndex) {
+  recordPosition();
+}
+  }
+
+  void ByteColumnWriter::add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues) {
+ColumnWriter::add(rowBatch, offset, numValues);
+
+LongVectorBatch& byteBatch =
+   dynamic_cast(rowBatch);
+
+int64_t* data = byteBatch.data.data() + offset;
+const char* notNull = byteBatch.hasNulls ?
+  byteBatch.notNull.data() + offset : nullptr;
+
+char* byteData = reinterpret_cast(data);
+for (uint64_t i = 0; i < numValues; ++i) {
+  byteData[i] = static_cast(data[i]);
+}
+byteRleEncoder->add(byteData, numValues, notNull);
+
+IntegerColumnStatisticsImpl* intStats =
+  dynamic_cast(colIndexStatistics.get());
+bool hasNull = false;
+for (uint64_t i = 0; i < numValues; ++i) {
+  if (notNull == nullptr || notNull[i]) {
+intStats->increase(1);
+intStats->update(static_cast(byteData[i]), 1);
+  } else if (!hasNull) {
+hasNull = true;
+  }
+}
+intStats->setHasNull(hasNull);
+  }
+
+  void ByteColumnWriter::flush(std::vector& streams) {
+ColumnWriter::flush(streams);
+
+proto::Stream stream;
+stream.set_kind(proto::Stream_Kind_DATA);
+stream.set_column(static_cast(columnId));
+stream.set_length(byteRleEncoder->flush());
+streams.push_back(stream);
+  }
+
+  uint64_t ByteColumnWriter::getEstimatedSize() const {
+uint64_t size = ColumnWriter::getEstimatedSize();
+size += byteRleEncoder->getBufferSize();
+return size;
+  }
+
+  void ByteColumnWriter::getColumnEncoding(
+std::vector& encodings) const {
+proto::ColumnEncoding encoding;
+encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
+encoding.set_dictionarysize(0);
+encodings.push_back(encoding);
+  }
+
+  void ByteColumnWriter::recordPosition() const {
+ColumnWriter::recordPosition();
+byteRleEncoder->recordPosition(rowIndexPosition.get());
+  }
+
+  class BooleanColumnWriter : public ColumnWriter {
+  public:
+BooleanColumnWriter(const Type& type,
+const StreamsFactory& factory,
+const WriterOptions& 

[GitHub] orc pull request #149: ORC-224: Implement column writers of primitive types

2017-09-20 Thread jamesclampffer
Github user jamesclampffer commented on a diff in the pull request:

https://github.com/apache/orc/pull/149#discussion_r140067969
  
--- Diff: c++/src/ColumnWriter.cc ---
@@ -468,25 +472,1099 @@ namespace orc {
 rleEncoder->recordPosition(rowIndexPosition.get());
   }
 
-  std::unique_ptr buildWriter(
-const Type& type,
-const StreamsFactory& factory,
-const WriterOptions& options) {
-switch (static_cast(type.getKind())) {
-  case STRUCT:
-return std::unique_ptr(
-  new StructColumnWriter(
- type,
- factory,
- options));
-  case INT:
-  case LONG:
-  case SHORT:
-return std::unique_ptr(
-  new IntegerColumnWriter(
-  type,
-  factory,
-  options));
+  class ByteColumnWriter : public ColumnWriter {
+  public:
+ByteColumnWriter(const Type& type,
+ const StreamsFactory& factory,
+ const WriterOptions& options);
+
+virtual void add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues) override;
+
+virtual void flush(std::vector& streams) override;
+
+virtual uint64_t getEstimatedSize() const override;
+
+virtual void getColumnEncoding(
+std::vector& encodings) const override;
+
+virtual void recordPosition() const override;
+
+  private:
+std::unique_ptr byteRleEncoder;
+  };
+
+  ByteColumnWriter::ByteColumnWriter(
+const Type& type,
+const StreamsFactory& factory,
+const WriterOptions& options) :
+ ColumnWriter(type, factory, options) {
+std::unique_ptr dataStream =
+  factory.createStream(proto::Stream_Kind_DATA);
+byteRleEncoder = createByteRleEncoder(std::move(dataStream));
+
+if (enableIndex) {
+  recordPosition();
+}
+  }
+
+  void ByteColumnWriter::add(ColumnVectorBatch& rowBatch,
+ uint64_t offset,
+ uint64_t numValues) {
+ColumnWriter::add(rowBatch, offset, numValues);
+
+LongVectorBatch& byteBatch =
+   dynamic_cast(rowBatch);
+
+int64_t* data = byteBatch.data.data() + offset;
+const char* notNull = byteBatch.hasNulls ?
+  byteBatch.notNull.data() + offset : nullptr;
+
+char* byteData = reinterpret_cast(data);
+for (uint64_t i = 0; i < numValues; ++i) {
+  byteData[i] = static_cast(data[i]);
+}
+byteRleEncoder->add(byteData, numValues, notNull);
+
+IntegerColumnStatisticsImpl* intStats =
+  dynamic_cast(colIndexStatistics.get());
+bool hasNull = false;
+for (uint64_t i = 0; i < numValues; ++i) {
+  if (notNull == nullptr || notNull[i]) {
+intStats->increase(1);
+intStats->update(static_cast(byteData[i]), 1);
+  } else if (!hasNull) {
+hasNull = true;
+  }
+}
+intStats->setHasNull(hasNull);
+  }
+
+  void ByteColumnWriter::flush(std::vector& streams) {
+ColumnWriter::flush(streams);
+
+proto::Stream stream;
+stream.set_kind(proto::Stream_Kind_DATA);
+stream.set_column(static_cast(columnId));
+stream.set_length(byteRleEncoder->flush());
+streams.push_back(stream);
+  }
+
+  uint64_t ByteColumnWriter::getEstimatedSize() const {
+uint64_t size = ColumnWriter::getEstimatedSize();
+size += byteRleEncoder->getBufferSize();
+return size;
+  }
+
+  void ByteColumnWriter::getColumnEncoding(
+std::vector& encodings) const {
+proto::ColumnEncoding encoding;
+encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
+encoding.set_dictionarysize(0);
+encodings.push_back(encoding);
+  }
+
+  void ByteColumnWriter::recordPosition() const {
+ColumnWriter::recordPosition();
+byteRleEncoder->recordPosition(rowIndexPosition.get());
+  }
+
+  class BooleanColumnWriter : public ColumnWriter {
+  public:
+BooleanColumnWriter(const Type& type,
+const StreamsFactory& factory,
+const WriterOptions& 

[GitHub] orc issue #151: ORC-226 Support getWriterId in c++ reader interface

2017-09-20 Thread xndai
Github user xndai commented on the issue:

https://github.com/apache/orc/pull/151
  
Squash commit. Thanks @ajayyadava @majetideepak 


---


[GitHub] orc issue #134: Orc 17

2017-09-20 Thread AnatoliShein
Github user AnatoliShein commented on the issue:

https://github.com/apache/orc/pull/134
  
Now ORC can be built without LIBHDFSPP like this:
cmake -DBUILD_LIBHDFSPP=off ..


---


[GitHub] orc pull request #161: ORC-210: Add encoding for Double, Float.

2017-09-20 Thread sundapeng
Github user sundapeng commented on a diff in the pull request:

https://github.com/apache/orc/pull/161#discussion_r139888584
  
--- Diff: java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java ---
@@ -688,17 +700,23 @@ protected void skipRows(long items) throws IOException {
   }
 
   public static class DoubleTreeReader extends TreeReader {
-protected InStream stream;
-private final SerializationUtils utils;
+private DoubleReader reader;
--- End diff --

Hi @pudidic, thank you for the patch!
I found that `reader` should be `protected` here for Hive integration; see
[EncodedTreeReaderFactory](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/io/orc/encoded/EncodedTreeReaderFactory.java#L950).
Also, since this change would remove a `protected` variable, could we reuse
`stream`?


---