[ 
https://issues.apache.org/jira/browse/ORC-442?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16706507#comment-16706507
 ] 

ASF GitHub Bot commented on ORC-442:
------------------------------------

majetideepak closed pull request #343: ORC-442: [C++] Code improvements in 
Statistics and Writer.
URL: https://github.com/apache/orc/pull/343
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/c++/include/orc/Statistics.hh b/c++/include/orc/Statistics.hh
index 5b894789d8..654956d860 100644
--- a/c++/include/orc/Statistics.hh
+++ b/c++/include/orc/Statistics.hh
@@ -34,7 +34,7 @@ namespace orc {
 
     /**
      * Get the number of values in this column. It will differ from the number
-     * of rows because of NULL values and repeated values.
+     * of rows because of NULL values.
      * @return the number of values
      */
     virtual uint64_t getNumberOfValues() const = 0;
diff --git a/c++/src/ColumnWriter.hh b/c++/src/ColumnWriter.hh
index 14fc54a295..cee4e01318 100644
--- a/c++/src/ColumnWriter.hh
+++ b/c++/src/ColumnWriter.hh
@@ -37,7 +37,7 @@ namespace orc {
     /**
      * Get the stream for the given column/kind in this stripe.
      * @param kind the kind of the stream
-     * @return the buffer output stream
+     * @return the buffered output stream
      */
     virtual std::unique_ptr<BufferedOutputStream>
                     createStream(proto::Stream_Kind kind) const = 0;
@@ -98,42 +98,45 @@ namespace orc {
                      uint64_t offset,
                      uint64_t numValues);
     /**
-     * Flush column writer output steams
-     * @param streams vector to store generated stream by flush()
+     * Flush column writer output streams.
+     * @param streams vector to store streams generated by flush()
      */
     virtual void flush(std::vector<proto::Stream>& streams);
 
     /**
-     * Get estimated sized of buffer used
+     * Get estimated size of buffer used.
+     * @return estimated size of buffer used
      */
     virtual uint64_t getEstimatedSize() const;
 
     /**
      * Get the encoding used by the writer for this column.
-     * ColumnEncoding info is pushed into the vector
+     * @param encodings vector to store the returned ColumnEncoding info
      */
     virtual void getColumnEncoding(
       std::vector<proto::ColumnEncoding>& encodings) const = 0;
 
     /**
-     * Get the stripe statistics for this column
+     * Get the stripe statistics for this column.
+     * @param stats vector to store the returned stripe statistics
      */
     virtual void getStripeStatistics(
       std::vector<proto::ColumnStatistics>& stats) const;
 
     /**
-     * Get the file statistics for this column
+     * Get the file statistics for this column.
+     * @param stats vector to store the returned file statistics
      */
     virtual void getFileStatistics(
       std::vector<proto::ColumnStatistics>& stats) const;
 
     /**
-     * Merge index stats into stripe stats and reset index stats
+     * Merge index stats into stripe stats and reset index stats.
      */
     virtual void mergeRowGroupStatsIntoStripeStats();
 
     /**
-     * Merge stripe stats into file stats and reset stripe stats
+     * Merge stripe stats into file stats and reset stripe stats.
      */
     virtual void mergeStripeStatsIntoFileStats();
 
@@ -146,29 +149,29 @@ namespace orc {
     virtual void createRowIndexEntry();
 
     /**
-     * Write row index streams for this column
+     * Write row index streams for this column.
      * @param streams output list of ROW_INDEX streams
      */
     virtual void writeIndex(std::vector<proto::Stream> &streams) const;
 
     /**
-     * Record positions for index
+     * Record positions for index.
      *
-     * This function is called by createRowIndexEntry() and ColumnWrtier's
+     * This function is called by createRowIndexEntry() and ColumnWriter's
      * constructor. So base classes do not need to call inherited classes'
      * recordPosition() function.
      */
     virtual void recordPosition() const;
 
     /**
-     * Reset positions for index
+     * Reset positions for index.
      */
     virtual void reset();
 
   protected:
     /**
      * Utility function to translate ColumnStatistics into protobuf form and
-     * add it to output list
+     * add it to output list.
      * @param statsList output list for protobuf stats
      * @param stats ColumnStatistics to be transformed and added
      */
diff --git a/c++/src/Compression.hh b/c++/src/Compression.hh
index 18f7cfd5ee..ff79377d83 100644
--- a/c++/src/Compression.hh
+++ b/c++/src/Compression.hh
@@ -43,7 +43,7 @@ namespace orc {
    * @param outStream the output stream that is the underlying target
    * @param strategy compression strategy
    * @param bufferCapacity compression stream buffer total capacity
-   * @param compressionBlockSize compresssion buffer block size
+   * @param compressionBlockSize compression buffer block size
    * @param pool the memory pool
    */
   std::unique_ptr<BufferedOutputStream>
diff --git a/c++/src/Statistics.cc b/c++/src/Statistics.cc
index a72560e27b..645ae31ef6 100644
--- a/c++/src/Statistics.cc
+++ b/c++/src/Statistics.cc
@@ -64,7 +64,7 @@ namespace orc {
   }
 
   StatisticsImpl::~StatisticsImpl() {
-    for(std::list<ColumnStatistics*>::iterator ptr = colStats.begin();
+    for(std::vector<ColumnStatistics*>::iterator ptr = colStats.begin();
         ptr != colStats.end();
         ++ptr) {
       delete *ptr;
diff --git a/c++/src/Statistics.hh b/c++/src/Statistics.hh
index d8cc7652ac..0daddfe504 100644
--- a/c++/src/Statistics.hh
+++ b/c++/src/Statistics.hh
@@ -167,7 +167,7 @@ namespace orc {
    };
 
   typedef InternalStatisticsImpl<char> InternalCharStatistics;
-  typedef InternalStatisticsImpl<uint64_t> InternalBooleanStatistics;
+  typedef InternalStatisticsImpl<char> InternalBooleanStatistics;
   typedef InternalStatisticsImpl<int64_t> InternalIntegerStatistics;
   typedef InternalStatisticsImpl<int32_t> InternalDateStatistics;
   typedef InternalStatisticsImpl<double> InternalDoubleStatistics;
@@ -433,7 +433,7 @@ namespace orc {
                << getFalseCount() << ")" << std::endl;
       } else {
         buffer << "(true: not defined; false: not defined)" << std::endl;
-        buffer << "True and false count are not defined" << std::endl;
+        buffer << "True and false counts are not defined" << std::endl;
       }
       return buffer.str();
     }
@@ -1119,8 +1119,9 @@ namespace orc {
     void update(const char* value, size_t length) {
       if (value != nullptr) {
         if (!_stats.hasMinimum()) {
-          setMinimum(std::string(value, value + length));
-          setMaximum(std::string(value, value + length));
+          std::string tempStr(value, value + length);
+          setMinimum(tempStr);
+          setMaximum(tempStr);
         } else {
           // update min
           int minCmp = strncmp(_stats.getMinimum().c_str(),
@@ -1385,7 +1386,7 @@ namespace orc {
 
   class StatisticsImpl: public Statistics {
   private:
-    std::list<ColumnStatistics*> colStats;
+    std::vector<ColumnStatistics*> colStats;
 
     // DELIBERATELY NOT IMPLEMENTED
     StatisticsImpl(const StatisticsImpl&);
@@ -1399,9 +1400,7 @@ namespace orc {
 
     virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId
                                                         ) const override {
-      std::list<ColumnStatistics*>::const_iterator it = colStats.begin();
-      std::advance(it, static_cast<int64_t>(columnId));
-      return *it;
+      return colStats[columnId];
     }
 
     virtual ~StatisticsImpl() override;
diff --git a/c++/src/Writer.cc b/c++/src/Writer.cc
index 9a7a777cc2..d3606f5fe5 100644
--- a/c++/src/Writer.cc
+++ b/c++/src/Writer.cc
@@ -333,8 +333,9 @@ namespace orc {
 
   void WriterImpl::init() {
     // Write file header
-    outStream->write(WriterImpl::magicId, strlen(WriterImpl::magicId));
-    currentOffset += strlen(WriterImpl::magicId);
+    const static size_t magicIdLength = strlen(WriterImpl::magicId);
+    outStream->write(WriterImpl::magicId, magicIdLength);
+    currentOffset += magicIdLength;
 
     // Initialize file footer
     fileFooter.set_headerlength(currentOffset);


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> [C++] Code improvements in Statistics and Writer
> ------------------------------------------------
>
>                 Key: ORC-442
>                 URL: https://issues.apache.org/jira/browse/ORC-442
>             Project: ORC
>          Issue Type: Improvement
>          Components: C++
>            Reporter: Fang Zheng
>            Assignee: Fang Zheng
>            Priority: Minor
>
> A few code changes in Statistics and Writer classes:
> 1. Change StatisticsImpl to use vector instead of list for storing 
> ColumnStatistics. Because the required operations are push_back() in ctor, 
> iteration in dtor, and random element access in getColumnStatistics(), and 
> list does not support random access in constant time, vector would be more 
> appropriate than list.
> 2.  InternalBooleanStatistics is currently typedef-ed as 
> InternalStatisticsImpl<uint64_t>. Since min/max/sum does not apply to 
> BooleanColumnStatistics, we should define InternalBooleanStatistics to be 
> InternalStatisticsImpl<char> to save 21 bytes per instance.
> 3. Misc. changes to ColumnWriter.hh, Writer.cc, Compression.hh, and 
> Statistics.hh to fix typos in Doxygen and reduce object copies.
> Please see PR for details.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to