wgtmac commented on code in PR #2048:
URL: https://github.com/apache/orc/pull/2048#discussion_r1808970604


##########
c++/include/orc/OrcFile.hh:
##########
@@ -153,4 +179,4 @@ namespace orc {
                                        const WriterOptions& options);
 }  // namespace orc
 
-#endif
+#endif

Review Comment:
   Please avoid unintentional change.



##########
c++/src/MemoryPool.cc:
##########
@@ -86,10 +86,6 @@ namespace orc {
       for (uint64_t i = currentSize_; i > newSize; --i) {
         (buf_ + i - 1)->~T();
       }
-    } else if (newSize > currentSize_) {
-      for (uint64_t i = currentSize_; i < newSize; ++i) {
-        new (buf_ + i) T();

Review Comment:
   I don't think we can remove these lines.



##########
c++/include/orc/OrcFile.hh:
##########
@@ -36,6 +37,20 @@ namespace orc {
    */
   class InputStream {
    public:
+    using Buffer = DataBuffer<char>;
+    using BufferPtr = std::shared_ptr<Buffer>;
+
+    struct BufferSlice {

Review Comment:
   If `BufferSlice ` is only used internally, please do not expose it in the 
public header.



##########
c++/include/orc/Reader.hh:
##########
@@ -605,6 +612,26 @@ namespace orc {
      */
     virtual std::map<uint32_t, BloomFilterIndex> getBloomFilters(
         uint32_t stripeIndex, const std::set<uint32_t>& included) const = 0;
+
+    /**
+     * Get the input stream for the ORC file.
+     */
+    virtual InputStream* getStream() const = 0;
+
+    /**
+     * Get the footer of the ORC file.
+     */
+    virtual const proto::Footer* getFooter() const = 0;

Review Comment:
   We cannot expose the internal protobuf structure.



##########
c++/src/StripeStream.hh:
##########
@@ -45,6 +46,7 @@ namespace orc {
     InputStream& input_;
     const Timezone& writerTimezone_;
     const Timezone& readerTimezone_;
+    std::shared_ptr<ReadRangeCache> cachedSource;

Review Comment:
   ```suggestion
       std::shared_ptr<ReadRangeCache> readCache_;
   ```
   
   We don't need to copy the name `source` from Arrow.



##########
c++/src/MemoryPool.cc:
##########
@@ -134,9 +130,6 @@ namespace orc {
   template <>
   void DataBuffer<char>::resize(uint64_t newSize) {
     reserve(newSize);
-    if (newSize > currentSize_) {

Review Comment:
   Lines below seem to be unrelated changes. Could you please revert them? I 
remember there was an effort to remove these but caused issues to the 
downstream.



##########
c++/src/StripeStream.hh:
##########
@@ -19,6 +19,7 @@
 #ifndef ORC_STRIPE_STREAM_HH
 #define ORC_STRIPE_STREAM_HH
 
+#include "io/Cache.hh"

Review Comment:
   What about using forward declaration of `ReadRangeCache` instead of adding 
the header file?



##########
c++/include/orc/OrcFile.hh:
##########
@@ -58,6 +73,17 @@ namespace orc {
      */
     virtual void read(void* buf, uint64_t length, uint64_t offset) = 0;
 
+    /**
+     * Read data asynchronously.
+     * @param offset the position in the stream to read from.
+     * @param length the number of bytes to read.
+     * @return a future that will be set to the buffer when the read is 
complete.
+     */
+    virtual std::future<BufferPtr> readAsync(uint64_t /*offset*/, uint64_t 
/*length*/,

Review Comment:
   Should we match the signature of `read` function above, which uses an 
externally provided buffer?



##########
c++/src/Reader.hh:
##########
@@ -19,6 +19,7 @@
 #ifndef ORC_READER_IMPL_HH
 #define ORC_READER_IMPL_HH
 
+#include "io/Cache.hh"

Review Comment:
   Move it below line 28.



##########
c++/src/io/Cache.hh:
##########
@@ -0,0 +1,208 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <orc/MemoryPool.hh>
+#include <orc/OrcFile.hh>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <future>
+#include <utility>
+#include <vector>
+
+namespace orc {
+  class InputStream;
+
+  struct CacheOptions {
+    /// The maximum distance in bytes between two consecutive
+    /// ranges; beyond this value, ranges are not combined
+    uint64_t holeSizeLimit = 8192;
+
+    /// The maximum size in bytes of a combined range; if
+    /// combining two consecutive ranges would produce a range of a
+    /// size greater than this, they are not combined
+    uint64_t rangeSizeLimit = 32 * 1024 * 1024;
+  };
+
+  struct ReadRange {
+    uint64_t offset;
+    uint64_t length;
+
+    ReadRange() = default;
+    ReadRange(uint64_t offset, uint64_t length) : offset(offset), 
length(length) {}
+
+    friend bool operator==(const ReadRange& left, const ReadRange& right) {
+      return (left.offset == right.offset && left.length == right.length);
+    }
+    friend bool operator!=(const ReadRange& left, const ReadRange& right) {
+      return !(left == right);
+    }
+
+    bool contains(const ReadRange& other) const {
+      return (offset <= other.offset && offset + length >= other.offset + 
other.length);
+    }
+  };
+
+  struct ReadRangeCombiner {
+    std::vector<ReadRange> coalesce(std::vector<ReadRange> ranges) const {

Review Comment:
   Move the implementation detail to the cc file?



##########
c++/include/orc/Reader.hh:
##########
@@ -605,6 +612,26 @@ namespace orc {
      */
     virtual std::map<uint32_t, BloomFilterIndex> getBloomFilters(
         uint32_t stripeIndex, const std::set<uint32_t>& included) const = 0;
+
+    /**
+     * Get the input stream for the ORC file.
+     */
+    virtual InputStream* getStream() const = 0;
+
+    /**
+     * Get the footer of the ORC file.
+     */
+    virtual const proto::Footer* getFooter() const = 0;
+
+    /**
+     * Get the schema of the ORC file.
+     */
+    virtual const proto::Metadata* getMetadata() const = 0;
+
+    virtual void preBuffer(const std::vector<int>& stripes, const 
std::list<uint64_t>& includeTypes,

Review Comment:
   Is it possible to call these functions internally instead of user 
intervention? Reader has the knowledge of selected stripes (from offset/length 
reader options) and selected columns.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to