wgtmac commented on code in PR #2048:
URL: https://github.com/apache/orc/pull/2048#discussion_r1851600106
##########
c++/include/orc/Reader.hh:
##########
@@ -40,6 +40,17 @@ namespace orc {
struct ReaderOptionsPrivate;
struct RowReaderOptionsPrivate;
+ struct CacheOptions {
+ /// The maximum distance in bytes between two consecutive
+ /// ranges; beyond this value, ranges are not combined
Review Comment:
```suggestion
// The maximum distance in bytes between two consecutive
// ranges; beyond this value, ranges are not combined
```
##########
c++/include/orc/OrcFile.hh:
##########
@@ -36,6 +37,9 @@ namespace orc {
*/
class InputStream {
public:
+ using Buffer = DataBuffer<char>;
+ using BufferPtr = std::shared_ptr<Buffer>;
+
Review Comment:
```suggestion
```
##########
c++/include/orc/Reader.hh:
##########
@@ -624,6 +647,21 @@ namespace orc {
*/
virtual std::map<uint32_t, RowGroupIndex> getRowGroupIndex(
uint32_t stripeIndex, const std::set<uint32_t>& included = {}) const =
0;
+
+ /**
+ * Trigger IO prefetch and cache the prefetched contents asynchronously.
Review Comment:
Please add the expectation when it is called multiple times w/ or w/o
overlapping ranges. Also it is good to mention that it is thread safe.
##########
c++/src/Options.hh:
##########
@@ -19,10 +19,12 @@
#ifndef ORC_OPTIONS_HH
#define ORC_OPTIONS_HH
+#include "io/Cache.hh"
#include "orc/Int128.hh"
#include "orc/OrcFile.hh"
#include "orc/Reader.hh"
+#include <iostream>
Review Comment:
```suggestion
```
##########
c++/src/Reader.hh:
##########
@@ -28,6 +28,7 @@
#include "RLE.hh"
#include "SchemaEvolution.hh"
#include "TypeImpl.hh"
+#include "io/Cache.hh"
Review Comment:
Move it before `#include "RLE.hh"` to sort alphabetically.
##########
c++/include/orc/Reader.hh:
##########
@@ -624,6 +647,21 @@ namespace orc {
*/
virtual std::map<uint32_t, RowGroupIndex> getRowGroupIndex(
uint32_t stripeIndex, const std::set<uint32_t>& included = {}) const =
0;
+
+ /**
+ * Trigger IO prefetch and cache the prefetched contents asynchronously.
+ * @param stripes the stripes to prefetch
+ * @param includeTypes the types to prefetch
+ * @param options the cache options for prefetched contents
+ */
+ virtual void preBuffer(const std::vector<int>& stripes,
+ const std::list<uint64_t>& includeTypes) = 0;
+
+ /**
+ * Release cache entries whose boundary is less than the given value.
Review Comment:
What if the boundary is half way in a coalesced chunk? It might be good to
document this as well.
##########
c++/src/StripeStream.cc:
##########
@@ -37,7 +38,8 @@ namespace orc {
stripeStart_(stripeStart),
input_(input),
writerTimezone_(writerTimezone),
- readerTimezone_(readerTimezone) {
+ readerTimezone_(readerTimezone),
+ readCache_(reader.getReadCache()) {
Review Comment:
You might directly call `RowReaderImpl.getFileContents()` to get readCache
as suggested above.
##########
c++/include/orc/OrcFile.hh:
##########
@@ -58,6 +62,18 @@ namespace orc {
*/
virtual void read(void* buf, uint64_t length, uint64_t offset) = 0;
+ /**
+ * Read data asynchronously into the buffer. The buffer is allocated by
the caller.
+ * @param buf the buffer to read into
+ * @param length the number of bytes to read.
+ * @param offset the position in the stream to read from.
+ * @return a future that will be set to the number of bytes read when the
read is complete.
Review Comment:
```suggestion
* @return a future that will be set when the read is complete.
```
##########
c++/src/StripeStream.cc:
##########
@@ -19,6 +19,7 @@
#include "StripeStream.hh"
#include "RLE.hh"
#include "Reader.hh"
+#include "io/Cache.hh"
Review Comment:
Please sort alphabetically.
##########
c++/include/orc/Reader.hh:
##########
@@ -40,6 +40,17 @@ namespace orc {
struct ReaderOptionsPrivate;
struct RowReaderOptionsPrivate;
+ struct CacheOptions {
+ /// The maximum distance in bytes between two consecutive
+ /// ranges; beyond this value, ranges are not combined
+ uint64_t holeSizeLimit = 8192;
+
+ /// The maximum size in bytes of a combined range; if
+ /// combining two consecutive ranges would produce a range of a
+ /// size greater than this, they are not combined
Review Comment:
```suggestion
// The maximum size in bytes of a combined range; if
// combining two consecutive ranges would produce a range of a
// size greater than this, they are not combined
```
##########
c++/test/TestReader.cc:
##########
@@ -105,6 +105,15 @@ namespace orc {
nextSkippedRows));
}
+ void prefetchAllStripes(Reader* reader) {
+ auto num_stripes = reader->getNumberOfStripes();
+ std::vector<int> stripes;
+ for (size_t i = 0; i < num_stripes; ++i) {
+ stripes.push_back(i);
+ }
+ reader->preBuffer(stripes, {0});
Review Comment:
Please add test case where preBuffer is called for multiple times and with
different stripe/column, etc.
##########
c++/src/Reader.hh:
##########
@@ -218,7 +221,8 @@ namespace orc {
* @param contents of the file
* @param options options for reading
*/
- RowReaderImpl(std::shared_ptr<FileContents> contents, const
RowReaderOptions& options);
+ RowReaderImpl(std::shared_ptr<FileContents> contents, const
RowReaderOptions& options,
Review Comment:
It seems that `FileContents` is a perfect home for readCache. WDYT? cc @ffacs
##########
c++/src/io/Cache.hh:
##########
@@ -0,0 +1,133 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <orc/MemoryPool.hh>
+#include <orc/OrcFile.hh>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <future>
+#include <utility>
+#include <vector>
+
+namespace orc {
+ class InputStream;
Review Comment:
```suggestion
```
##########
c++/src/Options.hh:
##########
@@ -19,10 +19,12 @@
#ifndef ORC_OPTIONS_HH
#define ORC_OPTIONS_HH
+#include "io/Cache.hh"
#include "orc/Int128.hh"
#include "orc/OrcFile.hh"
#include "orc/Reader.hh"
Review Comment:
#include "orc/Int128.hh"
#include "orc/OrcFile.hh"
#include "orc/Reader.hh"
#include "io/Cache.hh"
They should be in a order which the public headers are on the top.
##########
c++/src/io/Cache.hh:
##########
@@ -0,0 +1,133 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <orc/MemoryPool.hh>
+#include <orc/OrcFile.hh>
Review Comment:
```suggestion
#include "orc/MemoryPool.hh"
#include "orc/OrcFile.hh"
```
##########
c++/include/orc/Reader.hh:
##########
@@ -624,6 +647,21 @@ namespace orc {
*/
virtual std::map<uint32_t, RowGroupIndex> getRowGroupIndex(
uint32_t stripeIndex, const std::set<uint32_t>& included = {}) const =
0;
+
+ /**
+ * Trigger IO prefetch and cache the prefetched contents asynchronously.
+ * @param stripes the stripes to prefetch
+ * @param includeTypes the types to prefetch
+ * @param options the cache options for prefetched contents
+ */
+ virtual void preBuffer(const std::vector<int>& stripes,
+ const std::list<uint64_t>& includeTypes) = 0;
+
+ /**
+ * Release cache entries whose boundary is less than the given value.
Review Comment:
ditto
##########
c++/src/Reader.hh:
##########
@@ -260,6 +268,13 @@ namespace orc {
// footer
proto::Footer* footer_;
uint64_t numberOfStripes_;
+
+ // cached io ranges. only valid when preBuffer is invoked.
+ std::shared_ptr<ReadRangeCache> readCache_;
+
+ // mutex to protect readCache_ from concurrent access
+ std::mutex readCacheMutex_;
Review Comment:
Can we move the mutex to be a member variable of ReadRangeCache?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]