wgtmac commented on code in PR #2048:
URL: https://github.com/apache/orc/pull/2048#discussion_r1861426712
##########
c++/include/orc/Reader.hh:
##########
@@ -624,6 +647,24 @@ namespace orc {
*/
virtual std::map<uint32_t, RowGroupIndex> getRowGroupIndex(
uint32_t stripeIndex, const std::set<uint32_t>& included = {}) const =
0;
+
+ /**
+ * Trigger IO prefetch and cache the prefetched contents asynchronously.
+ * It is thread safe. Users should make sure requested stripes and columns
+ * are not overlapped, otherwise the overlapping part will be prefetched
multiple time,
+ * which doesn't affect correctness but waste IO and memory resources.
+ * @param stripes the stripes to prefetch
+ * @param includeTypes the types to prefetch
+ * @param options the cache options for prefetched contents
+ */
+ virtual void preBuffer(const std::vector<int>& stripes,
Review Comment:
```suggestion
virtual void preBuffer(const std::vector<uint32_t>& stripes,
```
Please use explicit integer type.
##########
c++/src/io/Cache.hh:
##########
@@ -0,0 +1,129 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "orc/MemoryPool.hh"
+#include "orc/OrcFile.hh"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <future>
+#include <utility>
+#include <vector>
+
+namespace orc {
+
+ struct ReadRange {
+ uint64_t offset;
+ uint64_t length;
+
+ ReadRange() = default;
+ ReadRange(uint64_t offset, uint64_t length) : offset(offset),
length(length) {}
+
+ friend bool operator==(const ReadRange& left, const ReadRange& right) {
+ return (left.offset == right.offset && left.length == right.length);
+ }
+ friend bool operator!=(const ReadRange& left, const ReadRange& right) {
+ return !(left == right);
+ }
+
+ bool contains(const ReadRange& other) const {
+ return (offset <= other.offset && offset + length >= other.offset +
other.length);
+ }
+ };
+
+ struct ReadRangeCombiner {
+ const uint64_t holeSizeLimit;
+ const uint64_t rangeSizeLimit;
+
+ std::vector<ReadRange> coalesce(std::vector<ReadRange> ranges) const;
+
+ static std::vector<ReadRange> coalesceReadRanges(std::vector<ReadRange>
ranges,
+ uint64_t holeSizeLimit,
+ uint64_t rangeSizeLimit);
+ };
+
+ using Buffer = DataBuffer<char>;
+ using BufferPtr = std::shared_ptr<Buffer>;
+
+ struct RangeCacheEntry {
+ ReadRange range;
+
+ // The result may be get multiple times, so we use shared_future instead
of std::future
+ BufferPtr buffer;
+ std::shared_future<void> future;
Review Comment:
```suggestion
ReadRange range;
BufferPtr buffer;
std::shared_future<void> future; // use shared_future in case of
multiple get calls
```
##########
c++/test/TestReader.cc:
##########
@@ -218,7 +239,9 @@ namespace orc {
auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(),
memStream.getLength());
ReaderOptions readerOptions;
readerOptions.setMemoryPool(*pool);
- return createReader(std::move(inStream), readerOptions);
+ auto reader = createReader(std::move(inStream), readerOptions);
+ prefetchStripesAndColumnsRandomly(reader.get(), *type);
Review Comment:
Could you write new test cases for prefetching instead of mixing the
existing cases? Perhaps it requires some refactoring work to share the common
code. In addition, I'd prefer clear parameters instead of (pseudo) random ones.
##########
c++/include/orc/Reader.hh:
##########
@@ -624,6 +647,24 @@ namespace orc {
*/
virtual std::map<uint32_t, RowGroupIndex> getRowGroupIndex(
uint32_t stripeIndex, const std::set<uint32_t>& included = {}) const =
0;
+
+ /**
+ * Trigger IO prefetch and cache the prefetched contents asynchronously.
+ * It is thread safe. Users should make sure requested stripes and columns
+ * are not overlapped, otherwise the overlapping part will be prefetched
multiple time,
+ * which doesn't affect correctness but waste IO and memory resources.
+ * @param stripes the stripes to prefetch
+ * @param includeTypes the types to prefetch
+ * @param options the cache options for prefetched contents
Review Comment:
```suggestion
```
##########
c++/src/io/Cache.hh:
##########
@@ -0,0 +1,129 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "orc/MemoryPool.hh"
+#include "orc/OrcFile.hh"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <future>
+#include <utility>
+#include <vector>
+
+namespace orc {
+
+ struct ReadRange {
+ uint64_t offset;
+ uint64_t length;
+
+ ReadRange() = default;
+ ReadRange(uint64_t offset, uint64_t length) : offset(offset),
length(length) {}
+
+ friend bool operator==(const ReadRange& left, const ReadRange& right) {
+ return (left.offset == right.offset && left.length == right.length);
+ }
+ friend bool operator!=(const ReadRange& left, const ReadRange& right) {
+ return !(left == right);
+ }
+
+ bool contains(const ReadRange& other) const {
+ return (offset <= other.offset && offset + length >= other.offset +
other.length);
+ }
+ };
+
+ struct ReadRangeCombiner {
+ const uint64_t holeSizeLimit;
+ const uint64_t rangeSizeLimit;
+
+ std::vector<ReadRange> coalesce(std::vector<ReadRange> ranges) const;
+
+ static std::vector<ReadRange> coalesceReadRanges(std::vector<ReadRange>
ranges,
+ uint64_t holeSizeLimit,
+ uint64_t rangeSizeLimit);
+ };
+
+ using Buffer = DataBuffer<char>;
+ using BufferPtr = std::shared_ptr<Buffer>;
+
+ struct RangeCacheEntry {
+ ReadRange range;
+
+ // The result may be get multiple times, so we use shared_future instead
of std::future
+ BufferPtr buffer;
+ std::shared_future<void> future;
+
+ RangeCacheEntry() = default;
+ RangeCacheEntry(const ReadRange& range, BufferPtr buffer,
std::future<void> future)
+ : range(range), buffer(std::move(buffer)),
future(std::move(future).share()) {}
+
+ friend bool operator<(const RangeCacheEntry& left, const RangeCacheEntry&
right) {
+ return left.range.offset < right.range.offset;
+ }
+ };
+
+ struct BufferSlice {
+ BufferSlice() : buffer(nullptr), offset(0), length(0) {}
+
+ BufferSlice(BufferPtr buffer, uint64_t offset, uint64_t length)
+ : buffer(std::move(buffer)), offset(offset), length(length) {}
+
+ BufferPtr buffer;
+ uint64_t offset;
+ uint64_t length;
+ };
Review Comment:
```suggestion
struct BufferSlice {
BufferPtr buffer = nullptr;
uint64_t offset = 0;
uint64_t length = 0;
};
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]