This is an automated email from the ASF dual-hosted git repository.
kakachen pushed a commit to branch orc
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/orc by this push:
new a7d1ba70678 [enhancement](orc) improve the read amplification problem
caused by orc tiny stripe optimization. (#313)
a7d1ba70678 is described below
commit a7d1ba70678786d841f0db96e9374437ba5d2ae4
Author: daidai <[email protected]>
AuthorDate: Thu May 15 15:57:35 2025 +0800
[enhancement](orc) improve the read amplification problem caused by orc
tiny stripe optimization. (#313)
---
c++/include/orc/Reader.hh | 2 ++
c++/src/Reader.cc | 31 ++++++++++++++++++-------------
c++/src/Reader.hh | 2 ++
3 files changed, 22 insertions(+), 13 deletions(-)
diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh
index c9be47e0d8b..e3abb143b4a 100644
--- a/c++/include/orc/Reader.hh
+++ b/c++/include/orc/Reader.hh
@@ -639,6 +639,8 @@ namespace orc {
virtual void setStream(std::unique_ptr<InputStream>) = 0;
virtual std::vector<int> getNeedReadStripes(const RowReaderOptions& opts)
= 0;
+
+ virtual void getSelectedColumns(const std::list<std::string>& names,
std::vector<bool>& selectedColumns) = 0;
};
/**
diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc
index 0f4cf4d740c..4c0144da89e 100644
--- a/c++/src/Reader.cc
+++ b/c++/src/Reader.cc
@@ -1041,21 +1041,26 @@ namespace orc {
return getMemoryUse(stripeIx, selectedColumns);
}
+ void ReaderImpl::getSelectedColumns(const std::list<std::string>& names,
std::vector<bool>& selectedColumns) {
+ selectedColumns.clear();
+
selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()),
false);
+ ColumnSelector column_selector(contents.get());
+ if (contents->schema->getKind() == STRUCT && names.begin() !=
names.end()) {
+ for (std::list<std::string>::const_iterator field = names.begin();
field != names.end();
+ ++field) {
+ column_selector.updateSelectedByName(selectedColumns, *field);
+ }
+ } else {
+ // default is to select all columns
+ std::fill(selectedColumns.begin(), selectedColumns.end(), true);
+ }
+ column_selector.selectParents(selectedColumns, *contents->schema.get());
+ selectedColumns[0] = true; // column 0 is selected by default
+ }
+
uint64_t ReaderImpl::getMemoryUseByName(const std::list<std::string>& names,
int stripeIx) {
std::vector<bool> selectedColumns;
-
selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()),
false);
- ColumnSelector column_selector(contents.get());
- if (contents->schema->getKind() == STRUCT && names.begin() != names.end())
{
- for (std::list<std::string>::const_iterator field = names.begin(); field
!= names.end();
- ++field) {
- column_selector.updateSelectedByName(selectedColumns, *field);
- }
- } else {
- // default is to select all columns
- std::fill(selectedColumns.begin(), selectedColumns.end(), true);
- }
- column_selector.selectParents(selectedColumns, *contents->schema.get());
- selectedColumns[0] = true; // column 0 is selected by default
+ getSelectedColumns(names, selectedColumns);
return getMemoryUse(stripeIx, selectedColumns);
}
diff --git a/c++/src/Reader.hh b/c++/src/Reader.hh
index 7ec049ad963..1fd429be86a 100644
--- a/c++/src/Reader.hh
+++ b/c++/src/Reader.hh
@@ -439,6 +439,8 @@ namespace orc {
std::map<uint32_t, BloomFilterIndex> getBloomFilters(
uint32_t stripeIndex, const std::set<uint32_t>& included) const
override;
+
+ void getSelectedColumns(const std::list<std::string>& names,
std::vector<bool>& selectedColumns) override;
};
} // namespace orc
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]