This is an automated email from the ASF dual-hosted git repository.

kakachen pushed a commit to branch orc
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git


The following commit(s) were added to refs/heads/orc by this push:
     new a7d1ba70678 [enhancement](orc) improve the read amplification problem caused by orc tiny stripe optimization. (#313)
a7d1ba70678 is described below

commit a7d1ba70678786d841f0db96e9374437ba5d2ae4
Author: daidai <[email protected]>
AuthorDate: Thu May 15 15:57:35 2025 +0800

    [enhancement](orc) improve the read amplification problem caused by orc tiny stripe optimization. (#313)
---
 c++/include/orc/Reader.hh |  2 ++
 c++/src/Reader.cc         | 31 ++++++++++++++++++-------------
 c++/src/Reader.hh         |  2 ++
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh
index c9be47e0d8b..e3abb143b4a 100644
--- a/c++/include/orc/Reader.hh
+++ b/c++/include/orc/Reader.hh
@@ -639,6 +639,8 @@ namespace orc {
     virtual void setStream(std::unique_ptr<InputStream>) = 0;
 
     virtual std::vector<int> getNeedReadStripes(const RowReaderOptions& opts) = 0;
+
+    virtual void getSelectedColumns(const std::list<std::string>& names, std::vector<bool>& selectedColumns) = 0;
   };
 
   /**
diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc
index 0f4cf4d740c..4c0144da89e 100644
--- a/c++/src/Reader.cc
+++ b/c++/src/Reader.cc
@@ -1041,21 +1041,26 @@ namespace orc {
     return getMemoryUse(stripeIx, selectedColumns);
   }
 
+  void ReaderImpl::getSelectedColumns(const std::list<std::string>& names, std::vector<bool>& selectedColumns) {
+      selectedColumns.clear();
+      selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false);
+      ColumnSelector column_selector(contents.get());
+      if (contents->schema->getKind() == STRUCT && names.begin() != names.end()) {
+          for (std::list<std::string>::const_iterator field = names.begin(); field != names.end();
+               ++field) {
+              column_selector.updateSelectedByName(selectedColumns, *field);
+          }
+      } else {
+          // default is to select all columns
+          std::fill(selectedColumns.begin(), selectedColumns.end(), true);
+      }
+      column_selector.selectParents(selectedColumns, *contents->schema.get());
+      selectedColumns[0] = true;  // column 0 is selected by default
+  }
+
   uint64_t ReaderImpl::getMemoryUseByName(const std::list<std::string>& names, int stripeIx) {
     std::vector<bool> selectedColumns;
-    selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false);
-    ColumnSelector column_selector(contents.get());
-    if (contents->schema->getKind() == STRUCT && names.begin() != names.end()) {
-      for (std::list<std::string>::const_iterator field = names.begin(); field != names.end();
-           ++field) {
-        column_selector.updateSelectedByName(selectedColumns, *field);
-      }
-    } else {
-      // default is to select all columns
-      std::fill(selectedColumns.begin(), selectedColumns.end(), true);
-    }
-    column_selector.selectParents(selectedColumns, *contents->schema.get());
-    selectedColumns[0] = true;  // column 0 is selected by default
+    getSelectedColumns(names, selectedColumns);
     return getMemoryUse(stripeIx, selectedColumns);
   }
 
diff --git a/c++/src/Reader.hh b/c++/src/Reader.hh
index 7ec049ad963..1fd429be86a 100644
--- a/c++/src/Reader.hh
+++ b/c++/src/Reader.hh
@@ -439,6 +439,8 @@ namespace orc {
 
     std::map<uint32_t, BloomFilterIndex> getBloomFilters(
         uint32_t stripeIndex, const std::set<uint32_t>& included) const override;
+
+    void getSelectedColumns(const std::list<std::string>& names, std::vector<bool>& selectedColumns) override;
   };
 }  // namespace orc
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to