This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 3bff2c93ff ARROW-18421: [C++][ORC] Add accessor for stripe information in reader (#14806)
3bff2c93ff is described below

commit 3bff2c93ff180433ca3150011188a4bad7374833
Author: LouisClt <[email protected]>
AuthorDate: Mon Dec 12 17:56:14 2022 +0100

    ARROW-18421: [C++][ORC] Add accessor for stripe information in reader (#14806)
    
    See https://issues.apache.org/jira/browse/ARROW-18421
    
    Lead-authored-by: LouisClt <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/adapters/orc/adapter.cc      | 33 ++++++++++++++++--------------
 cpp/src/arrow/adapters/orc/adapter.h       | 15 ++++++++++++++
 cpp/src/arrow/adapters/orc/adapter_test.cc |  4 ++++
 3 files changed, 37 insertions(+), 15 deletions(-)

diff --git a/cpp/src/arrow/adapters/orc/adapter.cc 
b/cpp/src/arrow/adapters/orc/adapter.cc
index e691a21f41..d4e379a93b 100644
--- a/cpp/src/arrow/adapters/orc/adapter.cc
+++ b/cpp/src/arrow/adapters/orc/adapter.cc
@@ -126,13 +126,6 @@ class ArrowInputFile : public liborc::InputStream {
   std::shared_ptr<io::RandomAccessFile> file_;
 };
 
-struct StripeInformation {
-  uint64_t offset;
-  uint64_t length;
-  uint64_t num_rows;
-  uint64_t first_row_of_stripe;
-};
-
 // The number of rows to read in a ColumnVectorBatch
 constexpr int64_t kReadRowsBatch = 1000;
 
@@ -206,8 +199,10 @@ class ORCFileReader::Impl {
     uint64_t first_row_of_stripe = 0;
     for (int i = 0; i < nstripes; ++i) {
       stripe = reader_->getStripe(i);
-      stripes_[i] = StripeInformation({stripe->getOffset(), 
stripe->getLength(),
-                                       stripe->getNumberOfRows(), 
first_row_of_stripe});
+      stripes_[i] = 
StripeInformation({static_cast<int64_t>(stripe->getOffset()),
+                                       
static_cast<int64_t>(stripe->getLength()),
+                                       
static_cast<int64_t>(stripe->getNumberOfRows()),
+                                       
static_cast<int64_t>(first_row_of_stripe)});
       first_row_of_stripe += stripe->getNumberOfRows();
     }
     return Status::OK();
@@ -217,6 +212,8 @@ class ORCFileReader::Impl {
 
   int64_t NumberOfRows() { return 
static_cast<int64_t>(reader_->getNumberOfRows()); }
 
+  StripeInformation GetStripeInformation(int64_t stripe) { return 
stripes_[stripe]; }
+
   FileVersion GetFileVersion() {
     liborc::FileVersion orc_file_version = reader_->getFormatVersion();
     return FileVersion(orc_file_version.getMajor(), 
orc_file_version.getMinor());
@@ -383,7 +380,8 @@ class ORCFileReader::Impl {
     ARROW_RETURN_IF(stripe < 0 || stripe >= NumberOfStripes(),
                     Status::Invalid("Out of bounds stripe: ", stripe));
 
-    opts->range(stripes_[stripe].offset, stripes_[stripe].length);
+    opts->range(static_cast<uint64_t>(stripes_[stripe].offset),
+                static_cast<uint64_t>(stripes_[stripe].length));
     return Status::OK();
   }
 
@@ -393,9 +391,9 @@ class ORCFileReader::Impl {
                     Status::Invalid("Out of bounds row number: ", row_number));
 
     for (auto it = stripes_.begin(); it != stripes_.end(); it++) {
-      if (static_cast<uint64_t>(row_number) >= it->first_row_of_stripe &&
-          static_cast<uint64_t>(row_number) < it->first_row_of_stripe + 
it->num_rows) {
-        opts->range(it->offset, it->length);
+      if (row_number >= it->first_row_id &&
+          row_number < it->first_row_id + it->num_rows) {
+        opts->range(static_cast<uint64_t>(it->offset), 
static_cast<uint64_t>(it->length));
         *out = *it;
         return Status::OK();
       }
@@ -427,7 +425,8 @@ class ORCFileReader::Impl {
     liborc::RowReaderOptions opts(row_opts);
     std::vector<std::shared_ptr<RecordBatch>> batches(stripes_.size());
     for (size_t stripe = 0; stripe < stripes_.size(); stripe++) {
-      opts.range(stripes_[stripe].offset, stripes_[stripe].length);
+      opts.range(static_cast<uint64_t>(stripes_[stripe].offset),
+                 static_cast<uint64_t>(stripes_[stripe].length));
       ARROW_ASSIGN_OR_RAISE(batches[stripe],
                             ReadBatch(opts, schema, 
stripes_[stripe].num_rows));
     }
@@ -488,7 +487,7 @@ class ORCFileReader::Impl {
     ORC_BEGIN_CATCH_NOT_OK
     row_reader = reader_->createRowReader(opts);
     row_reader->seekToRow(current_row_);
-    current_row_ = stripe_info.first_row_of_stripe + stripe_info.num_rows;
+    current_row_ = stripe_info.first_row_id + stripe_info.num_rows;
     ORC_END_CATCH_NOT_OK
 
     return std::make_shared<OrcStripeReader>(std::move(row_reader), schema, 
batch_size,
@@ -600,6 +599,10 @@ int64_t ORCFileReader::NumberOfStripes() { return 
impl_->NumberOfStripes(); }
 
 int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); }
 
+StripeInformation ORCFileReader::GetStripeInformation(int64_t stripe) {
+  return impl_->GetStripeInformation(stripe);
+}
+
 FileVersion ORCFileReader::GetFileVersion() { return impl_->GetFileVersion(); }
 
 std::string ORCFileReader::GetSoftwareVersion() { return 
impl_->GetSoftwareVersion(); }
diff --git a/cpp/src/arrow/adapters/orc/adapter.h 
b/cpp/src/arrow/adapters/orc/adapter.h
index af7e2ff77c..013be78600 100644
--- a/cpp/src/arrow/adapters/orc/adapter.h
+++ b/cpp/src/arrow/adapters/orc/adapter.h
@@ -35,6 +35,18 @@ namespace arrow {
 namespace adapters {
 namespace orc {
 
+/// \brief Information about an ORC stripe
+struct StripeInformation {
+  /// \brief Offset of the stripe from the start of the file, in bytes
+  int64_t offset;
+  /// \brief Length of the stripe, in bytes
+  int64_t length;
+  /// \brief Number of rows in the stripe
+  int64_t num_rows;
+  /// \brief Index of the first row of the stripe
+  int64_t first_row_id;
+};
+
 /// \class ORCFileReader
 /// \brief Read an Arrow Table or RecordBatch from an ORC file.
 class ARROW_EXPORT ORCFileReader {
@@ -168,6 +180,9 @@ class ARROW_EXPORT ORCFileReader {
   /// \brief The number of rows in the file
   int64_t NumberOfRows();
 
+  /// \brief StripeInformation for each stripe.
+  StripeInformation GetStripeInformation(int64_t stripe);
+
   /// \brief Get the format version of the file.
   ///         Currently known values are 0.11 and 0.12.
   ///
diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc 
b/cpp/src/arrow/adapters/orc/adapter_test.cc
index 320c71992f..c119e5cbeb 100644
--- a/cpp/src/arrow/adapters/orc/adapter_test.cc
+++ b/cpp/src/arrow/adapters/orc/adapter_test.cc
@@ -392,6 +392,10 @@ TEST(TestAdapterRead, ReadIntAndStringFileMultipleStripes) 
{
   ASSERT_TRUE(metadata->Equals(*expected_metadata));
   ASSERT_EQ(stripe_row_count * stripe_count, reader->NumberOfRows());
   ASSERT_EQ(stripe_count, reader->NumberOfStripes());
+  ASSERT_EQ(static_cast<int64_t>(stripe_row_count),
+            reader->GetStripeInformation(0).num_rows);
+  ASSERT_EQ(static_cast<int64_t>(reader->NumberOfRows() - stripe_row_count),
+            reader->GetStripeInformation(stripe_count - 1).first_row_id);
   accumulated = 0;
   EXPECT_OK_AND_ASSIGN(auto stripe_reader, 
reader->NextStripeReader(reader_batch_size));
   while (stripe_reader) {

Reply via email to