This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 3bff2c93ff ARROW-18421: [C++][ORC] Add accessor for stripe information in reader (#14806)
3bff2c93ff is described below
commit 3bff2c93ff180433ca3150011188a4bad7374833
Author: LouisClt <[email protected]>
AuthorDate: Mon Dec 12 17:56:14 2022 +0100
ARROW-18421: [C++][ORC] Add accessor for stripe information in reader (#14806)
See https://issues.apache.org/jira/browse/ARROW-18421
Lead-authored-by: LouisClt <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/adapters/orc/adapter.cc | 33 ++++++++++++++++--------------
cpp/src/arrow/adapters/orc/adapter.h | 15 ++++++++++++++
cpp/src/arrow/adapters/orc/adapter_test.cc | 4 ++++
3 files changed, 37 insertions(+), 15 deletions(-)
diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc
index e691a21f41..d4e379a93b 100644
--- a/cpp/src/arrow/adapters/orc/adapter.cc
+++ b/cpp/src/arrow/adapters/orc/adapter.cc
@@ -126,13 +126,6 @@ class ArrowInputFile : public liborc::InputStream {
std::shared_ptr<io::RandomAccessFile> file_;
};
-struct StripeInformation {
- uint64_t offset;
- uint64_t length;
- uint64_t num_rows;
- uint64_t first_row_of_stripe;
-};
-
// The number of rows to read in a ColumnVectorBatch
constexpr int64_t kReadRowsBatch = 1000;
@@ -206,8 +199,10 @@ class ORCFileReader::Impl {
uint64_t first_row_of_stripe = 0;
for (int i = 0; i < nstripes; ++i) {
stripe = reader_->getStripe(i);
- stripes_[i] = StripeInformation({stripe->getOffset(), stripe->getLength(),
- stripe->getNumberOfRows(), first_row_of_stripe});
+ stripes_[i] = StripeInformation({static_cast<int64_t>(stripe->getOffset()),
+ static_cast<int64_t>(stripe->getLength()),
+ static_cast<int64_t>(stripe->getNumberOfRows()),
+ static_cast<int64_t>(first_row_of_stripe)});
first_row_of_stripe += stripe->getNumberOfRows();
}
return Status::OK();
@@ -217,6 +212,8 @@ class ORCFileReader::Impl {
int64_t NumberOfRows() { return static_cast<int64_t>(reader_->getNumberOfRows()); }
+ StripeInformation GetStripeInformation(int64_t stripe) { return stripes_[stripe]; }
+
FileVersion GetFileVersion() {
liborc::FileVersion orc_file_version = reader_->getFormatVersion();
return FileVersion(orc_file_version.getMajor(), orc_file_version.getMinor());
@@ -383,7 +380,8 @@ class ORCFileReader::Impl {
ARROW_RETURN_IF(stripe < 0 || stripe >= NumberOfStripes(),
Status::Invalid("Out of bounds stripe: ", stripe));
- opts->range(stripes_[stripe].offset, stripes_[stripe].length);
+ opts->range(static_cast<uint64_t>(stripes_[stripe].offset),
+ static_cast<uint64_t>(stripes_[stripe].length));
return Status::OK();
}
@@ -393,9 +391,9 @@ class ORCFileReader::Impl {
Status::Invalid("Out of bounds row number: ", row_number));
for (auto it = stripes_.begin(); it != stripes_.end(); it++) {
- if (static_cast<uint64_t>(row_number) >= it->first_row_of_stripe &&
- static_cast<uint64_t>(row_number) < it->first_row_of_stripe + it->num_rows) {
- opts->range(it->offset, it->length);
+ if (row_number >= it->first_row_id &&
+ row_number < it->first_row_id + it->num_rows) {
+ opts->range(static_cast<uint64_t>(it->offset), static_cast<uint64_t>(it->length));
*out = *it;
return Status::OK();
}
@@ -427,7 +425,8 @@ class ORCFileReader::Impl {
liborc::RowReaderOptions opts(row_opts);
std::vector<std::shared_ptr<RecordBatch>> batches(stripes_.size());
for (size_t stripe = 0; stripe < stripes_.size(); stripe++) {
- opts.range(stripes_[stripe].offset, stripes_[stripe].length);
+ opts.range(static_cast<uint64_t>(stripes_[stripe].offset),
+ static_cast<uint64_t>(stripes_[stripe].length));
ARROW_ASSIGN_OR_RAISE(batches[stripe],
ReadBatch(opts, schema, stripes_[stripe].num_rows));
}
@@ -488,7 +487,7 @@ class ORCFileReader::Impl {
ORC_BEGIN_CATCH_NOT_OK
row_reader = reader_->createRowReader(opts);
row_reader->seekToRow(current_row_);
- current_row_ = stripe_info.first_row_of_stripe + stripe_info.num_rows;
+ current_row_ = stripe_info.first_row_id + stripe_info.num_rows;
ORC_END_CATCH_NOT_OK
return std::make_shared<OrcStripeReader>(std::move(row_reader), schema, batch_size,
@@ -600,6 +599,10 @@ int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); }
int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); }
+StripeInformation ORCFileReader::GetStripeInformation(int64_t stripe) {
+ return impl_->GetStripeInformation(stripe);
+}
+
FileVersion ORCFileReader::GetFileVersion() { return impl_->GetFileVersion(); }
std::string ORCFileReader::GetSoftwareVersion() { return impl_->GetSoftwareVersion(); }
diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h
index af7e2ff77c..013be78600 100644
--- a/cpp/src/arrow/adapters/orc/adapter.h
+++ b/cpp/src/arrow/adapters/orc/adapter.h
@@ -35,6 +35,18 @@ namespace arrow {
namespace adapters {
namespace orc {
+/// \brief Information about an ORC stripe
+struct StripeInformation {
+ /// \brief Offset of the stripe from the start of the file, in bytes
+ int64_t offset;
+ /// \brief Length of the stripe, in bytes
+ int64_t length;
+ /// \brief Number of rows in the stripe
+ int64_t num_rows;
+ /// \brief Index of the first row of the stripe
+ int64_t first_row_id;
+};
+
/// \class ORCFileReader
/// \brief Read an Arrow Table or RecordBatch from an ORC file.
class ARROW_EXPORT ORCFileReader {
@@ -168,6 +180,9 @@ class ARROW_EXPORT ORCFileReader {
/// \brief The number of rows in the file
int64_t NumberOfRows();
+ /// \brief StripeInformation for each stripe.
+ StripeInformation GetStripeInformation(int64_t stripe);
+
/// \brief Get the format version of the file.
/// Currently known values are 0.11 and 0.12.
///
diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc
index 320c71992f..c119e5cbeb 100644
--- a/cpp/src/arrow/adapters/orc/adapter_test.cc
+++ b/cpp/src/arrow/adapters/orc/adapter_test.cc
@@ -392,6 +392,10 @@ TEST(TestAdapterRead, ReadIntAndStringFileMultipleStripes) {
ASSERT_TRUE(metadata->Equals(*expected_metadata));
ASSERT_EQ(stripe_row_count * stripe_count, reader->NumberOfRows());
ASSERT_EQ(stripe_count, reader->NumberOfStripes());
+ ASSERT_EQ(static_cast<int64_t>(stripe_row_count),
+ reader->GetStripeInformation(0).num_rows);
+ ASSERT_EQ(static_cast<int64_t>(reader->NumberOfRows() - stripe_row_count),
+ reader->GetStripeInformation(stripe_count - 1).first_row_id);
accumulated = 0;
EXPECT_OK_AND_ASSIGN(auto stripe_reader, reader->NextStripeReader(reader_batch_size));
while (stripe_reader) {