[GitHub] [arrow] westonpace commented on a diff in pull request #14226: ARROW-17599: [C++] Change the way how arrow reads parquet buffered files

GitBox Mon, 26 Sep 2022 23:01:31 -0700


westonpace commented on code in PR #14226:
URL: https://github.com/apache/arrow/pull/14226#discussion_r980781470



##########
cpp/src/parquet/arrow/reader.cc:
##########
@@ -1168,19 +1168,26 @@ 
FileReaderImpl::GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
   if (rows_to_readahead < 0) {
     return Status::Invalid("rows_to_readahead must be > 0");
   }
-  if (reader_properties_.pre_buffer()) {
-    BEGIN_PARQUET_CATCH_EXCEPTIONS
-    reader_->PreBuffer(row_group_indices, column_indices, 
reader_properties_.io_context(),
-                       reader_properties_.cache_options());
-    END_PARQUET_CATCH_EXCEPTIONS
-  }
-  ::arrow::AsyncGenerator<RowGroupGenerator::RecordBatchGenerator> 
row_group_generator =
-      
RowGroupGenerator(::arrow::internal::checked_pointer_cast<FileReaderImpl>(reader),
-                        cpu_executor, row_group_indices, column_indices,
-                        rows_to_readahead);
+  std::vector<::arrow::AsyncGenerator<std::shared_ptr<::arrow::RecordBatch>>> 
vec;
+  for (const int& row_group_index: row_group_indices) {
+    if (reader_properties_.pre_buffer()) {

Review Comment:
   Here we are calling `PreBuffer` multiple times.  Each time we call it, we 
haven't yet finished reading the data from the previous call.  Will this work?



##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -2407,6 +2407,66 @@ TEST(TestArrowReadWrite, GetRecordBatchReaderNoColumns) {
   ASSERT_EQ(actual_batch->num_rows(), num_rows);
 }
 
+TEST(TestArrowReadWrite, GetRecordBatchGenerator2) {

Review Comment:
   Do you have a test case that verifies we don't actually keep all of the 
data in RAM while reading the entire file?



##########
cpp/src/parquet/arrow/reader.cc:
##########
@@ -1168,19 +1168,26 @@ 
FileReaderImpl::GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
   if (rows_to_readahead < 0) {
     return Status::Invalid("rows_to_readahead must be > 0");
   }
-  if (reader_properties_.pre_buffer()) {
-    BEGIN_PARQUET_CATCH_EXCEPTIONS
-    reader_->PreBuffer(row_group_indices, column_indices, 
reader_properties_.io_context(),
-                       reader_properties_.cache_options());
-    END_PARQUET_CATCH_EXCEPTIONS
-  }
-  ::arrow::AsyncGenerator<RowGroupGenerator::RecordBatchGenerator> 
row_group_generator =
-      
RowGroupGenerator(::arrow::internal::checked_pointer_cast<FileReaderImpl>(reader),
-                        cpu_executor, row_group_indices, column_indices,
-                        rows_to_readahead);
+  std::vector<::arrow::AsyncGenerator<std::shared_ptr<::arrow::RecordBatch>>> 
vec;
+  for (const int& row_group_index: row_group_indices) {
+    if (reader_properties_.pre_buffer()) {
+      BEGIN_PARQUET_CATCH_EXCEPTIONS
+          reader_->PreBuffer({row_group_index}, column_indices, 
reader_properties_.io_context(),
+                         reader_properties_.cache_options());
+      END_PARQUET_CATCH_EXCEPTIONS
+    }
+    ::arrow::AsyncGenerator<RowGroupGenerator::RecordBatchGenerator> 
row_group_generator =
+        
RowGroupGenerator(::arrow::internal::checked_pointer_cast<FileReaderImpl>(reader),
+                          cpu_executor, {row_group_index}, column_indices,
+                          rows_to_readahead);
+    ::arrow::AsyncGenerator<std::shared_ptr<::arrow::RecordBatch>> 
concatenated =
+        ::arrow::MakeConcatenatedGenerator(std::move(row_group_generator));
+    WRAP_ASYNC_GENERATOR(std::move(concatenated));
+    vec.push_back(std::move(concatenated));
+  }
+  auto vec_gen = ::arrow::MakeVectorGenerator(std::move(vec));
   ::arrow::AsyncGenerator<std::shared_ptr<::arrow::RecordBatch>> concatenated =
-      ::arrow::MakeConcatenatedGenerator(std::move(row_group_generator));
-  WRAP_ASYNC_GENERATOR(std::move(concatenated));
+      ::arrow::MakeConcatenatedGenerator(std::move(vec_gen));

Review Comment:
   This seems a bit over-complicated.  Can we push the extra calls to 
`PreBuffer` down a layer into the `RowGroupGenerator` itself?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] westonpace commented on a diff in pull request #14226: ARROW-17599: [C++] Change the way how arrow reads parquet buffered files

Reply via email to