mapleFU commented on issue #41339:
URL: https://github.com/apache/arrow/issues/41339#issuecomment-2074922591

   Aha, I used the master-branch code and ran the `ReadInBatches` example from 
`cpp/examples/arrow/parquet_read_write`:
   
   ```c++
   arrow::Status ReadInBatches(std::string path_to_file) {
     // #include "arrow/io/api.h"
     // #include "parquet/arrow/reader.h"
   
     arrow::MemoryPool* pool = arrow::default_memory_pool();
   
     // Configure general Parquet reader settings
     auto reader_properties = parquet::ReaderProperties(pool);
     reader_properties.set_buffer_size(4096 * 4);
     reader_properties.enable_buffered_stream();
   
     // Configure Arrow-specific Parquet reader settings
     auto arrow_reader_props = parquet::ArrowReaderProperties();
     arrow_reader_props.set_batch_size(3);  // default 64 * 1024
     arrow_reader_props.set_use_threads(true);
   
     parquet::arrow::FileReaderBuilder reader_builder;
     ARROW_RETURN_NOT_OK(
         reader_builder.OpenFile(path_to_file, /*memory_map=*/true, reader_properties));
     reader_builder.memory_pool(pool);
     reader_builder.properties(arrow_reader_props);
   
     std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
     ARROW_ASSIGN_OR_RAISE(arrow_reader, reader_builder.Build());
   
     std::shared_ptr<::arrow::RecordBatchReader> rb_reader;
     ARROW_RETURN_NOT_OK(arrow_reader->GetRecordBatchReader(&rb_reader));
   
     // Drain the reader one batch at a time. Note: the status from ReadNext()
     // must be propagated, not just tested for ok() — otherwise a read error
     // silently terminates the loop and the function returns OK.
     std::shared_ptr<::arrow::RecordBatch> batch;
     while (true) {
       ARROW_RETURN_NOT_OK(rb_reader->ReadNext(&batch));
       if (batch == nullptr) break;  // end of stream
       std::cout << "Read:" << batch->ToString() << '\n';
     }
   
     // Equivalent range-based alternative using the reader's iterator interface:
     // for (arrow::Result<std::shared_ptr<arrow::RecordBatch>> maybe_batch : *rb_reader) {
     //   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::RecordBatch> batch, maybe_batch);
     //   std::cout << "Read batch with " << batch->num_rows() << " rows" << std::endl;
     // }
     return arrow::Status::OK();
   }
   
   // Repro driver: only the batched-read path is exercised; the write/read
   // full-file examples stay disabled for this reproduction.
   arrow::Status RunExamples(std::string path_to_file) {
   //  ARROW_RETURN_NOT_OK(WriteFullFile(path_to_file));
   //  ARROW_RETURN_NOT_OK(ReadFullFile(path_to_file));
   //  ARROW_RETURN_NOT_OK(WriteInBatches(path_to_file));
     return ReadInBatches(std::move(path_to_file));
   }
   ```
   
   This doesn't crash. I'm running the master branch on my M1 macOS machine. 
Would you mind providing the configuration you used?
   
   By the way, the stack trace below is a little confusing 🤔 — why would 
`parquet::PhysicalType<(parquet::Type::type)5>` call into 
`parquet::PhysicalType<(parquet::Type::type)7>` ...
   
   ```
   
   #8  0x00007fb7a284f5cd in parquet::internal::(anonymous 
namespace)::TypedRecordReader<parquet::PhysicalType<(parquet::Type::type)7> 
>::bytes_for_values(long) const [clone .isra.1197] () from 
/lib64/libparquet.so.1500
   #9  0x00007fb7a28512bd in parquet::internal::(anonymous 
namespace)::TypedRecordReader<parquet::PhysicalType<(parquet::Type::type)5> 
>::ReserveValues(long) () from /lib64/libparquet.so.1500
   ```
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to