mapleFU commented on issue #41249:
URL: https://github.com/apache/arrow/issues/41249#issuecomment-2071474392
This works on my machine (macOS), but reading the whole file takes a long time.
```
Read Batch Count: 10000, Row Count: 10485760000, Time: 9309ms
Read Batch Count: 20000, Row Count: 20971520000, Time: 18235ms
Read Batch Count: 30000, Row Count: 31457280000, Time: 28662ms
Read Batch Count: 40000, Row Count: 41943040000, Time: 37378ms
Read Batch Count: 50000, Row Count: 52428800000, Time: 45683ms
Read Batch Count: 60000, Row Count: 62914560000, Time: 54134ms
Read Batch Count: 70000, Row Count: 73400320000, Time: 62590ms
Read Batch Count: 80000, Row Count: 83886080000, Time: 70936ms
```
I set `use_threads` to true and the batch size to 1024 * 1024, using the code below:
```c++
// #include "arrow/io/api.h"
// #include "parquet/arrow/reader.h"
// #include <chrono>
// #include <iostream>

arrow::Status ReadInBatches(std::string path_to_file) {
  arrow::MemoryPool* pool = arrow::default_memory_pool();

  // Configure general Parquet reader settings
  auto reader_properties = parquet::ReaderProperties(pool);
  reader_properties.set_buffer_size(4096 * 4);
  reader_properties.enable_buffered_stream();

  // Configure Arrow-specific Parquet reader settings
  auto arrow_reader_props = parquet::ArrowReaderProperties();
  arrow_reader_props.set_batch_size(1024 * 1024);  // default 64 * 1024
  arrow_reader_props.set_use_threads(true);

  parquet::arrow::FileReaderBuilder reader_builder;
  ARROW_RETURN_NOT_OK(reader_builder.OpenFile(path_to_file, /*memory_map=*/false,
                                              reader_properties));
  reader_builder.memory_pool(pool);
  reader_builder.properties(arrow_reader_props);

  std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
  ARROW_ASSIGN_OR_RAISE(arrow_reader, reader_builder.Build());

  std::shared_ptr<::arrow::RecordBatchReader> rb_reader;
  ARROW_RETURN_NOT_OK(arrow_reader->GetRecordBatchReader(&rb_reader));

  int64_t rowCount = 0;
  int batchCount = 0;
  auto startTime = std::chrono::high_resolution_clock::now();
  for (arrow::Result<std::shared_ptr<arrow::RecordBatch>> maybe_batch : *rb_reader) {
    // Operate on each batch...
    if (!maybe_batch.ok()) {
      std::cout << "Error reading batch: " << maybe_batch.status().message()
                << std::endl;
    } else {
      rowCount += maybe_batch.ValueOrDie()->num_rows();
    }
    ++batchCount;
    if (batchCount != 0 && batchCount % 10000 == 0) {
      auto currentTime = std::chrono::high_resolution_clock::now();
      std::cout << "Read Batch Count: " << batchCount << ", Row Count: " << rowCount
                << ", Time: "
                << std::chrono::duration_cast<std::chrono::milliseconds>(currentTime -
                                                                         startTime)
                       .count()
                << "ms\n";
    }
  }
  std::cout << "Read Row Count: " << rowCount << '\n';
  return arrow::Status::OK();
}
```
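For completeness, here is a minimal sketch of a driver to run the snippet above; the command-line path argument is a placeholder and not from the original program:
```c++
// Hypothetical driver for ReadInBatches above; takes the Parquet file
// path from argv rather than hard-coding it.
int main(int argc, char** argv) {
  if (argc < 2) {
    std::cerr << "usage: " << argv[0] << " <path-to-parquet-file>\n";
    return 1;
  }
  arrow::Status st = ReadInBatches(argv[1]);
  if (!st.ok()) {
    std::cerr << "Read failed: " << st.ToString() << '\n';
    return 1;
  }
  return 0;
}
```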
Reading the whole file may consume hundreds of hours 😅. I ran this program for an hour and then stopped it.