marsupialtail commented on code in PR #13931:
URL: https://github.com/apache/arrow/pull/13931#discussion_r950904697
##########
cpp/src/arrow/dataset/file_csv.cc:
##########
@@ -184,16 +186,45 @@ static inline
Future<std::shared_ptr<csv::StreamingReader>> OpenReaderAsync(
auto span =
tracer->StartSpan("arrow::dataset::CsvFileFormat::OpenReaderAsync");
#endif
ARROW_ASSIGN_OR_RAISE(auto reader_options, GetReadOptions(format,
scan_options));
-
- ARROW_ASSIGN_OR_RAISE(auto input, source.OpenCompressed());
const auto& path = source.path();
- ARROW_ASSIGN_OR_RAISE(
+
+
+ auto actual_compression = Compression::type::UNCOMPRESSED;
+ // Guess compression from file extension
+ auto extension = fs::internal::GetAbstractPathExtension(path);
+ if (extension == "gz") {
+ actual_compression = Compression::type::GZIP;
+ } else {
+ auto maybe_compression = util::Codec::GetCompressionType(extension);
+ if (maybe_compression.ok()) {
+ ARROW_ASSIGN_OR_RAISE(actual_compression, maybe_compression);
+ }
+ }
+
+ Future<std::shared_ptr<csv::StreamingReader>> reader_fut;
+
+ if (actual_compression == Compression::type::UNCOMPRESSED) {
+ ARROW_ASSIGN_OR_RAISE(auto input, source.Open() )
+ reader_fut = DeferNotOk(input->io_context().executor()->Submit(
+ [=]() -> Future<std::shared_ptr<csv::StreamingReader>> {
+ ARROW_ASSIGN_OR_RAISE(auto temp_first_block, input->ReadAt(0,
reader_options.block_size));
+ RETURN_NOT_OK(input->Seek(0));
Review Comment:
Yes, I plan to do that. This double read is very annoying. The whole logic
around what to do with the first block needs to be streamlined and optimized.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]