bkietz commented on a change in pull request #9685: URL: https://github.com/apache/arrow/pull/9685#discussion_r593289272
########## File path: python/pyarrow/_dataset.pyx ########## @@ -1383,8 +1385,22 @@ cdef class CsvFileFormat(FileFormat): def parse_options(self, ParseOptions parse_options not None): self.csv_format.parse_options = parse_options.options + @property + def compression(self): + return None + + @compression.setter + def compression(self, compression): + if compression is None: + self.csv_format.compression = CCompressionType_UNCOMPRESSED + if isinstance(compression, str): + compression = Codec(compression) + self.csv_format.compression = \ + (<Codec> compression).unwrap().compression_type() Review comment: ```suggestion elif isinstance(compression, str): self.csv_format.compression = _ensure_compression(compression) elif isinstance(compression, Codec): self.csv_format.compression = \ (<Codec> compression).unwrap().compression_type() else: raise TypeError(f'Cannot set compression with value of type {type(compression)}') `````` ########## File path: cpp/src/arrow/dataset/file_csv.cc ########## @@ -105,18 +106,33 @@ static inline Result<std::shared_ptr<csv::StreamingReader>> OpenReader( const FileSource& source, const CsvFileFormat& format, const std::shared_ptr<ScanOptions>& scan_options = nullptr, MemoryPool* pool = default_memory_pool()) { - ARROW_ASSIGN_OR_RAISE(auto input, source.Open()); - auto reader_options = GetReadOptions(format); - ARROW_ASSIGN_OR_RAISE(auto first_block, input->ReadAt(0, reader_options.block_size)); - RETURN_NOT_OK(input->Seek(0)); - const auto& parse_options = format.parse_options; + std::shared_ptr<io::InputStream> input; + // If applicable, the buffer that backs first_block + std::shared_ptr<Buffer> first_block_buf; + util::string_view first_block; + ARROW_ASSIGN_OR_RAISE(auto file, source.Open()); + if (format.compression == Compression::UNCOMPRESSED) { + input = file; Review comment: (For follow up, maybe ARROW-8981) maybe it'd be more useful to encapsulate compressed FileSources with an overload of Open() ```suggestion 
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<io::InputStream> file, source.Open(format.compression)); ``` rather than the (currently ignored) FileSource::compression property ########## File path: python/pyarrow/_dataset.pyx ########## @@ -1363,10 +1363,12 @@ cdef class CsvFileFormat(FileFormat): cdef: CCsvFileFormat* csv_format - def __init__(self, ParseOptions parse_options=None): + def __init__(self, ParseOptions parse_options=None, compression=None): self.init(shared_ptr[CFileFormat](new CCsvFileFormat())) if parse_options is not None: self.parse_options = parse_options + if compression: + self.compression = compression Review comment: Nit: is it necessary to check for `None` twice? ```suggestion self.compression = compression ``` ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org