bkietz commented on a change in pull request #9685:
URL: https://github.com/apache/arrow/pull/9685#discussion_r593289272



##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -1383,8 +1385,22 @@ cdef class CsvFileFormat(FileFormat):
     def parse_options(self, ParseOptions parse_options not None):
         self.csv_format.parse_options = parse_options.options
 
+    @property
+    def compression(self):
+        return None
+
+    @compression.setter
+    def compression(self, compression):
+        if compression is None:
+            self.csv_format.compression = CCompressionType_UNCOMPRESSED
+        if isinstance(compression, str):
+            compression = Codec(compression)
+        self.csv_format.compression = \
+            (<Codec> compression).unwrap().compression_type()

Review comment:
       ```suggestion
           elif isinstance(compression, str):
               self.csv_format.compression = _ensure_compression(compression)
           elif isinstance(compression, Codec):
               self.csv_format.compression = \
                   (<Codec> compression).unwrap().compression_type()
           else:
               raise TypeError(f'Cannot set compression with value of type {type(compression)}')
   ```

##########
File path: cpp/src/arrow/dataset/file_csv.cc
##########
@@ -105,18 +106,33 @@ static inline Result<std::shared_ptr<csv::StreamingReader>> OpenReader(
     const FileSource& source, const CsvFileFormat& format,
     const std::shared_ptr<ScanOptions>& scan_options = nullptr,
     MemoryPool* pool = default_memory_pool()) {
-  ARROW_ASSIGN_OR_RAISE(auto input, source.Open());
-
   auto reader_options = GetReadOptions(format);
-  ARROW_ASSIGN_OR_RAISE(auto first_block, input->ReadAt(0, reader_options.block_size));
-  RETURN_NOT_OK(input->Seek(0));
 
-  const auto& parse_options = format.parse_options;
+  std::shared_ptr<io::InputStream> input;
+  // If applicable, the buffer that backs first_block
+  std::shared_ptr<Buffer> first_block_buf;
+  util::string_view first_block;
+  ARROW_ASSIGN_OR_RAISE(auto file, source.Open());
+  if (format.compression == Compression::UNCOMPRESSED) {
+    input = file;

Review comment:
       (For follow-up, maybe ARROW-8981) it might be more useful to 
encapsulate compressed FileSources with an overload of Open()
   ```suggestion
     ARROW_ASSIGN_OR_RAISE(std::shared_ptr<io::InputStream> file, source.Open(format.compression));
   ```
   rather than the (currently ignored) FileSource::compression property
   

##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -1363,10 +1363,12 @@ cdef class CsvFileFormat(FileFormat):
     cdef:
         CCsvFileFormat* csv_format
 
-    def __init__(self, ParseOptions parse_options=None):
+    def __init__(self, ParseOptions parse_options=None, compression=None):
         self.init(shared_ptr[CFileFormat](new CCsvFileFormat()))
         if parse_options is not None:
             self.parse_options = parse_options
+        if compression:
+            self.compression = compression

Review comment:
       Nit: is it necessary to check for `None` twice?
   ```suggestion
           self.compression = compression
   ```




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to