pitrou commented on code in PR #37868:
URL: https://github.com/apache/arrow/pull/37868#discussion_r1403420375


##########
cpp/src/arrow/filesystem/filesystem.h:
##########
@@ -60,7 +60,6 @@ struct ARROW_EXPORT FileInfo : public util::EqualityComparable<FileInfo> {
 
   explicit FileInfo(std::string path, FileType type = FileType::Unknown)
       : path_(std::move(path)), type_(type) {}
-

Review Comment:
   Same here.



##########
python/pyarrow/_dataset.pyx:
##########
@@ -96,27 +96,33 @@ def _get_parquet_symbol(name):
     return _dataset_pq and getattr(_dataset_pq, name)
 
 
-cdef CFileSource _make_file_source(object file, FileSystem filesystem=None):
+cdef CFileSource _make_file_source(object file, FileSystem filesystem=None, int64_t file_size=-1):
 
     cdef:
         CFileSource c_source
         shared_ptr[CFileSystem] c_filesystem
+        CFileInfo c_info
         c_string c_path
         shared_ptr[CRandomAccessFile] c_file
         shared_ptr[CBuffer] c_buffer
 
     if isinstance(file, Buffer):
         c_buffer = pyarrow_unwrap_buffer(file)
         c_source = CFileSource(move(c_buffer))
-
     elif _is_path_like(file):
         if filesystem is None:
             raise ValueError("cannot construct a FileSource from "
                              "a path without a FileSystem")
         c_filesystem = filesystem.unwrap()
         c_path = tobytes(_stringify_path(file))
-        c_source = CFileSource(move(c_path), move(c_filesystem))
 
+        if file_size >= 0:
+            c_size = file_size
+            info = FileInfo(c_path, size=c_size)
+            c_info = info.unwrap()

Review Comment:
   Just a suggestion, but I think this could simply be:
   ```suggestion
               c_info = CFileInfo(c_path, CFileType_File)
               c_info.set_size(file_size)
   ```
   



##########
cpp/src/arrow/dataset/file_base.h:
##########
@@ -54,7 +54,6 @@ class ARROW_DS_EXPORT FileSource : public util::EqualityComparable<FileSource> {
       : file_info_(std::move(path)),
         filesystem_(std::move(filesystem)),
         compression_(compression) {}
-

Review Comment:
   This seems like a gratuitous change, could you please undo it?



##########
python/pyarrow/tests/test_dataset.py:
##########
@@ -981,6 +981,48 @@ def test_make_fragment(multisourcefs):
         assert row_group_fragment.row_groups == [0]
 
 
[email protected]
[email protected]
+def test_make_fragment_with_size(s3_example_simple):

Review Comment:
   @eeroel Sorry for the delay. Yes, this sounds ok to me. Can you add a 
comment explaining this?



##########
python/pyarrow/tests/test_dataset.py:
##########
@@ -981,6 +981,59 @@ def test_make_fragment(multisourcefs):
         assert row_group_fragment.row_groups == [0]
 
 
[email protected]
[email protected]
+def test_make_fragment_with_size(s3_example_simple):
+    table, path, fs, uri, host, port, access_key, secret_key = s3_example_simple
+
+    file_format = ds.ParquetFileFormat()
+    paths = [path]
+
+    fragments = [file_format.make_fragment(path, fs)
+                 for path in paths]
+    dataset = ds.FileSystemDataset(
+        fragments, format=file_format, schema=table.schema, filesystem=fs
+    )
+
+    tbl = dataset.to_table()
+    assert tbl.equals(table)
+
+    # true sizes -> works
+    sizes_true = [dataset.filesystem.get_file_info(x).size for x in dataset.files]
+    fragments_with_size = [file_format.make_fragment(path, fs, file_size=size)
+                           for path, size in zip(paths, sizes_true)]
+    dataset_with_size = ds.FileSystemDataset(
+        fragments_with_size, format=file_format, schema=table.schema, filesystem=fs
+    )
+    tbl = dataset.to_table()
+    assert tbl.equals(table)
+
+    # too small sizes -> error
+    sizes_toosmall = [1 for path in paths]
+    fragments_with_size = [file_format.make_fragment(path, fs, file_size=size)
+                           for path, size in zip(paths, sizes_toosmall)]
+
+    dataset_with_size = ds.FileSystemDataset(
+        fragments_with_size, format=file_format, schema=table.schema, filesystem=fs
+    )
+
+    with pytest.raises(pyarrow.lib.ArrowInvalid, match='Parquet file size is 1 bytes'):
+        table = dataset_with_size.to_table()
+
+    # too large sizes -> error
+    sizes_toolarge = [1000000 for path in paths]
+    fragments_with_size = [file_format.make_fragment(path, fs, file_size=size)
+                           for path, size in zip(paths, sizes_toolarge)]
+
+    dataset_with_size = ds.FileSystemDataset(
+        fragments_with_size, format=file_format, schema=table.schema, filesystem=fs
+    )
+
+    # invalid range
+    with pytest.raises(OSError, match='HTTP status 416'):

Review Comment:
   I'm not sure the exact HTTP error is relevant here? Though we can probably 
keep it until it's not reliable anymore :-)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to