mapleFU commented on PR #38360:
URL: https://github.com/apache/arrow/pull/38360#issuecomment-1801640686
```python
def __init__(self, path_or_paths, filesystem=None, schema=None,
             metadata=None, split_row_groups=False, validate_schema=True,
             filters=None, metadata_nthreads=None, read_dictionary=None,
             memory_map=False, buffer_size=0, partitioning="hive",
             use_legacy_dataset=None, pre_buffer=True,
             coerce_int96_timestamp_unit=None,
             thrift_string_size_limit=None,
             thrift_container_size_limit=None):
    if partitioning != "hive":
        raise ValueError(
            'Only "hive" for hive-like partitioning is supported when '
            'using use_legacy_dataset=True')
    if metadata_nthreads is not None:
        warnings.warn(
            "Specifying the 'metadata_nthreads' argument is deprecated as "
            "of pyarrow 8.0.0, and the argument will be removed in a "
            "future version",
            FutureWarning, stacklevel=2,
        )
    else:
        metadata_nthreads = 1

    self._ds_metadata = _ParquetDatasetMetadata()
    a_path = path_or_paths
    if isinstance(a_path, list):
        a_path = a_path[0]

    self._ds_metadata.fs, _ = _get_filesystem_and_path(filesystem, a_path)
    if isinstance(path_or_paths, list):
        self.paths = [_parse_uri(path) for path in path_or_paths]
    else:
        self.paths = _parse_uri(path_or_paths)

    self._ds_metadata.read_dictionary = read_dictionary
    self._ds_metadata.memory_map = memory_map
    self._ds_metadata.buffer_size = buffer_size

    (self._pieces,
     self._partitions,
     self._common_metadata_path,
     self._metadata_path) = _make_manifest(
        path_or_paths, self._fs, metadata_nthreads=metadata_nthreads,
        open_file_func=partial(_open_dataset_file, self._ds_metadata)
    )

    if self._common_metadata_path is not None:
        with self._fs.open(self._common_metadata_path) as f:
            self._ds_metadata.common_metadata = read_metadata(
                f,
                memory_map=memory_map
            )
    else:
        self._ds_metadata.common_metadata = None

    if metadata is not None:
        warnings.warn(
            "Specifying the 'metadata' argument with 'use_legacy_dataset="
            "True' is deprecated as of pyarrow 8.0.0.",
            FutureWarning, stacklevel=2)

    if metadata is None and self._metadata_path is not None:
        with self._fs.open(self._metadata_path) as f:
            self._metadata = read_metadata(f, memory_map=memory_map)
    else:
        self._metadata = metadata

    if schema is not None:
        warnings.warn(
            "Specifying the 'schema' argument with 'use_legacy_dataset="
            "True' is deprecated as of pyarrow 8.0.0. You can still "
            "specify it in combination with 'use_legacy_dataset=False', "
            "but in that case you need to specify a pyarrow.Schema "
            "instead of a ParquetSchema.",
            FutureWarning, stacklevel=2)
    self._schema = schema

    self.split_row_groups = split_row_groups

    if split_row_groups:
        raise NotImplementedError("split_row_groups not yet implemented")

    if filters is not None:
        if hasattr(filters, "cast"):
            raise TypeError(
                "Expressions as filter not supported for legacy dataset")
        filters = _check_filters(filters)
        self._filter(filters)

    if validate_schema:
        self.validate_schemas()
```
@AlenkaF I'm not familiar with the legacy dataset, but it seems it doesn't handle arguments like `pre_buffer`, etc. Do we just need to ignore them? (See the sketch below for what I mean.)
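
A minimal sketch of what I mean, assuming a hypothetical local dataset directory `example_dir`: the legacy `__init__` above accepts `pre_buffer` in its signature but never stores or forwards it, so the flag should have no effect on how files are read.

```python
import pyarrow.parquet as pq

# "example_dir" is a hypothetical hive-partitioned dataset path.
# The legacy __init__ accepts pre_buffer but never uses it, so these
# two datasets should behave identically when reading.
ds_buffered = pq.ParquetDataset("example_dir", use_legacy_dataset=True,
                                pre_buffer=True)
ds_plain = pq.ParquetDataset("example_dir", use_legacy_dataset=True,
                             pre_buffer=False)
```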
For `pq.ParquetFile`, we need `iter_batches(...)` or some other way of reading the data in order to trigger the CRC error.
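
Roughly like this (a sketch, not a real test: `corrupted.parquet` is a hypothetical file with a bad page checksum, opened with the checksum-verification option this PR enables; the exact exception type may differ):

```python
import pyarrow.parquet as pq

# Hypothetical file whose data page CRC has been corrupted.
pf = pq.ParquetFile("corrupted.parquet")

# Just opening the file only reads the footer; iterating forces every
# data page to be decoded, which is what should surface the CRC mismatch.
for batch in pf.iter_batches():
    pass
```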