itsdapi commented on issue #34403:
URL: https://github.com/apache/arrow/issues/34403#issuecomment-3782431622
I improved it a little bit; it now works with both S3 and local paths:
```python
import os
import argparse
import pyarrow.parquet as pq
import pyarrow.dataset as ps
from tqdm import tqdm
def _strip_s3_scheme(base_path):
    """Return ``base_path`` without a leading ``s3://`` and trailing slashes."""
    scheme = "s3://"
    # Only a *leading* scheme is removed; the rest of the path is left intact.
    if base_path.startswith(scheme):
        base_path = base_path[len(scheme):]
    return base_path.rstrip("/")


def _relative_file_path(file_path, root):
    """Return ``file_path`` relative to ``root`` using ``/`` separators.

    Falls back to a best-effort result when ``file_path`` does not start with
    ``root``; it never removes more than one occurrence of the root.
    """
    if file_path.startswith(root):
        return file_path[len(root):].lstrip("/")

    # Best effort: the file path may carry extra leading components (for
    # example a leading "/"). Keep only what follows the first complete
    # "/<root>/" segment instead of deleting every occurrence of the text.
    _, found, tail = file_path.partition("/" + root.strip("/") + "/")
    if found:
        return tail.lstrip("/")
    return file_path.lstrip("/")


def generate_metadata(base_path):
    """Write ``_metadata`` and ``_common_metadata`` for a Parquet dataset.

    Opens ``base_path`` as a hive-partitioned Parquet dataset, reads the
    footer metadata of every discovered file, records each file's path
    relative to the dataset root, and then writes two files into the root:
    ``_metadata`` (written with the collected per-file metadata) and
    ``_common_metadata`` (written from the schema alone).

    The schema is taken from the first discovered file. Nothing is written
    when the dataset contains no files.

    Args:
        base_path: Dataset root, either a local path or an ``s3://`` URI.

    Returns:
        None.
    """
    # Function-scope import: paths handed to the pyarrow filesystem are always
    # joined with "/", regardless of the host OS.
    import posixpath

    dataset = ps.dataset(base_path, partitioning="hive", format="parquet")
    fs = dataset.filesystem
    root = _strip_s3_scheme(base_path)

    files = dataset.files
    print(f"Found {len(files)} files.")

    metadata_collector = []
    for file_path in tqdm(files, desc="Processing files for metadata"):
        md = pq.read_metadata(file_path, filesystem=fs)
        # Store the path relative to the directory that will hold _metadata.
        md.set_file_path(_relative_file_path(file_path, root))
        metadata_collector.append(md)

    if not metadata_collector:
        print("No parquet files found.")
        return

    _meta_data_path = posixpath.join(root, '_metadata')
    _common_metadata_path = posixpath.join(root, '_common_metadata')
    physical_schema = metadata_collector[0].schema.to_arrow_schema()

    print(f"Writing _metadata to {_meta_data_path}...")
    pq.write_metadata(
        physical_schema,
        _meta_data_path,
        metadata_collector=metadata_collector,
        filesystem=fs,
    )

    print(f"Writing _common_metadata to {_common_metadata_path}...")
    pq.write_metadata(
        physical_schema,
        _common_metadata_path,
        filesystem=fs,
    )
    print("Done.")
if __name__ == "__main__":
    # Command-line entry point: the dataset root is the only argument.
    parser = argparse.ArgumentParser(
        description="Generate _metadata file for a Parquet dataset."
    )
    parser.add_argument(
        "base_path",
        help="Path to the Parquet dataset (can be local path or S3 URI)",
    )
    args = parser.parse_args()
    generate_metadata(args.base_path)
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]