itsdapi commented on issue #34403:
URL: https://github.com/apache/arrow/issues/34403#issuecomment-3782431622

   I improved it a little bit; it now works with both S3 and local paths:
   
   ```python
   import os
   import argparse
   
   import pyarrow.parquet as pq
   import pyarrow.dataset as ps
   
   from tqdm import tqdm
   
   
   def _relative_to_prefix(path, prefix):
       """Return ``path`` with ``prefix`` and leading "/" characters removed."""
       if path.startswith(prefix):
           return path[len(prefix):].lstrip("/")
       # Fallback for paths that merely contain the prefix somewhere inside:
       # cut at the first occurrence only. Removing every occurrence
       # (str.replace) would also mangle later components that happen to
       # repeat the prefix text, e.g. a file name.
       _, found, tail = path.partition(prefix)
       return (tail if found else path).lstrip("/")


   def generate_metadata(base_path: str) -> None:
       """Write ``_metadata`` and ``_common_metadata`` for a Parquet dataset.

       The footer metadata of every file in the dataset is read, its file
       path is set relative to the dataset root, and all of them are passed
       as ``metadata_collector`` when writing ``_metadata``.
       ``_common_metadata`` is written from the schema alone. The schema is
       taken from the first file.

       Args:
           base_path: Dataset root; a local path or an ``s3://`` URI.

       Returns:
           None. Progress is printed; if the dataset has no files the
           function returns early without writing anything.
       """
       dataset = ps.dataset(base_path, partitioning="hive", format="parquet")
       fs = dataset.filesystem

       # File paths are matched against the root without its URI scheme, so
       # strip "s3://" only when it really is the leading scheme.
       scheme = "s3://"
       if base_path.startswith(scheme):
           prefix = base_path[len(scheme):]
       else:
           prefix = base_path
       prefix = prefix.rstrip("/")

       files = dataset.files  # read once; reused for the count and the loop
       print(f"Found {len(files)} files.")

       metadata_collector = []
       for f in tqdm(files, desc="Processing files for metadata"):
           md = pq.read_metadata(f, filesystem=fs)
           md.set_file_path(_relative_to_prefix(f, prefix))
           metadata_collector.append(md)

       if not metadata_collector:
           print("No parquet files found.")
           return

       # pyarrow filesystem paths are "/"-separated on every platform;
       # os.path.join would insert "\" on Windows.
       _meta_data_path = f"{prefix}/_metadata"
       _common_metadata_path = f"{prefix}/_common_metadata"

       physical_schema = metadata_collector[0].schema.to_arrow_schema()

       print(f"Writing _metadata to {_meta_data_path}...")
       pq.write_metadata(
           physical_schema,
           _meta_data_path,
           metadata_collector=metadata_collector,
           filesystem=fs,
       )

       print(f"Writing _common_metadata to {_common_metadata_path}...")
       pq.write_metadata(
           physical_schema,
           _common_metadata_path,
           filesystem=fs,
       )

       print("Done.")
   
   if __name__ == "__main__":
       # Each string literal stays on one physical line (adjacent literals
       # are concatenated); a literal split across lines is a syntax error.
       parser = argparse.ArgumentParser(
           description="Generate _metadata file for a Parquet dataset."
       )
       parser.add_argument(
           "base_path",
           help="Path to the Parquet dataset "
                "(can be local path or S3 URI)",
       )
       args = parser.parse_args()

       generate_metadata(args.base_path)
   
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to