kosiew commented on issue #1018: URL: https://github.com/apache/datafusion-python/issues/1018#issuecomment-2673258796
Managed to reproduce this without S3:

```python
import os
import tempfile
import pyarrow as pa
import pyarrow.parquet as pq
import datafusion


def create_parquet_file(file_path):
    # Create a Parquet file containing duplicate partition columns using pyarrow arrays
    table = pa.Table.from_arrays(
        [
            pa.array([1, 2, 3]),
            pa.array([2025, 2025, 2025]),  # duplicate partition field
            pa.array([10, 20, 30])
        ],
        names=['id', 'year', 'value']
    )
    pq.write_table(table, file_path)


def test_duplicate_field_error():
    with tempfile.TemporaryDirectory() as tmpdir:
        # Create a hive-partitioned directory structure: data/year=2025/month=1/day=1
        storage_dir = os.path.join(tmpdir, "data", "year=2025", "month=1", "day=1")
        os.makedirs(storage_dir, exist_ok=True)
        file_path = os.path.join(storage_dir, "data.parquet")
        create_parquet_file(file_path)

        ctx = datafusion.SessionContext()
        # Register the external table with hive partitioning on local storage
        create_table = f"""
            CREATE EXTERNAL TABLE data
            STORED AS PARQUET
            PARTITIONED BY (year, month, day)
            LOCATION '{os.path.join(tmpdir, "data")}'
        """
        ctx.sql(create_table).collect()

        # Query the table.
        query = """
            SELECT count(*) as cnt
            FROM data
            WHERE year = 2025 AND month = 1 AND day = 1
        """
        # Run query without expecting exception.
        result = ctx.sql(query).collect()
        # Assert expected count. Adjust extraction of count as needed.
        assert result[0].column("cnt")[0].as_py() == 3
```

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org