kosiew commented on issue #1018:
URL: https://github.com/apache/datafusion-python/issues/1018#issuecomment-2673258796
Managed to reproduce this without S3
```
import os
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

import datafusion


def create_parquet_file(file_path):
    # Create a Parquet file containing duplicate partition columns using pyarrow arrays
    table = pa.Table.from_arrays(
        [
            pa.array([1, 2, 3]),
            pa.array([2025, 2025, 2025]),  # duplicate partition field
            pa.array([10, 20, 30]),
        ],
        names=['id', 'year', 'value'],
    )
    pq.write_table(table, file_path)


def test_duplicate_field_error():
    with tempfile.TemporaryDirectory() as tmpdir:
        # Create a hive-partitioned directory structure: data/year=2025/month=1/day=1
        storage_dir = os.path.join(tmpdir, "data", "year=2025", "month=1", "day=1")
        os.makedirs(storage_dir, exist_ok=True)
        file_path = os.path.join(storage_dir, "data.parquet")
        create_parquet_file(file_path)

        ctx = datafusion.SessionContext()
        # Register the external table with hive partitioning on local storage
        create_table = f"""
            CREATE EXTERNAL TABLE data
            STORED AS PARQUET
            PARTITIONED BY (year, month, day)
            LOCATION '{os.path.join(tmpdir, "data")}'
        """
        ctx.sql(create_table).collect()

        # Query the table.
        query = """
            SELECT count(*) as cnt
            FROM data
            WHERE year = 2025 AND month = 1 AND day = 1
        """
        # Run the query without expecting an exception.
        result = ctx.sql(query).collect()
        # Assert the expected count. Adjust extraction of the count as needed.
        assert result[0].column("cnt")[0].as_py() == 3
```
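In case it helps triage, the same layout can presumably also be registered through the Python API instead of the DDL above. The sketch below is hypothetical: it assumes `SessionContext.register_parquet` accepts `table_partition_cols` as `(name, type)` pairs (the exact signature has varied between datafusion-python releases) and that the partition values come back as strings.

```
# Hypothetical variant of the reproduction that registers the table via the
# Python API rather than CREATE EXTERNAL TABLE. Assumes `table_partition_cols`
# takes (name, type) pairs; adjust to the signature of your datafusion-python
# version. Reuses the data/year=2025/month=1/day=1 layout created above.
import os

import pyarrow as pa
import datafusion


def query_via_register_parquet(tmpdir):
    ctx = datafusion.SessionContext()
    ctx.register_parquet(
        "data",
        os.path.join(tmpdir, "data"),
        table_partition_cols=[
            ("year", pa.string()),
            ("month", pa.string()),
            ("day", pa.string()),
        ],
    )
    # Partition columns are read back as strings here, hence the quoted literals.
    return ctx.sql(
        "SELECT count(*) AS cnt FROM data "
        "WHERE year = '2025' AND month = '1' AND day = '1'"
    ).collect()
```

I have not verified whether this path trips the same duplicate-field error, but it might help narrow down whether the problem lives in the DDL handling or in the listing table itself.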