braindevices opened a new issue, #38808: URL: https://github.com/apache/arrow/issues/38808
### Describe the bug, including details regarding any error messages, version, and platform. when we load the parquet partitioned dataset, we would expect the row order is the same as the writing time. However, pyarrow does not maintain this order, instead it relies on the actual file name order. Here is the test code: ``` import shutil import os t1 = pa.Table.from_pydict({"field1": ["a", "b", "c"]}) t2 = pa.Table.from_pydict( { "field1": ["d", "e", "f"], # "field2": ["x", "y", "z"] } ) path = "/tmp/test3.pq" def write_read_pq(with_meta: bool): shutil.rmtree(path, ignore_errors=True) schemas = [] metadatas = [] for _t in [t1, t2]: schemas.append(_t.schema) pq.write_to_dataset( _t, root_path=path, # version="2.6", metadata_collector=metadatas ) if with_meta: pq.write_metadata( schemas[-1], f"{path}/_common_metadata", # version="2.6" ) pq.write_metadata( schemas[-1], f"{path}/_metadata", # version="2.6", metadata_collector=metadatas ) import time import random expect = 'abcdef' for i in range(10): with_meta = True write_read_pq(with_meta=with_meta) time.sleep(random.randint(1,10)/10.0) ds = pa.dataset.dataset(path, schema=schemas[-1], format="parquet") readback1 = "".join(ds.to_table().to_pydict()["field1"]) readback2 = "".join(pq.read_table(path).to_pydict()["field1"]) if readback1 != expect: print(f"{with_meta=}, {i=}, {readback1=} != {expect}") if readback2 != expect: print("files in metdata:", [r["columns"][0]["file_path"] for r in pq.read_metadata(f'{path}/_metadata').to_dict()["row_groups"]]) print("files in dataset", [os.path.basename(i) for i in ds.files]) print(f"{with_meta=}, {i=}, {readback2=} != {expect}") ``` Since the generated partition file has automatically generated uuid, the order now is random, here is some example print out: ``` with_meta=True, i=0, readback1='defabc' != abcdef files in metdata: ['eda57dd733e94090bcf6d031892e867f-0.parquet', '387b597b66eb4a808a272423bf867d37-0.parquet'] files in dataset 
['387b597b66eb4a808a272423bf867d37-0.parquet', 'eda57dd733e94090bcf6d031892e867f-0.parquet'] with_meta=True, i=0, readback2='defabc' != abcdef with_meta=True, i=1, readback1='defabc' != abcdef files in metdata: ['e39fcc9d213a4de59c3b1d4ac266abae-0.parquet', 'd1e82d1fb03841bab839a62dc36163dc-0.parquet'] files in dataset ['d1e82d1fb03841bab839a62dc36163dc-0.parquet', 'e39fcc9d213a4de59c3b1d4ac266abae-0.parquet'] with_meta=True, i=1, readback2='defabc' != abcdef with_meta=True, i=4, readback1='defabc' != abcdef files in metdata: ['f41662a082224e489edf90ad2863f835-0.parquet', 'd42a4f90228a4b5ea83f304fcff6688a-0.parquet'] files in dataset ['d42a4f90228a4b5ea83f304fcff6688a-0.parquet', 'f41662a082224e489edf90ad2863f835-0.parquet'] with_meta=True, i=4, readback2='defabc' != abcdef with_meta=True, i=6, readback1='defabc' != abcdef files in metdata: ['10a670a7b05f44fcacd0b164f43d64f1-0.parquet', '0ed9e0eeaf444430aab7747a91e470c6-0.parquet'] files in dataset ['0ed9e0eeaf444430aab7747a91e470c6-0.parquet', '10a670a7b05f44fcacd0b164f43d64f1-0.parquet'] with_meta=True, i=6, readback2='defabc' != abcdef with_meta=True, i=8, readback1='defabc' != abcdef files in metdata: ['7ee93709f24c4f2ea5bc5eb076e012bc-0.parquet', '3d0fa03bdf2e4bccb1bd5d6eed03152d-0.parquet'] files in dataset ['3d0fa03bdf2e4bccb1bd5d6eed03152d-0.parquet', '7ee93709f24c4f2ea5bc5eb076e012bc-0.parquet'] with_meta=True, i=8, readback2='defabc' != abcdef with_meta=True, i=9, readback1='defabc' != abcdef files in metdata: ['ca2f3336aee94d4186d600a347453944-0.parquet', '1ac01e6407e04741b1704fa40de4fb95-0.parquet'] files in dataset ['1ac01e6407e04741b1704fa40de4fb95-0.parquet', 'ca2f3336aee94d4186d600a347453944-0.parquet'] with_meta=True, i=9, readback2='defabc' != abcdef ``` ### Component(s) Parquet, Python -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: issues-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org