[
https://issues.apache.org/jira/browse/ARROW-16980?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Richard Tia updated ARROW-16980:
--------------------------------
Description:
SQL
{code:java}
SELECT l_tax, l_discount FROM lineitem{code}
Substrait Plan:
{code:java}
"""
{
"extensionUris": [],
"extensions": [],
"relations": [{
"root": {
"input": {
"project": {
"common": {
},
"input": {
"read": {
"common": {
"direct": {
}
},
"baseSchema": {
"names": ["L_ORDERKEY", "L_PARTKEY", "L_SUPPKEY",
"L_LINENUMBER", "L_QUANTITY", "L_EXTENDEDPRICE", "L_DISCOUNT", "L_TAX",
"L_RETURNFLAG", "L_LINESTATUS", "L_SHIPDATE", "L_COMMITDATE", "L_RECEIPTDATE",
"L_SHIPINSTRUCT", "L_SHIPMODE", "L_COMMENT"],
"struct": {
"types": [{
"i64": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"i64": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"i64": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"i32": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"decimal": {
"scale": 0,
"precision": 19,
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"decimal": {
"scale": 0,
"precision": 19,
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"decimal": {
"scale": 0,
"precision": 19,
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"decimal": {
"scale": 0,
"precision": 19,
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"fixedChar": {
"length": 1,
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"fixedChar": {
"length": 1,
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"date": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"date": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"date": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"fixedChar": {
"length": 25,
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"fixedChar": {
"length": 10,
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"varchar": {
"length": 44,
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}],
"typeVariationReference": 0,
"nullability": "NULLABILITY_REQUIRED"
}
},
"local_files": {
"items": [
{
"uri_file": "file://FILENAME_PLACEHOLDER",
"format": "FILE_FORMAT_PARQUET"
}
]
}
}
},
"expressions": [{
"selection": {
"directReference": {
"structField": {
"field": 7
}
},
"rootReference": {
}
}
}, {
"selection": {
"directReference": {
"structField": {
"field": 6
}
},
"rootReference": {
}
}
}]
}
},
"names": ["L_TAX", "L_DISCOUNT"]
}
}],
"expectedTypeUrls": []
}
""" {code}
Result:
{code:java}
pyarrow.Table
L_TAX: decimal128(19, 0)
L_DISCOUNT: decimal128(19, 0)
----
L_TAX: [[null,null,null,null,null,null,null,null,null,null]]
L_DISCOUNT: [[null,null,null,null,null,null,null,null,null,null]] {code}
Reproduction Steps:
{code:java}
import pyarrow as pa
import pyarrow.substrait as substrait
from pyarrow import json as pyarrow_json
from pyarrow.lib import tobytes
substrait_query = <Substrait plan shown above>
json_file_path = os.path.join(<path>, 'lineitem.json')
arrow_data_path_parquet = os.path.join(str(tmpdir), 'substrait_data.parquet')
substrait_query = tobytes(substrait_query.replace("FILENAME_PLACEHOLDER",
arrow_data_path_parquet))
# Save lineitem.json into a Parquet file
table = pyarrow_json.read_json(json_file_path)
pq.write_table(table, arrow_data_path_parquet)
# Run the substrait query plan
buf = pa._substrait._parse_json_plan(substrait_query)
reader = substrait.run_query(buf)
result = reader.read_all()
print(result)
{code}
lineitem.json is attached
was:
SQL
{code:java}
SELECT l_returnflag, l_linestatus FROM lineitem{code}
substrait plan type info for l_returnflag:
{code:java}
{
"fixedChar": {
"length": 1,
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}{code}
fixedChar is an extension type.
Error:
{code:java}
pyarrow/table.pxi:1223: in pyarrow.lib.ChunkedArray.chunks.__get__
???
pyarrow/table.pxi:1241: in iterchunks
???
pyarrow/table.pxi:1185: in pyarrow.lib.ChunkedArray.chunk
???
pyarrow/public-api.pxi:200: in pyarrow.lib.pyarrow_wrap_array
???
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
> ???
E AttributeError: 'pyarrow.lib.BaseExtensionType' object has no attribute
'__arrow_ext_class__'
{code}
Reproduction Steps:
{code:java}
import pyarrow as pa
import pyarrow.substrait as substrait
from pyarrow import json as pyarrow_json
from pyarrow.lib import tobytes
substrait_query = <code block below>
json_file_path = os.path.join(<path>, 'lineitem.json')
arrow_data_path_ipc = os.path.join(<path>, 'substrait_data.arrow')
substrait_query = tobytes(substrait_query.replace("FILENAME_PLACEHOLDER",
arrow_data_path_ipc))
# Save lineitem.json into IPC arrow binary file
table = pyarrow_json.read_json(json_file_path)
with pa.ipc.RecordBatchFileWriter(filepath, schema=table.schema,
arrow_data_path_ipc) as writer:
writer.write_table(table)
# Run the substrait query plan
buf = pa._substrait._parse_json_plan(substrait_query)
reader = substrait.run_query(buf)
result = reader.read_all()
print(result.columns[0].chunks)
{code}
lineitem.json is attached
substrait query plan:
{code:java}
"""
{
"extensionUris": [],
"extensions": [],
"relations": [{
"root": {
"input": {
"project": {
"common": {
},
"input": {
"read": {
"common": {
"direct": {
}
},
"baseSchema": {
"names": ["L_ORDERKEY", "L_PARTKEY", "L_SUPPKEY",
"L_LINENUMBER", "L_QUANTITY", "L_EXTENDEDPRICE", "L_DISCOUNT", "L_TAX",
"L_RETURNFLAG", "L_LINESTATUS", "L_SHIPDATE", "L_COMMITDATE", "L_RECEIPTDATE",
"L_SHIPINSTRUCT", "L_SHIPMODE", "L_COMMENT"],
"struct": {
"types": [{
"i64": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"i64": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"i64": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"i32": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"decimal": {
"scale": 0,
"precision": 19,
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"decimal": {
"scale": 0,
"precision": 19,
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"decimal": {
"scale": 0,
"precision": 19,
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"decimal": {
"scale": 0,
"precision": 19,
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"fixedChar": {
"length": 1,
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"fixedChar": {
"length": 1,
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"date": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"date": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"date": {
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"fixedChar": {
"length": 25,
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"fixedChar": {
"length": 10,
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}, {
"varchar": {
"length": 44,
"typeVariationReference": 0,
"nullability": "NULLABILITY_NULLABLE"
}
}],
"typeVariationReference": 0,
"nullability": "NULLABILITY_REQUIRED"
}
},
"local_files": {
"items": [
{
"uri_file": "file://FILENAME_PLACEHOLDER"
}
]
}
}
},
"expressions": [{
"selection": {
"directReference": {
"structField": {
"field": 8
}
},
"rootReference": {
}
}
}, {
"selection": {
"directReference": {
"structField": {
"field": 9
}
},
"rootReference": {
}
}
}]
}
},
"names": ["L_RETURNFLAG", "L_LINESTATUS"]
}
}],
"expectedTypeUrls": []
} {code}
> [Python] Results of running a substrait plan against a tpch data table
> written into parquet are all null
> --------------------------------------------------------------------------------------------------------
>
> Key: ARROW-16980
> URL: https://issues.apache.org/jira/browse/ARROW-16980
> Project: Apache Arrow
> Issue Type: Bug
> Components: Python
> Reporter: Richard Tia
> Priority: Minor
> Attachments: lineitem.json
>
>
> SQL
> {code:java}
> SELECT l_tax, l_discount FROM lineitem{code}
>
> Substrait Plan:
> {code:java}
> """
> {
> "extensionUris": [],
> "extensions": [],
> "relations": [{
> "root": {
> "input": {
> "project": {
> "common": {
> },
> "input": {
> "read": {
> "common": {
> "direct": {
> }
> },
> "baseSchema": {
> "names": ["L_ORDERKEY", "L_PARTKEY", "L_SUPPKEY",
> "L_LINENUMBER", "L_QUANTITY", "L_EXTENDEDPRICE", "L_DISCOUNT", "L_TAX",
> "L_RETURNFLAG", "L_LINESTATUS", "L_SHIPDATE", "L_COMMITDATE",
> "L_RECEIPTDATE", "L_SHIPINSTRUCT", "L_SHIPMODE", "L_COMMENT"],
> "struct": {
> "types": [{
> "i64": {
> "typeVariationReference": 0,
> "nullability": "NULLABILITY_NULLABLE"
> }
> }, {
> "i64": {
> "typeVariationReference": 0,
> "nullability": "NULLABILITY_NULLABLE"
> }
> }, {
> "i64": {
> "typeVariationReference": 0,
> "nullability": "NULLABILITY_NULLABLE"
> }
> }, {
> "i32": {
> "typeVariationReference": 0,
> "nullability": "NULLABILITY_NULLABLE"
> }
> }, {
> "decimal": {
> "scale": 0,
> "precision": 19,
> "typeVariationReference": 0,
> "nullability": "NULLABILITY_NULLABLE"
> }
> }, {
> "decimal": {
> "scale": 0,
> "precision": 19,
> "typeVariationReference": 0,
> "nullability": "NULLABILITY_NULLABLE"
> }
> }, {
> "decimal": {
> "scale": 0,
> "precision": 19,
> "typeVariationReference": 0,
> "nullability": "NULLABILITY_NULLABLE"
> }
> }, {
> "decimal": {
> "scale": 0,
> "precision": 19,
> "typeVariationReference": 0,
> "nullability": "NULLABILITY_NULLABLE"
> }
> }, {
> "fixedChar": {
> "length": 1,
> "typeVariationReference": 0,
> "nullability": "NULLABILITY_NULLABLE"
> }
> }, {
> "fixedChar": {
> "length": 1,
> "typeVariationReference": 0,
> "nullability": "NULLABILITY_NULLABLE"
> }
> }, {
> "date": {
> "typeVariationReference": 0,
> "nullability": "NULLABILITY_NULLABLE"
> }
> }, {
> "date": {
> "typeVariationReference": 0,
> "nullability": "NULLABILITY_NULLABLE"
> }
> }, {
> "date": {
> "typeVariationReference": 0,
> "nullability": "NULLABILITY_NULLABLE"
> }
> }, {
> "fixedChar": {
> "length": 25,
> "typeVariationReference": 0,
> "nullability": "NULLABILITY_NULLABLE"
> }
> }, {
> "fixedChar": {
> "length": 10,
> "typeVariationReference": 0,
> "nullability": "NULLABILITY_NULLABLE"
> }
> }, {
> "varchar": {
> "length": 44,
> "typeVariationReference": 0,
> "nullability": "NULLABILITY_NULLABLE"
> }
> }],
> "typeVariationReference": 0,
> "nullability": "NULLABILITY_REQUIRED"
> }
> },
> "local_files": {
> "items": [
> {
> "uri_file": "file://FILENAME_PLACEHOLDER",
> "format": "FILE_FORMAT_PARQUET"
> }
> ]
> }
> }
> },
> "expressions": [{
> "selection": {
> "directReference": {
> "structField": {
> "field": 7
> }
> },
> "rootReference": {
> }
> }
> }, {
> "selection": {
> "directReference": {
> "structField": {
> "field": 6
> }
> },
> "rootReference": {
> }
> }
> }]
> }
> },
> "names": ["L_TAX", "L_DISCOUNT"]
> }
> }],
> "expectedTypeUrls": []
> }
> """ {code}
>
>
> Result:
> {code:java}
> pyarrow.Table
> L_TAX: decimal128(19, 0)
> L_DISCOUNT: decimal128(19, 0)
> ----
> L_TAX: [[null,null,null,null,null,null,null,null,null,null]]
> L_DISCOUNT: [[null,null,null,null,null,null,null,null,null,null]] {code}
>
> Reproduction Steps:
> {code:java}
> import pyarrow as pa
> import pyarrow.substrait as substrait
> from pyarrow import json as pyarrow_json
> from pyarrow.lib import tobytes
> substrait_query = <Substrait plan shown above>
> json_file_path = os.path.join(<path>, 'lineitem.json')
> arrow_data_path_parquet = os.path.join(str(tmpdir), 'substrait_data.parquet')
> substrait_query = tobytes(substrait_query.replace("FILENAME_PLACEHOLDER",
> arrow_data_path_parquet))
> # Save lineitem.json into a Parquet file
> table = pyarrow_json.read_json(json_file_path)
> pq.write_table(table, arrow_data_path_parquet)
> # Run the substrait query plan
> buf = pa._substrait._parse_json_plan(substrait_query)
> reader = substrait.run_query(buf)
> result = reader.read_all()
> print(result)
> {code}
> lineitem.json is attached
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)