[ 
https://issues.apache.org/jira/browse/HIVE-29616?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

jinqi long updated HIVE-29616:
------------------------------
    Description: 
The logic in ExprProcFactory#findSourceColumn resolves source columns from 
TopOps by matching table and field aliases.  If a match is found, it returns 
the result directly.  This implementation fails in scenarios involving multiple 
subqueries with identical table aliases (e.g., in a UNION statement).  Because 
the search returns the first match it encounters, it may link to the wrong 
source column from a different subquery branch, leading to incorrect lineage.
For example:
{code:java}
create table table_3 as
select id1 from table_1 t1 where t1.id2 = 1
union all
select id1 from table_2 t1 where t1.id2 = 2;{code}
 
The current result is:
{code:java}
{
    "version": "1.0",
    "engine": "tez",
    "database": "default",
    "hash": "24a0f860f60a1b7d5f350fd8eb164a37",
    "queryText": "create table table_3 as\nselect id1 from table_1 t1 where 
t1.id2 = 1\nunion all\nselect id1 from table_2 t1 where t1.id2 = 2",
    "edges": [
        {
            "sources": [
                1,
                2
            ],
            "targets": [
                0
            ],
            "expression": "id1",
            "edgeType": "PROJECTION"
        },
        {
            "sources": [
                3
            ],
            "targets": [
                0
            ],
            "expression": "(t1.id2 = 1)",
            "edgeType": "PREDICATE"
        },
        {
            "sources": [
                3
            ],
            "targets": [
                0
            ],
            "expression": "(t1.id2 = 2)",
            "edgeType": "PREDICATE"
        }
    ],
    "vertices": [
        {
            "id": 0,
            "vertexType": "COLUMN",
            "vertexId": "default.table_3.id1"
        },
        {
            "id": 1,
            "vertexType": "COLUMN",
            "vertexId": "default.table_1.id1"
        },
        {
            "id": 2,
            "vertexType": "COLUMN",
            "vertexId": "default.table_2.id1"
        },
        {
            "id": 3,
            "vertexType": "COLUMN",
            "vertexId": "default.table_1.id2"
        }
    ]
}{code}
The correct result should contain two PREDICATE edges:
"sources": [default.table_1.id2],"targets": [default.table_3.id1]
"sources": [default.table_2.id2],"targets": [default.table_3.id1]
and one PROJECTION edge:
"sources": [default.table_1.id1,default.table_2.id1],"targets": [default.table_3.id1]

 

  was:
The logic in ExprProcFactory#findSourceColumn resolves source columns from 
TopOps by matching table and field aliases.  If a match is found, it returns 
the result directly.  This implementation fails in scenarios involving multiple 
subqueries with identical table aliases (e.g., in a UNION statement).  Because 
the search returns the first match it encounters, it may link to the wrong 
source column from a different subquery branch, leading to incorrect lineage. 
for example:
{code:java}
create table table_3 as
select id1 from table_1 t1 where t1.id2 = 1
union all
select id1 from table_2 t1 where t1.id2 = 2;{code}


> Incorrect column lineage when multiple subqueries have identical table aliases
> ------------------------------------------------------------------------------
>
>                 Key: HIVE-29616
>                 URL: https://issues.apache.org/jira/browse/HIVE-29616
>             Project: Hive
>          Issue Type: Bug
>          Components: lineage
>    Affects Versions: 1.1.0
>            Reporter: jinqi long
>            Priority: Major
>             Fix For: 4.3.0
>
>
> The logic in ExprProcFactory#findSourceColumn resolves source columns from 
> TopOps by matching table and field aliases.  If a match is found, it returns 
> the result directly.  This implementation fails in scenarios involving 
> multiple subqueries with identical table aliases (e.g., in a UNION 
> statement).  Because the search returns the first match it encounters, it may 
> link to the wrong source column from a different subquery branch, leading to 
> incorrect lineage. For example:
> {code:java}
> create table table_3 as
> select id1 from table_1 t1 where t1.id2 = 1
> union all
> select id1 from table_2 t1 where t1.id2 = 2;{code}
>  
> The current result is:
> {code:java}
> {
>     "version": "1.0",
>     "engine": "tez",
>     "database": "default",
>     "hash": "24a0f860f60a1b7d5f350fd8eb164a37",
>     "queryText": "create table table_3 as\nselect id1 from table_1 t1 where 
> t1.id2 = 1\nunion all\nselect id1 from table_2 t1 where t1.id2 = 2",
>     "edges": [
>         {
>             "sources": [
>                 1,
>                 2
>             ],
>             "targets": [
>                 0
>             ],
>             "expression": "id1",
>             "edgeType": "PROJECTION"
>         },
>         {
>             "sources": [
>                 3
>             ],
>             "targets": [
>                 0
>             ],
>             "expression": "(t1.id2 = 1)",
>             "edgeType": "PREDICATE"
>         },
>         {
>             "sources": [
>                 3
>             ],
>             "targets": [
>                 0
>             ],
>             "expression": "(t1.id2 = 2)",
>             "edgeType": "PREDICATE"
>         }
>     ],
>     "vertices": [
>         {
>             "id": 0,
>             "vertexType": "COLUMN",
>             "vertexId": "default.table_3.id1"
>         },
>         {
>             "id": 1,
>             "vertexType": "COLUMN",
>             "vertexId": "default.table_1.id1"
>         },
>         {
>             "id": 2,
>             "vertexType": "COLUMN",
>             "vertexId": "default.table_2.id1"
>         },
>         {
>             "id": 3,
>             "vertexType": "COLUMN",
>             "vertexId": "default.table_1.id2"
>         }
>     ]
> }{code}
> The correct result should contain two PREDICATE edges:
> "sources": [default.table_1.id2],"targets": [default.table_3.id1]
> "sources": [default.table_2.id2],"targets": [default.table_3.id1]
> and one PROJECTION edge:
> "sources": [default.table_1.id1,default.table_2.id1],"targets": [default.table_3.id1]
>  



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to