Disclosure: We back-ported few parquet related patched to 1.4 so, not was not sure to post it on the list, but seen another thread on mailing list, and it seems identically wrong/missing behaviour.

Following is smallest re-production, on 1 VIEW on 2 directories with CSV data.

Query via view :
This is the issue => Touches 'dir3'='9' even if it should not.
select n.ts, n.ri, n.rp, n.ui, n.up from hdfs.views.nat_add n
     where n.dir0='2015' and n.dir1='12' and n.dir2='24' and n.dir3='08'
       and n.ui='93e601f1'
       and n.ts>=1450944001798 and n.ts<=1450944466491
limit 10
00-00    Screen : rowType = RecordType(BIGINT ts, ANY ri, INTEGER rp, ANY ui, 
INTEGER up): rowcount = 10.0, cumulative cost = {6617417.125326719 rows, 
7.901950522832485E7 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 175517
00-01      Project(ts=[$0], ri=[$1], rp=[$2], ui=[$3], up=[$4]) : rowType = 
RecordType(BIGINT ts, ANY ri, INTEGER rp, ANY ui, INTEGER up): rowcount = 10.0, 
cumulative cost = {6617416.125326719 rows, 7.901950422832485E7 cpu, 0.0 io, 0.0 
network, 0.0 memory}, id = 175516
00-02        SelectionVectorRemover : rowType = RecordType(BIGINT ts, ANY ri, 
INTEGER rp, ANY ui, INTEGER up): rowcount = 10.0, cumulative cost = 
{6617416.125326719 rows, 7.901950422832485E7 cpu, 0.0 io, 0.0 network, 0.0 
memory}, id = 175515
00-03          Limit(fetch=[10]) : rowType = RecordType(BIGINT ts, ANY ri, 
INTEGER rp, ANY ui, INTEGER up): rowcount = 10.0, cumulative cost = 
{6617406.125326719 rows, 7.901949422832485E7 cpu, 0.0 io, 0.0 network, 0.0 
memory}, id = 175514
00-04            Project(ts=[$4], ri=[$8], rp=[$9], ui=[$6], up=[$7]) : rowType 
= RecordType(BIGINT ts, ANY ri, INTEGER rp, ANY ui, INTEGER up): rowcount = 
25.12532671875, cumulative cost = {6617396.125326719 rows, 7.901945422832485E7 
cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 175513
00-05              SelectionVectorRemover : rowType = RecordType(ANY dir0, ANY 
dir1, ANY dir2, ANY dir3, BIGINT ts, INTEGER tp, ANY ui, INTEGER up, ANY ri, 
INTEGER rp, BIGINT fb): rowcount = 25.12532671875, cumulative cost = 
{6617396.125326719 rows, 7.901945422832485E7 cpu, 0.0 io, 0.0 network, 0.0 
memory}, id = 175512
00-06                Filter(condition=[AND(=($0, '2015'), =($1, '12'), =($2, '24'), 
=($3, '08'), =($6, '93e601f1'), >=($4, 1450944001798), <=($4, 1450944466491))]) 
: rowType = RecordType(ANY dir0, ANY dir1, ANY dir2, ANY dir3, BIGINT ts, INTEGER tp, 
ANY ui, INTEGER up, ANY ri, INTEGER rp, BIGINT fb): rowcount = 25.12532671875, 
cumulative cost = {6617371.0 rows, 7.901942910299812E7 cpu, 0.0 io, 0.0 network, 0.0 
memory}, id = 175511
00-07                  UnionAll(all=[true]) : rowType = RecordType(ANY dir0, 
ANY dir1, ANY dir2, ANY dir3, BIGINT ts, INTEGER tp, ANY ui, INTEGER up, ANY 
ri, INTEGER rp, BIGINT fb): rowcount = 1323474.0, cumulative cost = {5293897.0 
rows, 7.27910856E7 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 175510
00-09                    Project(dir0=[$1], dir1=[$2], dir2=[$3], dir3=[$4], 
ts=[CAST($5):BIGINT], tp=[CAST($6):INTEGER], ui=[$7], up=[CAST($8):INTEGER], 
ri=[$9], rp=[CAST($10):INTEGER], fb=[CAST($11):BIGINT]) : rowType = 
RecordType(ANY dir0, ANY dir1, ANY dir2, ANY dir3, BIGINT ts, INTEGER tp, ANY 
ui, INTEGER up, ANY ri, INTEGER rp, BIGINT fb): rowcount = 1323473.0, 
cumulative cost = {3970419.0 rows, 7.1467542E7 cpu, 0.0 io, 0.0 network, 0.0 
memory}, id = 175504
00-11                      Project(dir5=[$4], dir0=[$1], dir1=[$0], dir2=[$2], 
dir3=[$3], ITEM=[ITEM($5, 0)], ITEM6=[ITEM($5, 1)], ITEM7=[ITEM($5, 2)], 
ITEM8=[ITEM($5, 3)], ITEM9=[ITEM($5, 4)], ITEM10=[ITEM($5, 5)], 
ITEM11=[ITEM($5, 9)]) : rowType = RecordType(ANY dir5, ANY dir0, ANY dir1, ANY 
dir2, ANY dir3, ANY ITEM, ANY ITEM6, ANY ITEM7, ANY ITEM8, ANY ITEM9, ANY 
ITEM10, ANY ITEM11): rowcount = 1323473.0, cumulative cost = {2646946.0 rows, 
4.4998082E7 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 175503
00-13                        Scan(groupscan=[EasyGroupScan 
[selectionRoot=hdfs://drssc9a2.st.demo.com:8020/dp/data/nat/current, 
numFiles=2, columns=[`dir5`, `dir0`, `dir1`, `dir2`, `dir3`, `columns`[0], 
`columns`[1], `columns`[2], `columns`[3], `columns`[4], `columns`[5], 
`columns`[9]], 
files=[hdfs://drssc9a2.st.demo.com:8020/dp/data/nat/current/2015/12/24/08/in/na/na_0.1454686813882.csv,
 
hdfs://drssc9a2.st.demo.com:8020/dp/data/nat/current/2015/12/24/09/in/na/na_0.1454686813883.csv]]])
 : rowType = RecordType(ANY dir1, ANY dir0, ANY dir2, ANY dir3, ANY dir5, ANY 
columns): rowcount = 1323473.0, cumulative cost = {1323473.0 rows, 7940838.0 
cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 175502
00-08                    Project(dir0=['open'], dir1=[''], dir2=[''], 
dir3=[''], ts=[CAST($1):BIGINT], tp=[CAST($2):INTEGER], ui=[$3], 
up=[CAST($4):INTEGER], ri=[$5], rp=[CAST($6):INTEGER], fb=[CAST($7):BIGINT]) : 
rowType = RecordType(CHAR(4) dir0, CHAR(0) dir1, CHAR(0) dir2, CHAR(0) dir3, 
BIGINT ts, INTEGER tp, ANY ui, INTEGER up, ANY ri, INTEGER rp, BIGINT fb): 
rowcount = 1.0, cumulative cost = {4.0 rows, 69.6 cpu, 0.0 io, 0.0 network, 0.0 
memory}, id = 175509
00-10                      SelectionVectorRemover : rowType = RecordType(ANY 
dir0, ANY ITEM, ANY ITEM2, ANY ITEM3, ANY ITEM4, ANY ITEM5, ANY ITEM6, ANY 
ITEM7): rowcount = 1.0, cumulative cost = {3.0 rows, 33.6 cpu, 0.0 io, 0.0 
network, 0.0 memory}, id = 175508
00-12                        Filter(condition=[=($0, 'na')]) : rowType = 
RecordType(ANY dir0, ANY ITEM, ANY ITEM2, ANY ITEM3, ANY ITEM4, ANY ITEM5, ANY 
ITEM6, ANY ITEM7): rowcount = 1.0, cumulative cost = {2.0 rows, 32.6 cpu, 0.0 
io, 0.0 network, 0.0 memory}, id = 175507
00-14                          Project(dir0=[$0], ITEM=[ITEM($1, 0)], 
ITEM2=[ITEM($1, 1)], ITEM3=[ITEM($1, 2)], ITEM4=[ITEM($1, 3)], ITEM5=[ITEM($1, 
4)], ITEM6=[ITEM($1, 5)], ITEM7=[ITEM($1, 9)]) : rowType = RecordType(ANY dir0, 
ANY ITEM, ANY ITEM2, ANY ITEM3, ANY ITEM4, ANY ITEM5, ANY ITEM6, ANY ITEM7): 
rowcount = 1.0, cumulative cost = {1.0 rows, 28.0 cpu, 0.0 io, 0.0 network, 0.0 
memory}, id = 175506
00-15                            Scan(groupscan=[EasyGroupScan 
[selectionRoot=hdfs://drssc9a2.st.demo.com:8020/dp/data/nat/open, numFiles=1, 
columns=[`dir0`, `columns`[0], `columns`[1], `columns`[2], `columns`[3], 
`columns`[4], `columns`[5], `columns`[9]], 
files=[hdfs://drssc9a2.st.demo.com:8020/dp/data/nat/open/header.csv]]]) : 
rowType = RecordType(ANY dir0, ANY columns): rowcount = 1.0, cumulative cost = 
{0.0 rows, 0.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 175505


Query on Original Table :
select cast(n.columns[0] as BIGINT) as ts, n.columns[4] as ri, 
cast(n.columns[5] as INTEGER) as rp, n.columns[2] as ui, cast(n.columns[3] as 
INTEGER) as up from hdfs.dp.`data/nat/current/` n
     where n.dir0='2015' and n.dir1='12' and n.dir2='24' and n.dir3='08' and 
dir5='na'
       and n.columns[2]='93e601f1'
       and cast(n.columns[0] as BIGINT)>=1450944001798 and cast(n.columns[0] as 
BIGINT)<=1450944466491
limit 10
00-00    Screen : rowType = RecordType(BIGINT ts, ANY ri, INTEGER rp, ANY ui, 
INTEGER up): rowcount = 10.0, cumulative cost = {3762268.1423775004 rows, 
3.8507763137538746E7 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 35715
00-01      Project(ts=[$0], ri=[$1], rp=[$2], ui=[$3], up=[$4]) : rowType = 
RecordType(BIGINT ts, ANY ri, INTEGER rp, ANY ui, INTEGER up): rowcount = 10.0, 
cumulative cost = {3762267.1423775004 rows, 3.8507762137538746E7 cpu, 0.0 io, 
0.0 network, 0.0 memory}, id = 35714
00-02        SelectionVectorRemover : rowType = RecordType(BIGINT ts, ANY ri, 
INTEGER rp, ANY ui, INTEGER up): rowcount = 10.0, cumulative cost = 
{3762267.1423775004 rows, 3.8507762137538746E7 cpu, 0.0 io, 0.0 network, 0.0 
memory}, id = 35713
00-03          Limit(fetch=[10]) : rowType = RecordType(BIGINT ts, ANY ri, 
INTEGER rp, ANY ui, INTEGER up): rowcount = 10.0, cumulative cost = 
{3762257.1423775004 rows, 3.8507752137538746E7 cpu, 0.0 io, 0.0 network, 0.0 
memory}, id = 35712
00-04            Project(ts=[CAST($5):BIGINT], ri=[$6], rp=[CAST($7):INTEGER], 
ui=[$8], up=[CAST($9):INTEGER]) : rowType = RecordType(BIGINT ts, ANY ri, 
INTEGER rp, ANY ui, INTEGER up): rowcount = 3.5711887499999997, cumulative cost 
= {3762247.1423775004 rows, 3.8507712137538746E7 cpu, 0.0 io, 0.0 network, 0.0 
memory}, id = 35711
00-05              SelectionVectorRemover : rowType = RecordType(ANY dir0, ANY 
dir1, ANY dir2, ANY dir3, ANY dir5, ANY ITEM, ANY ITEM6, ANY ITEM7, ANY ITEM8, 
ANY ITEM9): rowcount = 3.5711887499999997, cumulative cost = {3762243.57118875 
rows, 3.850766928327375E7 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 35710
00-06                Filter(condition=[AND(=($0, '2015'), =($1, '12'), =($2, '24'), 
=($3, '08'), =($4, 'na'), =($8, '93e601f1'), >=(CAST($5):BIGINT, 1450944001798), 
<=(CAST($5):BIGINT, 1450944466491))]) : rowType = RecordType(ANY dir0, ANY dir1, 
ANY dir2, ANY dir3, ANY dir5, ANY ITEM, ANY ITEM6, ANY ITEM7, ANY ITEM8, ANY ITEM9): 
rowcount = 3.5711887499999997, cumulative cost = {3762240.0 rows, 3.8507665712085E7 
cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 35709
00-07                  Project(dir0=[$0], dir1=[$3], dir2=[$4], dir3=[$1], 
dir5=[$2], ITEM=[ITEM($5, 0)], ITEM6=[ITEM($5, 4)], ITEM7=[ITEM($5, 5)], 
ITEM8=[ITEM($5, 2)], ITEM9=[ITEM($5, 3)]) : rowType = RecordType(ANY dir0, ANY 
dir1, ANY dir2, ANY dir3, ANY dir5, ANY ITEM, ANY ITEM6, ANY ITEM7, ANY ITEM8, 
ANY ITEM9): rowcount = 1254080.0, cumulative cost = {2508160.0 rows, 3.260608E7 
cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 35708
00-08                    Scan(groupscan=[EasyGroupScan 
[selectionRoot=hdfs://drssc9a2.st.demo.com:8020/dp/data/nat/current, 
numFiles=1, columns=[`dir0`, `dir1`, `dir2`, `dir3`, `dir5`, `columns`[0], 
`columns`[4], `columns`[5], `columns`[2], `columns`[3]], 
files=[hdfs://drssc9a2.st.demo.com:8020/dp/data/nat/current/2015/12/24/08/in/na/na_0.1454686813882.csv]]])
 : rowType = RecordType(ANY dir0, ANY dir3, ANY dir5, ANY dir1, ANY dir2, ANY 
columns): rowcount = 1254080.0, cumulative cost = {1254080.0 rows, 7524480.0 
cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 35707



VIEW Definition:
create or replaceview hdfs.views.nat_addas select dir0, dir1, dir2, dir3, 
cast(n.columns[0]as BIGINT)as ts, cast(n.columns[1]as INTEGER)as tp, 
n.columns[2]as ui, cast(n.columns[3]as INTEGER)as up, n.columns[4]as ri, 
cast(n.columns[5]as INTEGER)as rp, cast(n.columns[9]as BIGINT)as fb
from hdfs.ee.`data/nat/current/` n
where dir5='na' union all select 'open' as dir0, '' as dir1, '' as dir2, '' as 
dir3, cast(n.columns[0]as BIGINT)as ts, cast(n.columns[1]as INTEGER)as tp, 
n.columns[2]as ui, cast(n.columns[3]as INTEGER)as up, n.columns[4]as ri, 
cast(n.columns[5]as INTEGER)as rp, cast(n.columns[9]as BIGINT)as fb
from hdfs.ee.`data/nat/open/` n
where dir0='na';

1) Can somebody pitch in help us categorize this behaviour?
2) Does using "UNION ALL" is causing this?

Thanks in advance for any pointers.



Reply via email to