Wenhai created ASTERIXDB-2116: --------------------------------- Summary: Very different efficiency behavior on Nested Schema. Key: ASTERIXDB-2116 URL: https://issues.apache.org/jira/browse/ASTERIXDB-2116 Project: Apache AsterixDB Issue Type: Improvement Components: STO - Storage Environment: 8 nodes with 1/4 NCs, 128GB memory in each node, Centos Reporter: Wenhai Assignee: Pouria Attachments: nestedeff.jpg
When we apply several modified TPCH queries to the nested schema of AsterixDB, we get strange results regarding very similar queries. One can check the associated figures with the arrows denoting the runtime of the following queries. Schema: {\noformat} use dataverse tpch; drop dataset NestedPartsupp if exists; drop dataset NestedPart if exists; drop type NestedPartType if exists; drop type NestedPartType_ps if exists; drop type NestedPartsuppType if exists; drop type NestedPartsuppType_l if exists; drop type NestedLineItemType if exists; create type NestedLineItemType as closed { l_orderkey: int32, l_partkey: int32, l_suppkey: int32, l_linenumber: int32, l_quantity: int32, l_extendedprice: double, l_discount: double, l_tax: double, l_returnflag: string, l_linestatus: string, l_shipdate: string, l_commitdate: string, l_receiptdate: string, l_shipinstruct: string, l_shipmode: string, l_comment: string } create type NestedPartsuppType as closed { ps_partkey: int32, ps_suppkey: int32, ps_availqty: int32, ps_supplycost: double, ps_comment: string, l: [NestedLineItemType] } create type NestedPartType as open { p_partkey: int32, p_name: string, p_mfgr: string, p_brand: string, p_type: string, p_size: int32, p_container: string, p_retailprice: double, p_comment: string, ps: [NestedPartsuppType]? } create dataset NestedPart(NestedPartType) primary key p_partkey; {\noformat} Queries: {\noformat} MQ06 use dataverse tpch; let $revenue := sum( for $t in dataset NestedPart for $s in $t.PartsuppList for $e in $s.LineitemList where $e.l_shipdate >= '1992-01-01' and $e.l_shipdate < '1994-01-01' and $e.l_discount >= 0.0 and $e.l_discount <= 0.04 and $e.l_quantity < 40 return $e.l_extendedprice * $e.l_discount ) return { "revenue": $revenue } MQ14 use dataverse tpch; count( for $p in dataset NestedPart for $ps in $p.PartsuppList for $l in $ps.LineitemList where $l.l_shipdate >= '1993-05-01' and $l.l_shipdate < '1994-01-01' return { "pt": $p.p_type, "ep": $l.l_extendedprice, "dc": $l.l_discount } ) MQ19 use dataverse tpch; count( for $p in dataset NestedPart for $ps in $p.PartsuppList for $l in $ps.LineitemList where $p.p_brand <= 'Brand#50' and (starts-with($p.p_container, "SM") or starts-with($p.p_container, "MED") or starts-with($p.p_container, "LG") or starts-with($p.p_container, "WRAP")) and $p.p_size <= 50 and $l.l_quantity >= 0 and $l.l_quantity <= 50 and (starts-with($l.l_shipmode, "TRUCK") or starts-with($l.l_shipmode, "AIR") or starts-with($l.l_shipmode, "SHIP") or starts-with($l.l_shipmode, "FOB")) and $l.l_shipinstruct = 'DELIVER IN PERSON' return { "ep": $l.l_extendedprice, "dc": $l.l_discount } ) MQ20 use dataverse tpch; count( for $p in dataset NestedPart for $ps in $p.PartsuppList for $l in $ps.LineitemList where (contains($p.p_name, "green") or contains($p.p_name, "lemon") or contains($p.p_name, "red")) and $l.l_shipdate >= "1992-01-01" and $l.l_shipdate < "1998-01-01" return { "sk":$ps.ps_suppkey, "qt":$ps.ps_availqty } ) {\noformat} -- This message was sent by Atlassian JIRA (v6.4.14#64029)