Please find the query plans for both queries. FYI: I am not seeing
any planning difference between these 2 queries except the cost.
/******************************** Query on GZ
****************************************/
0: jdbc:drill:> explain plan for select channelid, count(serverTime) from
dfs.`/tmp/stest-gz/kafka_3_25-Jul-2016-12a.json.gz` group by channelid ;
+------+------+
| text | json |
+------+------+
| 00-00 Screen
00-01 Project(channelid=[$0], EXPR$1=[$1])
00-02 UnionExchange
01-01 HashAgg(group=[{0}], EXPR$1=[$SUM0($1)])
01-02 Project(channelid=[$0], EXPR$1=[$1])
01-03 HashToRandomExchange(dist0=[[$0]])
02-01 UnorderedMuxExchange
03-01 Project(channelid=[$0], EXPR$1=[$1],
E_X_P_R_H_A_S_H_F_I_E_L_D=[hash32AsDouble($0)])
03-02 HashAgg(group=[{0}], EXPR$1=[COUNT($1)])
03-03 Scan(groupscan=[EasyGroupScan
[selectionRoot=hdfs://namenode:9000/tmp/stest-gz/kafka_3_25-Jul-2016-12a.json.gz,
numFiles=1, columns=[`channelid`, `serverTime`],
files=[hdfs://namenode:9000/tmp/stest-gz/kafka_3_25-Jul-2016-12a.json.gz]]])
| {
"head" : {
"version" : 1,
"generator" : {
"type" : "ExplainHandler",
"info" : ""
},
"type" : "APACHE_DRILL_PHYSICAL",
"options" : [ ],
"queue" : 0,
"resultMode" : "EXEC"
},
"graph" : [ {
"pop" : "fs-scan",
"@id" : 196611,
"userName" : "hadoop",
"files" : [
"hdfs://namenode:9000/tmp/stest-gz/kafka_3_25-Jul-2016-12a.json.gz" ],
"storage" : {
"type" : "file",
"enabled" : true,
"connection" : "hdfs://namenode:9000",
"config" : null,
"workspaces" : {
"root" : {
"location" : "/tmp/",
"writable" : true,
"defaultInputFormat" : null
},
"tmp" : {
"location" : "/tmp",
"writable" : true,
"defaultInputFormat" : null
}
},
"formats" : {
"psv" : {
"type" : "text",
"extensions" : [ "tbl" ],
"delimiter" : "|"
},
"csv" : {
"type" : "text",
"extensions" : [ "csv" ],
"delimiter" : ","
},
"tsv" : {
"type" : "text",
"extensions" : [ "tsv" ],
"delimiter" : "\t"
},
"parquet" : {
"type" : "parquet"
},
"json" : {
"type" : "json",
"extensions" : [ "json" ]
},
"avro" : {
"type" : "avro"
}
}
},
"format" : {
"type" : "json",
"extensions" : [ "json" ]
},
"columns" : [ "`channelid`", "`serverTime`" ],
"selectionRoot" :
"hdfs://namenode:9000/tmp/stest-gz/kafka_3_25-Jul-2016-12a.json.gz",
"cost" : 1800981.0
}, {
"pop" : "hash-aggregate",
"@id" : 196610,
"child" : 196611,
"cardinality" : 1.0,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"groupByExprs" : [ {
"ref" : "`channelid`",
"expr" : "`channelid`"
} ],
"aggrExprs" : [ {
"ref" : "`EXPR$1`",
"expr" : "count(`serverTime`) "
} ],
"cost" : 900490.5
}, {
"pop" : "project",
"@id" : 196609,
"exprs" : [ {
"ref" : "`channelid`",
"expr" : "`channelid`"
}, {
"ref" : "`EXPR$1`",
"expr" : "`EXPR$1`"
}, {
"ref" : "`E_X_P_R_H_A_S_H_F_I_E_L_D`",
"expr" : "hash32asdouble(`channelid`) "
} ],
"child" : 196610,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 180098.1
}, {
"pop" : "unordered-mux-exchange",
"@id" : 131073,
"child" : 196609,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 180098.1
}, {
"pop" : "hash-to-random-exchange",
"@id" : 65539,
"child" : 131073,
"expr" : "`E_X_P_R_H_A_S_H_F_I_E_L_D`",
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 180098.1
}, {
"pop" : "project",
"@id" : 65538,
"exprs" : [ {
"ref" : "`channelid`",
"expr" : "`channelid`"
}, {
"ref" : "`EXPR$1`",
"expr" : "`EXPR$1`"
} ],
"child" : 65539,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 180098.1
}, {
"pop" : "hash-aggregate",
"@id" : 65537,
"child" : 65538,
"cardinality" : 1.0,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"groupByExprs" : [ {
"ref" : "`channelid`",
"expr" : "`channelid`"
} ],
"aggrExprs" : [ {
"ref" : "`EXPR$1`",
"expr" : "$sum0(`EXPR$1`) "
} ],
"cost" : 90049.05
}, {
"pop" : "union-exchange",
"@id" : 2,
"child" : 65537,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 18009.81
}, {
"pop" : "project",
"@id" : 1,
"exprs" : [ {
"ref" : "`channelid`",
"expr" : "`channelid`"
}, {
"ref" : "`EXPR$1`",
"expr" : "`EXPR$1`"
} ],
"child" : 2,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 18009.81
}, {
"pop" : "screen",
"@id" : 0,
"child" : 1,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 18009.81
} ]
} |
+------+------+
1 row selected (0.729 seconds)
0: jdbc:drill:>
/******************************** Query on BZ
****************************************/
0: jdbc:drill:> explain plan for select channelid, count(serverTime) from
dfs.`/tmp/stest-bz2/kafka_3_25-Jul-2016-12a.json.bz2` group by channelid ;
+------+------+
| text | json |
+------+------+
| 00-00 Screen
00-01 Project(channelid=[$0], EXPR$1=[$1])
00-02 UnionExchange
01-01 HashAgg(group=[{0}], EXPR$1=[$SUM0($1)])
01-02 Project(channelid=[$0], EXPR$1=[$1])
01-03 HashToRandomExchange(dist0=[[$0]])
02-01 UnorderedMuxExchange
03-01 Project(channelid=[$0], EXPR$1=[$1],
E_X_P_R_H_A_S_H_F_I_E_L_D=[hash32AsDouble($0)])
03-02 HashAgg(group=[{0}], EXPR$1=[COUNT($1)])
03-03 Scan(groupscan=[EasyGroupScan
[selectionRoot=hdfs://namenode:9000/tmp/stest-bz2/kafka_3_25-Jul-2016-12a.json.bz2,
numFiles=1, columns=[`channelid`, `serverTime`],
files=[hdfs://namenode:9000/tmp/stest-bz2/kafka_3_25-Jul-2016-12a.json.bz2]]])
| {
"head" : {
"version" : 1,
"generator" : {
"type" : "ExplainHandler",
"info" : ""
},
"type" : "APACHE_DRILL_PHYSICAL",
"options" : [ ],
"queue" : 0,
"resultMode" : "EXEC"
},
"graph" : [ {
"pop" : "fs-scan",
"@id" : 196611,
"userName" : "hadoop",
"files" : [
"hdfs://namenode:9000/tmp/stest-bz2/kafka_3_25-Jul-2016-12a.json.bz2" ],
"storage" : {
"type" : "file",
"enabled" : true,
"connection" : "hdfs://namenode:9000",
"config" : null,
"workspaces" : {
"root" : {
"location" : "/tmp/",
"writable" : true,
"defaultInputFormat" : null
},
"tmp" : {
"location" : "/tmp",
"writable" : true,
"defaultInputFormat" : null
}
},
"formats" : {
"psv" : {
"type" : "text",
"extensions" : [ "tbl" ],
"delimiter" : "|"
},
"csv" : {
"type" : "text",
"extensions" : [ "csv" ],
"delimiter" : ","
},
"tsv" : {
"type" : "text",
"extensions" : [ "tsv" ],
"delimiter" : "\t"
},
"parquet" : {
"type" : "parquet"
},
"json" : {
"type" : "json",
"extensions" : [ "json" ]
},
"avro" : {
"type" : "avro"
}
}
},
"format" : {
"type" : "json",
"extensions" : [ "json" ]
},
"columns" : [ "`channelid`", "`serverTime`" ],
"selectionRoot" :
"hdfs://namenode:9000/tmp/stest-bz2/kafka_3_25-Jul-2016-12a.json.bz2",
"cost" : 1148224.0
}, {
"pop" : "hash-aggregate",
"@id" : 196610,
"child" : 196611,
"cardinality" : 1.0,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"groupByExprs" : [ {
"ref" : "`channelid`",
"expr" : "`channelid`"
} ],
"aggrExprs" : [ {
"ref" : "`EXPR$1`",
"expr" : "count(`serverTime`) "
} ],
"cost" : 574112.0
}, {
"pop" : "project",
"@id" : 196609,
"exprs" : [ {
"ref" : "`channelid`",
"expr" : "`channelid`"
}, {
"ref" : "`EXPR$1`",
"expr" : "`EXPR$1`"
}, {
"ref" : "`E_X_P_R_H_A_S_H_F_I_E_L_D`",
"expr" : "hash32asdouble(`channelid`) "
} ],
"child" : 196610,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 114822.4
}, {
"pop" : "unordered-mux-exchange",
"@id" : 131073,
"child" : 196609,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 114822.4
}, {
"pop" : "hash-to-random-exchange",
"@id" : 65539,
"child" : 131073,
"expr" : "`E_X_P_R_H_A_S_H_F_I_E_L_D`",
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 114822.4
}, {
"pop" : "project",
"@id" : 65538,
"exprs" : [ {
"ref" : "`channelid`",
"expr" : "`channelid`"
}, {
"ref" : "`EXPR$1`",
"expr" : "`EXPR$1`"
} ],
"child" : 65539,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 114822.4
}, {
"pop" : "hash-aggregate",
"@id" : 65537,
"child" : 65538,
"cardinality" : 1.0,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"groupByExprs" : [ {
"ref" : "`channelid`",
"expr" : "`channelid`"
} ],
"aggrExprs" : [ {
"ref" : "`EXPR$1`",
"expr" : "$sum0(`EXPR$1`) "
} ],
"cost" : 57411.2
}, {
"pop" : "union-exchange",
"@id" : 2,
"child" : 65537,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 11482.24
}, {
"pop" : "project",
"@id" : 1,
"exprs" : [ {
"ref" : "`channelid`",
"expr" : "`channelid`"
}, {
"ref" : "`EXPR$1`",
"expr" : "`EXPR$1`"
} ],
"child" : 2,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 11482.24
}, {
"pop" : "screen",
"@id" : 0,
"child" : 1,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 11482.24
} ]
} |
+------+------+
1 row selected (0.381 seconds)
0: jdbc:drill:>
On Thu, Aug 4, 2016 at 3:07 PM, Khurram Faraaz <[email protected]> wrote:
> Can you please do an explain plan over the two aggregate queries. That way
> we can know where most of the time is being spent, is it in the query
> planning phase or is it query execution that is taking longer. Please share
> the query plans and the time taken for those explain plan statements.
>
> On Mon, Aug 1, 2016 at 3:46 PM, Shankar Mane <[email protected]>
> wrote:
>
> > It is plain json (1 json per line).
> > Each json message size = ~4kb
> > no. of json messages = ~5 Millions.
> >
> > store.parquet.compression = snappy (I don't think this parameter gets
> > used, as I am only running select queries.)
> >
> >
> > On Mon, Aug 1, 2016 at 3:27 PM, Khurram Faraaz <[email protected]>
> > wrote:
> >
> > > What is the data format within those .gz and .bz2 files? Is it Parquet
> > or
> > > JSON or plain text (CSV) ?
> > > Also, what was this config parameter `store.parquet.compression` set
> to,
> > > when you ran your test?
> > >
> > > - Khurram
> > >
> > > On Sun, Jul 31, 2016 at 11:17 PM, Shankar Mane <
> > [email protected]
> > > >
> > > wrote:
> > >
> > > > Awaiting a response.
> > > >
> > > > On 30-Jul-2016 3:20 PM, "Shankar Mane" <[email protected]>
> > > wrote:
> > > >
> > > > >
> > > >
> > > > > I am Comparing Querying speed between GZ and BZ2.
> > > > >
> > > > > Below are the 2 files and their sizes (These 2 files have the same
> data):
> > > > > kafka_3_25-Jul-2016-12a.json.gz = 1.8G
> > > > > kafka_3_25-Jul-2016-12a.json.bz2= 1.1G
> > > > >
> > > > >
> > > > >
> > > > > Results:
> > > > >
> > > > > 0: jdbc:drill:> select channelid, count(serverTime) from
> > > > dfs.`/tmp/stest-gz/kafka_3_25-Jul-2016-12a.json.gz` group by
> channelid
> > ;
> > > > > +------------+----------+
> > > > > | channelid | EXPR$1 |
> > > > > +------------+----------+
> > > > > | 3 | 977134 |
> > > > > | 0 | 836850 |
> > > > > | 2 | 3202854 |
> > > > > +------------+----------+
> > > > > 3 rows selected (86.034 seconds)
> > > > >
> > > > >
> > > > >
> > > > > 0: jdbc:drill:> select channelid, count(serverTime) from
> > > > dfs.`/tmp/stest-bz2/kafka_3_25-Jul-2016-12a.json.bz2` group by
> > channelid
> > > ;
> > > > > +------------+----------+
> > > > > | channelid | EXPR$1 |
> > > > > +------------+----------+
> > > > > | 3 | 977134 |
> > > > > | 0 | 836850 |
> > > > > | 2 | 3202854 |
> > > > > +------------+----------+
> > > > > 3 rows selected (459.079 seconds)
> > > > >
> > > > >
> > > > >
> > > > > Questions:
> > > > > 1. As per the above test, GZ is ~6x faster than BZ2. Why is that?
> > > > > 2. How can we speed up BZ2? Are there any configuration options to tune?
> > > > > 3. As bz2 is a splittable format, how is Drill using it?
> > > > >
> > > > >
> > > > > regards,
> > > > > shankar
> > > >
> > >
> >
>