Please find the query plans for both queries. FYI: I am not seeing
any planning difference between these 2 queries except the cost.
/******************************** Query on GZ
****************************************/
0: jdbc:drill:> explain plan for select channelid, count(serverTime) from
dfs.`/tmp/stest-gz/kafka_3_25-Jul-2016-12a.json.gz` group by channelid ;
+------+------+
| text | json |
+------+------+
| 00-00 Screen
00-01 Project(channelid=[$0], EXPR$1=[$1])
00-02 UnionExchange
01-01 HashAgg(group=[{0}], EXPR$1=[$SUM0($1)])
01-02 Project(channelid=[$0], EXPR$1=[$1])
01-03 HashToRandomExchange(dist0=[[$0]])
02-01 UnorderedMuxExchange
03-01 Project(channelid=[$0], EXPR$1=[$1],
E_X_P_R_H_A_S_H_F_I_E_L_D=[hash32AsDouble($0)])
03-02 HashAgg(group=[{0}], EXPR$1=[COUNT($1)])
03-03 Scan(groupscan=[EasyGroupScan
[selectionRoot=hdfs://namenode:9000/tmp/stest-gz/kafka_3_25-Jul-2016-12a.json.gz,
numFiles=1, columns=[`channelid`, `serverTime`],
files=[hdfs://namenode:9000/tmp/stest-gz/kafka_3_25-Jul-2016-12a.json.gz]]])
| {
"head" : {
"version" : 1,
"generator" : {
"type" : "ExplainHandler",
"info" : ""
},
"type" : "APACHE_DRILL_PHYSICAL",
"options" : [ ],
"queue" : 0,
"resultMode" : "EXEC"
},
"graph" : [ {
"pop" : "fs-scan",
"@id" : 196611,
"userName" : "hadoop",
"files" : [
"hdfs://namenode:9000/tmp/stest-gz/kafka_3_25-Jul-2016-12a.json.gz" ],
"storage" : {
"type" : "file",
"enabled" : true,
"connection" : "hdfs://namenode:9000",
"config" : null,
"workspaces" : {
"root" : {
"location" : "/tmp/",
"writable" : true,
"defaultInputFormat" : null
},
"tmp" : {
"location" : "/tmp",
"writable" : true,
"defaultInputFormat" : null
}
},
"formats" : {
"psv" : {
"type" : "text",
"extensions" : [ "tbl" ],
"delimiter" : "|"
},
"csv" : {
"type" : "text",
"extensions" : [ "csv" ],
"delimiter" : ","
},
"tsv" : {
"type" : "text",
"extensions" : [ "tsv" ],
"delimiter" : "\t"
},
"parquet" : {
"type" : "parquet"
},
"json" : {
"type" : "json",
"extensions" : [ "json" ]
},
"avro" : {
"type" : "avro"
}
}
},
"format" : {
"type" : "json",
"extensions" : [ "json" ]
},
"columns" : [ "`channelid`", "`serverTime`" ],
"selectionRoot" :
"hdfs://namenode:9000/tmp/stest-gz/kafka_3_25-Jul-2016-12a.json.gz",
"cost" : 1800981.0
}, {
"pop" : "hash-aggregate",
"@id" : 196610,
"child" : 196611,
"cardinality" : 1.0,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"groupByExprs" : [ {
"ref" : "`channelid`",
"expr" : "`channelid`"
} ],
"aggrExprs" : [ {
"ref" : "`EXPR$1`",
"expr" : "count(`serverTime`) "
} ],
"cost" : 900490.5
}, {
"pop" : "project",
"@id" : 196609,
"exprs" : [ {
"ref" : "`channelid`",
"expr" : "`channelid`"
}, {
"ref" : "`EXPR$1`",
"expr" : "`EXPR$1`"
}, {
"ref" : "`E_X_P_R_H_A_S_H_F_I_E_L_D`",
"expr" : "hash32asdouble(`channelid`) "
} ],
"child" : 196610,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 180098.1
}, {
"pop" : "unordered-mux-exchange",
"@id" : 131073,
"child" : 196609,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 180098.1
}, {
"pop" : "hash-to-random-exchange",
"@id" : 65539,
"child" : 131073,
"expr" : "`E_X_P_R_H_A_S_H_F_I_E_L_D`",
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 180098.1
}, {
"pop" : "project",
"@id" : 65538,
"exprs" : [ {
"ref" : "`channelid`",
"expr" : "`channelid`"
}, {
"ref" : "`EXPR$1`",
"expr" : "`EXPR$1`"
} ],
"child" : 65539,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 180098.1
}, {
"pop" : "hash-aggregate",
"@id" : 65537,
"child" : 65538,
"cardinality" : 1.0,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"groupByExprs" : [ {
"ref" : "`channelid`",
"expr" : "`channelid`"
} ],
"aggrExprs" : [ {
"ref" : "`EXPR$1`",
"expr" : "$sum0(`EXPR$1`) "
} ],
"cost" : 90049.05
}, {
"pop" : "union-exchange",
"@id" : 2,
"child" : 65537,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 18009.81
}, {
"pop" : "project",
"@id" : 1,
"exprs" : [ {
"ref" : "`channelid`",
"expr" : "`channelid`"
}, {
"ref" : "`EXPR$1`",
"expr" : "`EXPR$1`"
} ],
"child" : 2,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 18009.81
}, {
"pop" : "screen",
"@id" : 0,
"child" : 1,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 18009.81
} ]
} |
+------+------+
1 row selected (0.729 seconds)
0: jdbc:drill:>
/******************************** Query on BZ
****************************************/
0: jdbc:drill:> explain plan for select channelid, count(serverTime) from
dfs.`/tmp/stest-bz2/kafka_3_25-Jul-2016-12a.json.bz2` group by channelid ;
+------+------+
| text | json |
+------+------+
| 00-00 Screen
00-01 Project(channelid=[$0], EXPR$1=[$1])
00-02 UnionExchange
01-01 HashAgg(group=[{0}], EXPR$1=[$SUM0($1)])
01-02 Project(channelid=[$0], EXPR$1=[$1])
01-03 HashToRandomExchange(dist0=[[$0]])
02-01 UnorderedMuxExchange
03-01 Project(channelid=[$0], EXPR$1=[$1],
E_X_P_R_H_A_S_H_F_I_E_L_D=[hash32AsDouble($0)])
03-02 HashAgg(group=[{0}], EXPR$1=[COUNT($1)])
03-03 Scan(groupscan=[EasyGroupScan
[selectionRoot=hdfs://namenode:9000/tmp/stest-bz2/kafka_3_25-Jul-2016-12a.json.bz2,
numFiles=1, columns=[`channelid`, `serverTime`],
files=[hdfs://namenode:9000/tmp/stest-bz2/kafka_3_25-Jul-2016-12a.json.bz2]]])
| {
"head" : {
"version" : 1,
"generator" : {
"type" : "ExplainHandler",
"info" : ""
},
"type" : "APACHE_DRILL_PHYSICAL",
"options" : [ ],
"queue" : 0,
"resultMode" : "EXEC"
},
"graph" : [ {
"pop" : "fs-scan",
"@id" : 196611,
"userName" : "hadoop",
"files" : [
"hdfs://namenode:9000/tmp/stest-bz2/kafka_3_25-Jul-2016-12a.json.bz2" ],
"storage" : {
"type" : "file",
"enabled" : true,
"connection" : "hdfs://namenode:9000",
"config" : null,
"workspaces" : {
"root" : {
"location" : "/tmp/",
"writable" : true,
"defaultInputFormat" : null
},
"tmp" : {
"location" : "/tmp",
"writable" : true,
"defaultInputFormat" : null
}
},
"formats" : {
"psv" : {
"type" : "text",
"extensions" : [ "tbl" ],
"delimiter" : "|"
},
"csv" : {
"type" : "text",
"extensions" : [ "csv" ],
"delimiter" : ","
},
"tsv" : {
"type" : "text",
"extensions" : [ "tsv" ],
"delimiter" : "\t"
},
"parquet" : {
"type" : "parquet"
},
"json" : {
"type" : "json",
"extensions" : [ "json" ]
},
"avro" : {
"type" : "avro"
}
}
},
"format" : {
"type" : "json",
"extensions" : [ "json" ]
},
"columns" : [ "`channelid`", "`serverTime`" ],
"selectionRoot" :
"hdfs://namenode:9000/tmp/stest-bz2/kafka_3_25-Jul-2016-12a.json.bz2",
"cost" : 1148224.0
}, {
"pop" : "hash-aggregate",
"@id" : 196610,
"child" : 196611,
"cardinality" : 1.0,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"groupByExprs" : [ {
"ref" : "`channelid`",
"expr" : "`channelid`"
} ],
"aggrExprs" : [ {
"ref" : "`EXPR$1`",
"expr" : "count(`serverTime`) "
} ],
"cost" : 574112.0
}, {
"pop" : "project",
"@id" : 196609,
"exprs" : [ {
"ref" : "`channelid`",
"expr" : "`channelid`"
}, {
"ref" : "`EXPR$1`",
"expr" : "`EXPR$1`"
}, {
"ref" : "`E_X_P_R_H_A_S_H_F_I_E_L_D`",
"expr" : "hash32asdouble(`channelid`) "
} ],
"child" : 196610,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 114822.4
}, {
"pop" : "unordered-mux-exchange",
"@id" : 131073,
"child" : 196609,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 114822.4
}, {
"pop" : "hash-to-random-exchange",
"@id" : 65539,
"child" : 131073,
"expr" : "`E_X_P_R_H_A_S_H_F_I_E_L_D`",
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 114822.4
}, {
"pop" : "project",
"@id" : 65538,
"exprs" : [ {
"ref" : "`channelid`",
"expr" : "`channelid`"
}, {
"ref" : "`EXPR$1`",
"expr" : "`EXPR$1`"
} ],
"child" : 65539,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 114822.4
}, {
"pop" : "hash-aggregate",
"@id" : 65537,
"child" : 65538,
"cardinality" : 1.0,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"groupByExprs" : [ {
"ref" : "`channelid`",
"expr" : "`channelid`"
} ],
"aggrExprs" : [ {
"ref" : "`EXPR$1`",
"expr" : "$sum0(`EXPR$1`) "
} ],
"cost" : 57411.2
}, {
"pop" : "union-exchange",
"@id" : 2,
"child" : 65537,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 11482.24
}, {
"pop" : "project",
"@id" : 1,
"exprs" : [ {
"ref" : "`channelid`",
"expr" : "`channelid`"
}, {
"ref" : "`EXPR$1`",
"expr" : "`EXPR$1`"
} ],
"child" : 2,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 11482.24
}, {
"pop" : "screen",
"@id" : 0,
"child" : 1,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 11482.24
} ]
} |
+------+------+
1 row selected (0.381 seconds)
0: jdbc:drill:>
On Thu, Aug 4, 2016 at 3:07 PM, Khurram Faraaz <[email protected]> wrote:
> Can you please do an explain plan over the two aggregate queries. That way
> we can know where most of the time is being spent, is it in the query
> planning phase or is it query execution that is taking longer. Please share
> the query plans and the time taken for those explain plan statements.
>
> On Mon, Aug 1, 2016 at 3:46 PM, Shankar Mane <[email protected]>
> wrote:
>
> > It is plain json (1 json per line).
> > Each json message size = ~4kb
> > no. of json messages = ~5 Millions.
> >
> > store.parquet.compression = snappy (I don't think this parameter gets
> > used, as I am only running select queries.)
> >
> >
> > On Mon, Aug 1, 2016 at 3:27 PM, Khurram Faraaz <[email protected]>
> > wrote:
> >
> > > What is the data format within those .gz and .bz2 files? Is it Parquet
> > or
> > > JSON or plain text (CSV) ?
> > > Also, what was this config parameter `store.parquet.compression` set
> to,
> > > when you ran your test?
> > >
> > > - Khurram
> > >
> > > On Sun, Jul 31, 2016 at 11:17 PM, Shankar Mane <
> > [email protected]
> > > >
> > > wrote:
> > >
> > > > Awaiting a response.
> > > >
> > > > On 30-Jul-2016 3:20 PM, "Shankar Mane" <[email protected]>
> > > wrote:
> > > >
> > > > >
> > > >
> > > > > I am Comparing Querying speed between GZ and BZ2.
> > > > >
> > > > > Below are the 2 files and their sizes (These 2 files have the same
> data):
> > > > > kafka_3_25-Jul-2016-12a.json.gz = 1.8G
> > > > > kafka_3_25-Jul-2016-12a.json.bz2= 1.1G
> > > > >
> > > > >
> > > > >
> > > > > Results:
> > > > >
> > > > > 0: jdbc:drill:> select channelid, count(serverTime) from
> > > > dfs.`/tmp/stest-gz/kafka_3_25-Jul-2016-12a.json.gz` group by
> channelid
> > ;
> > > > > +------------+----------+
> > > > > | channelid | EXPR$1 |
> > > > > +------------+----------+
> > > > > | 3 | 977134 |
> > > > > | 0 | 836850 |
> > > > > | 2 | 3202854 |
> > > > > +------------+----------+
> > > > > 3 rows selected (86.034 seconds)
> > > > >
> > > > >
> > > > >
> > > > > 0: jdbc:drill:> select channelid, count(serverTime) from
> > > > dfs.`/tmp/stest-bz2/kafka_3_25-Jul-2016-12a.json.bz2` group by
> > channelid
> > > ;
> > > > > +------------+----------+
> > > > > | channelid | EXPR$1 |
> > > > > +------------+----------+
> > > > > | 3 | 977134 |
> > > > > | 0 | 836850 |
> > > > > | 2 | 3202854 |
> > > > > +------------+----------+
> > > > > 3 rows selected (459.079 seconds)
> > > > >
> > > > >
> > > > >
> > > > > Questions:
> > > > > 1. As per the above test, GZ is ~6x faster than BZ2. Why is that?
> > > > > 2. How can we speed up BZ2? Are there any configuration options to tune?
> > > > > 3. As bz2 is a splittable format, how is Drill using it?
> > > > >
> > > > >
> > > > > regards,
> > > > > shankar
> > > >
> > >
> >
>