[
https://issues.apache.org/jira/browse/DRILL-4378?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Jinfeng Ni reassigned DRILL-4378:
---------------------------------
Assignee: Jinfeng Ni
> CONVERT_FROM in View results in table scan of MapR-DB and perhaps HBASE
> -----------------------------------------------------------------------
>
> Key: DRILL-4378
> URL: https://issues.apache.org/jira/browse/DRILL-4378
> Project: Apache Drill
> Issue Type: Bug
> Components: Query Planning & Optimization, Storage - HBase
> Affects Versions: 1.4.0
> Reporter: John Omernik
> Assignee: Jinfeng Ni
>
> I created a view to avoid forcing users to write queries that always
> included the CONVERT_FROM statements. (I am a huge advocate of making things
> easy for the the users and writing queries with CONVERT_FROM statements isn't
> easy).
> I ran a query the other day on one of these views and noticed that a query
> that took 30 seconds really shouldn't take 30 seconds. What do I mean? well
> I wanted to get part of a record by looking up the MapR-DB Row key (equiv. to
> HBASE row key) That should be an instant lookup. Sure enough, when I tried
> it in the hbase shell that returns instantly. So why did Drill take 30
> seconds? I shot an email to Ted and Jim at MapR to ask this very question.
> Ted suggested that I try the query without a view. Sure enough, If I use the
> convert_from in a direct query, it's an instant (sub second) return. Thus it
> appears something in the view is not allowing the query to short circuit the
> read.
> Ted suggests I post here (I am curious if anyone who has HBASE setup is
> seeing this same issue with views) but also include the EXPLAIN plan.
> Basically, using my very limited ability to read EXPLAIN plans (If someone
> has a pointer to a blog post or docs on how to read EXPLAIN I would love
> that!) it looks like in the view the startRow and stopRow in the
> hbaseScanSpec are not set, seeming to cause a scan. Is there any away to
> assist the planner when running this through a view so that we can get the
> performance of the query without the view but with the easy of
> use/readability of using the view?
> Thanks!!!
> John
> View Creation
> CREATE VIEW view_testpaste as
> SELECT
> CONVERT_FROM(row_key, 'UTF8') AS pasteid,
> CONVERT_FROM(pastes.pdata.lang, 'UTF8') AS lang,
> CONVERT_FROM(pastes.raw.paste, 'UTF8') AS paste
> FROM dfs.`pastes`.`/pastes` pastes;
> Select from view takes 32 seconds (seems to be a scan)
> > select paste from view_testpaste where pasteid = 'djHEHcPM'
> 1 row selected (32.302 seconds)
> Just a direct select returns very fast (0.486 seconds)
> > select CONVERT_FROM(pastes.raw.paste, 'UTF8') AS paste
> FROM dfs.`pastes`.`/pastes` pastes where
> CONVERT_FROM(row_key, 'UTF8') = 'djHEHcPM';
> 1 row selected (0.486 seconds)
> EXPLAIN PLAN FOR select paste from view_testpaste where pasteid = 'djHEHcPM'
> +------+------+
> | text | json |
> +------+------+
> | 00-00 Screen
> 00-01 UnionExchange
> 01-01 Project(paste=[CONVERT_FROMUTF8($1)])
> 01-02 SelectionVectorRemover
> 01-03 Filter(condition=[=(CONVERT_FROMUTF8($0), 'djHEHcPM')])
> 01-04 Project(row_key=[$1], ITEM=[ITEM($0, 'paste')])
> 01-05 Scan(groupscan=[MapRDBGroupScan
> [HBaseScanSpec=HBaseScanSpec [tableName=maprfs:///data/pastebiner/pastes,
> startRow=null, stopRow=null, filter=null], columns=[`row_key`,
> `raw`.`paste`]]])
> | {
> "head" : {
> "version" : 1,
> "generator" : {
> "type" : "ExplainHandler",
> "info" : ""
> },
> "type" : "APACHE_DRILL_PHYSICAL",
> "options" : [ ],
> "queue" : 0,
> "resultMode" : "EXEC"
> },
> "graph" : [ {
> "pop" : "maprdb-scan",
> "@id" : 65541,
> "userName" : "darkness",
> "hbaseScanSpec" : {
> "tableName" : "maprfs:///data/pastebiner/pastes",
> "startRow" : "",
> "stopRow" : "",
> "serializedFilter" : null
> },
> "storage" : {
> "type" : "file",
> "enabled" : true,
> "connection" : "maprfs:///",
> "workspaces" : {
> "root" : {
> "location" : "/",
> "writable" : false,
> "defaultInputFormat" : null
> },
> "pastes" : {
> "location" : "/data/pastebiner",
> "writable" : true,
> "defaultInputFormat" : null
> },
> "dev" : {
> "location" : "/data/dev",
> "writable" : true,
> "defaultInputFormat" : null
> },
> "hive" : {
> "location" : "/user/hive",
> "writable" : true,
> "defaultInputFormat" : null
> },
> "tmp" : {
> "location" : "/tmp",
> "writable" : true,
> "defaultInputFormat" : null
> }
> },
> "formats" : {
> "psv" : {
> "type" : "text",
> "extensions" : [ "tbl" ],
> "delimiter" : "|"
> },
> "csv" : {
> "type" : "text",
> "extensions" : [ "csv" ],
> "escape" : "`",
> "delimiter" : ","
> },
> "tsv" : {
> "type" : "text",
> "extensions" : [ "tsv" ],
> "delimiter" : "\t"
> },
> "parquet" : {
> "type" : "parquet"
> },
> "json" : {
> "type" : "json"
> },
> "maprdb" : {
> "type" : "maprdb"
> }
> }
> },
> "columns" : [ "`row_key`", "`raw`.`paste`" ],
> "cost" : 573950.0
> }, {
> "pop" : "project",
> "@id" : 65540,
> "exprs" : [ {
> "ref" : "`row_key`",
> "expr" : "`row_key`"
> }, {
> "ref" : "`ITEM`",
> "expr" : "`raw`.`paste`"
> } ],
> "child" : 65541,
> "initialAllocation" : 1000000,
> "maxAllocation" : 10000000000,
> "cost" : 573950.0
> }, {
> "pop" : "filter",
> "@id" : 65539,
> "child" : 65540,
> "expr" : "equal(convert_fromutf8(`row_key`) , 'djHEHcPM') ",
> "initialAllocation" : 1000000,
> "maxAllocation" : 10000000000,
> "cost" : 86092.5
> }, {
> "pop" : "selection-vector-remover",
> "@id" : 65538,
> "child" : 65539,
> "initialAllocation" : 1000000,
> "maxAllocation" : 10000000000,
> "cost" : 86092.5
> }, {
> "pop" : "project",
> "@id" : 65537,
> "exprs" : [ {
> "ref" : "`paste`",
> "expr" : "convert_fromutf8(`ITEM`) "
> } ],
> "child" : 65538,
> "initialAllocation" : 1000000,
> "maxAllocation" : 10000000000,
> "cost" : 86092.5
> }, {
> "pop" : "union-exchange",
> "@id" : 1,
> "child" : 65537,
> "initialAllocation" : 1000000,
> "maxAllocation" : 10000000000,
> "cost" : 86092.5
> }, {
> "pop" : "screen",
> "@id" : 0,
> "child" : 1,
> "initialAllocation" : 1000000,
> "maxAllocation" : 10000000000,
> "cost" : 86092.5
> } ]
> } |
> +------+------+
> 1 row selected (0.42 seconds)
> EXPLAIN PLAN FOR select CONVERT_FROM(pastes.raw.paste, 'UTF8') AS paste
> FROM dfs.`pastes`.`/pastes` pastes where
> CONVERT_FROM(row_key, 'UTF8') = 'djHEHcPM';
> +------+------+
> | text | json |
> +------+------+
> | 00-00 Screen
> 00-01 Project(paste=[CONVERT_FROMUTF8($1)])
> 00-02 Project(row_key=[$1], ITEM=[ITEM($0, 'paste')])
> 00-03 Scan(groupscan=[MapRDBGroupScan [HBaseScanSpec=HBaseScanSpec
> [tableName=maprfs:///data/pastebiner/pastes, startRow=djHEHcPM,
> stopRow=djHEHcPM\x00, filter=RowFilter (EQUAL, djHEHcPM)],
> columns=[`row_key`, `raw`.`paste`]]])
> | {
> "head" : {
> "version" : 1,
> "generator" : {
> "type" : "ExplainHandler",
> "info" : ""
> },
> "type" : "APACHE_DRILL_PHYSICAL",
> "options" : [ ],
> "queue" : 0,
> "resultMode" : "EXEC"
> },
> "graph" : [ {
> "pop" : "maprdb-scan",
> "@id" : 3,
> "userName" : "darkness",
> "hbaseScanSpec" : {
> "tableName" : "maprfs:///data/pastebiner/pastes",
> "startRow" : "ZGpIRUhjUE0=",
> "stopRow" : "ZGpIRUhjUE0A",
> "serializedFilter" :
> "CihvcmcuYXBhY2hlLmhhZG9vcC5oYmFzZS5maWx0ZXIuUm93RmlsdGVyEkUKQwgCEj8KL29yZy5hcGFjaGUuaGFkb29wLmhiYXNlLmZpbHRlci5CaW5hcnlDb21wYXJhdG9yEgwKCgoIZGpIRUhjUE0="
> },
> "storage" : {
> "type" : "file",
> "enabled" : true,
> "connection" : "maprfs:///",
> "workspaces" : {
> "root" : {
> "location" : "/",
> "writable" : false,
> "defaultInputFormat" : null
> },
> "pastes" : {
> "location" : "/data/pastebiner",
> "writable" : true,
> "defaultInputFormat" : null
> },
> "dev" : {
> "location" : "/data/dev",
> "writable" : true,
> "defaultInputFormat" : null
> },
> "hive" : {
> "location" : "/user/hive",
> "writable" : true,
> "defaultInputFormat" : null
> },
> "tmp" : {
> "location" : "/tmp",
> "writable" : true,
> "defaultInputFormat" : null
> }
> },
> "formats" : {
> "psv" : {
> "type" : "text",
> "extensions" : [ "tbl" ],
> "delimiter" : "|"
> },
> "csv" : {
> "type" : "text",
> "extensions" : [ "csv" ],
> "escape" : "`",
> "delimiter" : ","
> },
> "tsv" : {
> "type" : "text",
> "extensions" : [ "tsv" ],
> "delimiter" : "\t"
> },
> "parquet" : {
> "type" : "parquet"
> },
> "json" : {
> "type" : "json"
> },
> "maprdb" : {
> "type" : "maprdb"
> }
> }
> },
> "columns" : [ "`row_key`", "`raw`.`paste`" ],
> "cost" : 286975.0
> }, {
> "pop" : "project",
> "@id" : 2,
> "exprs" : [ {
> "ref" : "`row_key`",
> "expr" : "`row_key`"
> }, {
> "ref" : "`ITEM`",
> "expr" : "`raw`.`paste`"
> } ],
> "child" : 3,
> "initialAllocation" : 1000000,
> "maxAllocation" : 10000000000,
> "cost" : 286975.0
> }, {
> "pop" : "project",
> "@id" : 1,
> "exprs" : [ {
> "ref" : "`paste`",
> "expr" : "convert_fromutf8(`ITEM`) "
> } ],
> "child" : 2,
> "initialAllocation" : 1000000,
> "maxAllocation" : 10000000000,
> "cost" : 286975.0
> }, {
> "pop" : "screen",
> "@id" : 0,
> "child" : 1,
> "initialAllocation" : 1000000,
> "maxAllocation" : 10000000000,
> "cost" : 286975.0
> } ]
> } |
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)