Stamatis Zampetakis created HIVE-29362:
------------------------------------------

             Summary: New configuration to display EXPLAIN FORMATTED in 
human-readable format
                 Key: HIVE-29362
                 URL: https://issues.apache.org/jira/browse/HIVE-29362
             Project: Hive
          Issue Type: Improvement
          Components: HiveServer2
    Affects Versions: 4.2.0
            Reporter: Stamatis Zampetakis
            Assignee: Stamatis Zampetakis


The EXPLAIN FORMATTED (introduced by HIVE-2546) returns a JSON output that is 
primarily meant to be consumed programmaticaly by other applications thus it is 
returned as a single line string with no indentation/formatting.

However, in some cases it is convenient to display the result in a 
human-readable format. For example, in the context of qtests its much easier 
for a contributor/reviewer to understand the output when it is properly 
formatted than when everything is packed into a single line.

In addition, having a properly formatted JSON file in git has multiple 
advantages:
 * less merge conflicts (since we don't always touch the same line)
 * cleaner history/blame (since we see exactly the part of the plan that is 
affected)
 * cleaner diff

Last but not least, a formatted JSON file significantly improves the runtime of 
tests utilizing the 
[QOutProcessor|https://github.com/apache/hive/blob/4c94300ccc9f4419967aeacfba22975440810cc3/itests/util/src/main/java/org/apache/hadoop/hive/ql/QOutProcessor.java].
 The QOutProcessor relies heavily on pattern matching and its very slow when 
input lines are large. Currently, running TestTezTPCDS30TBPerfCliDriver with 
query4.q (modified to EXPLAIN FORMATTED) takes ~100 seconds while if the JSON 
is formatted/multi-line the time goes down to ~20 seconds.

The goal is to add a new configuration property 
({{{}hive.explain.formatted.indent{}}}) for controlling the display format 
(machine vs. human readable) of {{EXPLAIN FORMATTED}} statement.
{code:sql}
CREATE TABLE person (id INT, fname STRING);
EXPLAIN FORMATTED SELECT fname FROM person WHERE id > 100
{code}
When its false (default value) we keep the current behavior and we return the 
JSON without any indentation.
{code:json}
{"CBOPlan":{"rels":[{"id":"0","relOp":"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan","table":["default","person"],"table:alias":"person","inputs":[],"rowCount":1,"avgRowSize":104,"rowType":{"fields":[{"type":"INTEGER","nullable":true,"name":"id"},{"type":"VARCHAR","nullable":true,"precision":2147483647,"name":"fname"},{"type":"BIGINT","nullable":true,"name":"BLOCK__OFFSET__INSIDE__FILE"},{"type":"VARCHAR","nullable":true,"precision":2147483647,"name":"INPUT__FILE__NAME"},{"fields":[{"type":"BIGINT","nullable":true,"name":"writeid"},{"type":"INTEGER","nullable":true,"name":"bucketid"},{"type":"BIGINT","nullable":true,"name":"rowid"}],"nullable":true,"name":"ROW__ID"},{"type":"BOOLEAN","nullable":true,"name":"ROW__IS__DELETED"}],"nullable":false},"colStats":[{"name":"id","ndv":1,"minValue":-2147483648,"maxValue":2147483647},{"name":"fname","ndv":1}]},{"id":"1","relOp":"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveFilter","condition":{"op":{"name":">","kind":"GREATER_THAN","syntax":"BINARY"},"operands":[{"input":0,"name":"$0"},{"literal":100,"type":{"type":"INTEGER","nullable":false}}]},"rowCount":1},{"id":"2","relOp":"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject","fields":["fname"],"exprs":[{"input":1,"name":"$1"}],"rowCount":1}]},"optimizedSQL":"SELECT
 `fname`\nFROM `default`.`person`\nWHERE `id` > 100","cboInfo":"Plan optimized 
by CBO.","STAGE DEPENDENCIES":{"Stage-0":{"ROOT STAGE":"TRUE"}},"STAGE 
PLANS":{"Stage-0":{"Fetch Operator":{"limit:":"-1","Processor 
Tree:":{"TableScan":{"alias:":"person","columns:":["id","fname"],"database:":"default","filterExpr:":"(id
 > 100) (type: 
boolean)","table:":"person","isTempTable:":"false","OperatorId:":"TS_0","children":{"Filter
 Operator":{"predicate:":"(id > 100) (type: 
boolean)","OperatorId:":"FIL_4","children":{"Select 
Operator":{"expressions:":"fname (type: 
string)","columnExprMap:":{"_col0":"fname"},"outputColumnNames:":["_col0"],"OperatorId:":"SEL_2","children":{"ListSink":{"OperatorId:":"LIST_SINK_5"}}}}}}}}}}}}
{code}
When its true (true for all qtests) we return a human-readable JSON output, 
split in multiple lines with proper indentation.
{code:json}
{
  "CBOPlan": {
    "rels": [
      {
        "id": "0",
        "relOp": 
"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan",
        "table": [
          "default",
          "person"
        ],
        "table:alias": "person",
        "inputs": [],
        "rowCount": 1,
        "avgRowSize": 104,
        "rowType": {
          "fields": [
            {
              "type": "INTEGER",
              "nullable": true,
              "name": "id"
            },
            {
              "type": "VARCHAR",
              "nullable": true,
              "precision": 2147483647,
              "name": "fname"
            },
            {
              "type": "BIGINT",
              "nullable": true,
              "name": "BLOCK__OFFSET__INSIDE__FILE"
            },
            {
              "type": "VARCHAR",
              "nullable": true,
              "precision": 2147483647,
              "name": "INPUT__FILE__NAME"
            },
            {
              "fields": [
                {
                  "type": "BIGINT",
                  "nullable": true,
                  "name": "writeid"
                },
                {
                  "type": "INTEGER",
                  "nullable": true,
                  "name": "bucketid"
                },
                {
                  "type": "BIGINT",
                  "nullable": true,
                  "name": "rowid"
                }
              ],
              "nullable": true,
              "name": "ROW__ID"
            },
            {
              "type": "BOOLEAN",
              "nullable": true,
              "name": "ROW__IS__DELETED"
            }
          ],
          "nullable": false
        },
        "colStats": [
          {
            "name": "id",
            "ndv": 1,
            "minValue": -2147483648,
            "maxValue": 2147483647
          },
          {
            "name": "fname",
            "ndv": 1
          }
        ]
      },
      {
        "id": "1",
        "relOp": 
"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveFilter",
        "condition": {
          "op": {
            "name": ">",
            "kind": "GREATER_THAN",
            "syntax": "BINARY"
          },
          "operands": [
            {
              "input": 0,
              "name": "$0"
            },
            {
              "literal": 100,
              "type": {
                "type": "INTEGER",
                "nullable": false
              }
            }
          ]
        },
        "rowCount": 1
      },
      {
        "id": "2",
        "relOp": 
"org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject",
        "fields": [
          "fname"
        ],
        "exprs": [
          {
            "input": 1,
            "name": "$1"
          }
        ],
        "rowCount": 1
      }
    ]
  },
  "optimizedSQL": "SELECT `fname`\nFROM `default`.`person`\nWHERE `id` > 100",
  "cboInfo": "Plan optimized by CBO.",
  "STAGE DEPENDENCIES": {
    "Stage-0": {
      "ROOT STAGE": "TRUE"
    }
  },
  "STAGE PLANS": {
    "Stage-0": {
      "Fetch Operator": {
        "limit:": "-1",
        "Processor Tree:": {
          "TableScan": {
            "alias:": "person",
            "columns:": [
              "id",
              "fname"
            ],
            "database:": "default",
            "filterExpr:": "(id > 100) (type: boolean)",
            "table:": "person",
            "isTempTable:": "false",
            "OperatorId:": "TS_0",
            "children": {
              "Filter Operator": {
                "predicate:": "(id > 100) (type: boolean)",
                "OperatorId:": "FIL_4",
                "children": {
                  "Select Operator": {
                    "expressions:": "fname (type: string)",
                    "columnExprMap:": {
                      "_col0": "fname"
                    },
                    "outputColumnNames:": [
                      "_col0"
                    ],
                    "OperatorId:": "SEL_2",
                    "children": {
                      "ListSink": {
                        "OperatorId:": "LIST_SINK_5"
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}
{code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to