Repository: hive Updated Branches: refs/heads/master bd8269609 -> 86755f6da
HIVE-17253: Adding SUMMARY statement to HPL/SQL (Dmitry Tolpeko, reviewed by Alan Gates) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/86755f6d Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/86755f6d Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/86755f6d Branch: refs/heads/master Commit: 86755f6da9aaf757609866c170fadbc3a0767140 Parents: bd82696 Author: Dmitry Tolpeko <[email protected]> Authored: Sun Sep 24 19:54:59 2017 +0000 Committer: Dmitry Tolpeko <[email protected]> Committed: Sun Sep 24 19:56:54 2017 +0000 ---------------------------------------------------------------------- .../antlr4/org/apache/hive/hplsql/Hplsql.g4 | 7 + .../main/java/org/apache/hive/hplsql/Exec.java | 8 + .../main/java/org/apache/hive/hplsql/Meta.java | 71 ++++- .../main/java/org/apache/hive/hplsql/Row.java | 7 + .../main/java/org/apache/hive/hplsql/Stmt.java | 281 +++++++++++++++++++ hplsql/src/main/resources/hplsql-site.xml | 2 - hplsql/src/test/queries/db/summary.sql | 15 + hplsql/src/test/results/db/summary.out.txt | 47 ++++ 8 files changed, 435 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/86755f6d/hplsql/src/main/antlr4/org/apache/hive/hplsql/Hplsql.g4 ---------------------------------------------------------------------- diff --git a/hplsql/src/main/antlr4/org/apache/hive/hplsql/Hplsql.g4 b/hplsql/src/main/antlr4/org/apache/hive/hplsql/Hplsql.g4 index 8a351bd..eae45c5 100644 --- a/hplsql/src/main/antlr4/org/apache/hive/hplsql/Hplsql.g4 +++ b/hplsql/src/main/antlr4/org/apache/hive/hplsql/Hplsql.g4 @@ -91,6 +91,7 @@ stmt : | rollback_stmt | select_stmt | signal_stmt + | summary_stmt | update_stmt | use_stmt | truncate_stmt @@ -733,6 +734,10 @@ set_teradata_session_option : signal_stmt : // SIGNAL statement T_SIGNAL ident ; + +summary_stmt : // SUMMARY statement + T_SUMMARY (T_TOP expr)? T_FOR (select_stmt | table_name where_clause? (T_LIMIT expr)?) + ; truncate_stmt : T_TRUNCATE T_TABLE? table_name @@ -1506,6 +1511,7 @@ non_reserved_words : // Tokens that are not reserved words | T_SUBDIR | T_SUBSTRING | T_SUM + | T_SUMMARY | T_SYSDATE | T_SYS_REFCURSOR | T_TABLE @@ -1825,6 +1831,7 @@ T_STRING : S T R I N G ; T_SUBDIR : S U B D I R ; T_SUBSTRING : S U B S T R I N G ; T_SUM : S U M ; +T_SUMMARY : S U M M A R Y ; T_SYS_REFCURSOR : S Y S '_' R E F C U R S O R ; T_TABLE : T A B L E ; T_TABLESPACE : T A B L E S P A C E ; http://git-wip-us.apache.org/repos/asf/hive/blob/86755f6d/hplsql/src/main/java/org/apache/hive/hplsql/Exec.java ---------------------------------------------------------------------- diff --git a/hplsql/src/main/java/org/apache/hive/hplsql/Exec.java b/hplsql/src/main/java/org/apache/hive/hplsql/Exec.java index 2ad3ea3..fdc75d5 100644 --- a/hplsql/src/main/java/org/apache/hive/hplsql/Exec.java +++ b/hplsql/src/main/java/org/apache/hive/hplsql/Exec.java @@ -1846,6 +1846,14 @@ public class Exec extends HplsqlBaseVisitor<Integer> { @Override public Integer visitSignal_stmt(HplsqlParser.Signal_stmtContext ctx) { return exec.stmt.signal(ctx); + } + + /** + * SUMMARY statement + */ + @Override + public Integer visitSummary_stmt(HplsqlParser.Summary_stmtContext ctx) { + return exec.stmt.summary(ctx); } /** http://git-wip-us.apache.org/repos/asf/hive/blob/86755f6d/hplsql/src/main/java/org/apache/hive/hplsql/Meta.java ---------------------------------------------------------------------- diff --git a/hplsql/src/main/java/org/apache/hive/hplsql/Meta.java b/hplsql/src/main/java/org/apache/hive/hplsql/Meta.java index 46bd55a..e9e02a2 100644 --- a/hplsql/src/main/java/org/apache/hive/hplsql/Meta.java +++ b/hplsql/src/main/java/org/apache/hive/hplsql/Meta.java @@ -88,6 +88,66 @@ public class Meta { } /** + * Get data types for all columns of the SELECT statement + */ + Row getRowDataTypeForSelect(ParserRuleContext ctx, String conn, String select) { + Row row = null; + Conn.Type connType = exec.getConnectionType(conn); + // Hive does not support ResultSetMetaData on PreparedStatement, and Hive DESCRIBE + // does not support queries, so we have to execute the query with LIMIT 1 + if (connType == Conn.Type.HIVE) { + String sql = "SELECT * FROM (" + select + ") t LIMIT 1"; + Query query = new Query(sql); + exec.executeQuery(ctx, query, conn); + if (!query.error()) { + ResultSet rs = query.getResultSet(); + try { + ResultSetMetaData rm = rs.getMetaData(); + int cols = rm.getColumnCount(); + row = new Row(); + for (int i = 1; i <= cols; i++) { + String name = rm.getColumnName(i); + if (name.startsWith("t.")) { + name = name.substring(2); + } + row.addColumn(name, rm.getColumnTypeName(i)); + } + } + catch (Exception e) { + exec.signal(e); + } + } + else { + exec.signal(query.getException()); + } + exec.closeQuery(query, conn); + } + else { + Query query = exec.prepareQuery(ctx, select, conn); + if (!query.error()) { + try { + PreparedStatement stmt = query.getPreparedStatement(); + ResultSetMetaData rm = stmt.getMetaData(); + int cols = rm.getColumnCount(); + for (int i = 1; i <= cols; i++) { + String col = rm.getColumnName(i); + String typ = rm.getColumnTypeName(i); + if (row == null) { + row = new Row(); + } + row.addColumn(col.toUpperCase(), typ); + } + } + catch (Exception e) { + exec.signal(e); + } + } + exec.closeQuery(query, conn); + } + return row; + } + + /** * Read the column data from the database and cache it */ Row readColumns(ParserRuleContext ctx, String conn, String table, HashMap<String, Row> map) { @@ -106,11 +166,20 @@ public class Meta { if (row == null) { row = new Row(); } + // Hive DESCRIBE outputs "empty_string NULL" row before partition information + if (typ == null) { + break; + } row.addColumn(col.toUpperCase(), typ); } map.put(table, row); } - catch (Exception e) {} + catch (Exception e) { + exec.signal(e); + } + } + else { + exec.signal(query.getException()); } exec.closeQuery(query, conn); } http://git-wip-us.apache.org/repos/asf/hive/blob/86755f6d/hplsql/src/main/java/org/apache/hive/hplsql/Row.java ---------------------------------------------------------------------- diff --git a/hplsql/src/main/java/org/apache/hive/hplsql/Row.java b/hplsql/src/main/java/org/apache/hive/hplsql/Row.java index 91392c7..a9d6c22 100644 --- a/hplsql/src/main/java/org/apache/hive/hplsql/Row.java +++ b/hplsql/src/main/java/org/apache/hive/hplsql/Row.java @@ -86,6 +86,13 @@ public class Row { } /** + * Get column by index + */ + Column getColumn(int i) { + return columns.get(i); + } + + /** * Get the number of columns */ int size() { http://git-wip-us.apache.org/repos/asf/hive/blob/86755f6d/hplsql/src/main/java/org/apache/hive/hplsql/Stmt.java ---------------------------------------------------------------------- diff --git a/hplsql/src/main/java/org/apache/hive/hplsql/Stmt.java b/hplsql/src/main/java/org/apache/hive/hplsql/Stmt.java index 0fafd59..8c541b3 100644 --- a/hplsql/src/main/java/org/apache/hive/hplsql/Stmt.java +++ b/hplsql/src/main/java/org/apache/hive/hplsql/Stmt.java @@ -21,6 +21,7 @@ package org.apache.hive.hplsql; import java.sql.ResultSet; import java.sql.ResultSetMetaData; import java.sql.SQLException; +import java.util.ArrayList; import java.util.Stack; import java.util.UUID; @@ -1242,7 +1243,287 @@ public class Stmt { exec.signal(signal); return 0; } + + /** + * SUMMARY statement + */ + public Integer summary(HplsqlParser.Summary_stmtContext ctx) { + trace(ctx, "SUMMARY"); + String table = null; + String select = null; + String conn = null; + Row row = null; + if (ctx.table_name() != null) { + table = evalPop(ctx.table_name()).toString(); + conn = exec.getObjectConnection(table); + row = meta.getRowDataType(ctx, conn, table); + } + else { + select = evalPop(ctx.select_stmt()).toString(); + conn = exec.getStatementConnection(); + row = meta.getRowDataTypeForSelect(ctx, conn, select); + } + if (row == null) { + return 1; + } + Conn.Type connType = exec.getConnectionType(conn); + if (ctx.T_TOP() == null) { + return summaryStat(ctx, table, select, row, conn, connType); + } + else { + return summaryTop(ctx, table, select, row, conn, connType); + } + } + // Summary for column statistics + public Integer summaryStat(HplsqlParser.Summary_stmtContext ctx, String table, String select, + Row row, String conn, Conn.Type connType) { + StringBuilder sql = new StringBuilder("SELECT COUNT(*)"); + int maxColName = 11; + // Define summary metrics for each column + for(Column c : row.getColumns()) { + String col = c.getName(); + if (connType == Conn.Type.HIVE) { + col = '`' + col + '`'; + } + sql.append(",COUNT(" + col + "),"); + sql.append("COUNT(DISTINCT " + col + "),"); + sql.append("AVG(" + col + "),"); + sql.append("MIN(" + col + "),"); + sql.append("MAX(" + col + "),"); + sql.append("STDDEV_SAMP(" + col + "),"); + sql.append("PERCENTILE_APPROX(CAST(" + col + " AS DOUBLE),0.05),"); + sql.append("PERCENTILE_APPROX(CAST(" + col + " AS DOUBLE),0.25),"); + sql.append("PERCENTILE_APPROX(CAST(" + col + " AS DOUBLE),0.5),"); + sql.append("PERCENTILE_APPROX(CAST(" + col + " AS DOUBLE),0.75),"); + sql.append("PERCENTILE_APPROX(CAST(" + col + " AS DOUBLE),0.95)"); + if (col.length() > maxColName) { + maxColName = col.length(); + } + } + if (table != null) { + sql.append(" FROM (SELECT * FROM " + table); + if (ctx.where_clause() != null) { + sql.append(" " + evalPop(ctx.where_clause()).toString()); + } + if (ctx.T_LIMIT() != null) { + sql.append(" LIMIT "); + int limExp = 0; + if (ctx.T_TOP() != null) { + limExp = 1; + } + sql.append(evalPop(ctx.expr(limExp)).toString()); + } + sql.append(") t"); + } + else { + sql.append(" FROM (" + select + ") t"); + } + Query query = exec.executeQuery(ctx, sql.toString(), conn); + if (query.error()) { + exec.signal(query); + return 1; + } + exec.setSqlSuccess(); + try { + ResultSet rs = query.getResultSet(); + if (rs != null) { + System.out.print("\n"); + // The summary query returns only one row + if (rs.next()) { + int i = 0, cc = 11; + String cntRows = rs.getString(1); + // Pad output + String fmt = String.format("%%-%ds\t%%-11s\t%%-11s\t%%-11s\t%%-11s\t%%-11s\t%%-11s\t%%-11s\t%%-11s" + + "\t%%-11s\t%%-11s\t%%-11s\t%%-11s\t%%-11s\n", maxColName + 1); + System.out.print(String.format(fmt, "Column", "Type", "Rows", "NonNull", "Unique", "Avg", + "Min", "Max", "StdDev", "p05", "p25", "p50", "p75", "p95")); + for(Column c : row.getColumns()) { + String avg = String.format("%.2f", rs.getDouble(4 + i*cc)); + if (rs.wasNull()) + avg = "null"; + String stddev = String.format("%.2f", rs.getDouble(7 + i*cc)); + if (rs.wasNull()) + stddev = "null"; + String p05 = String.format("%.2f", rs.getDouble(8 + i*cc)); + if (rs.wasNull()) + p05 = "null"; + String p25 = String.format("%.2f", rs.getDouble(9 + i*cc)); + if (rs.wasNull()) + p25 = "null"; + String p50 = String.format("%.2f", rs.getDouble(10 + i*cc)); + if (rs.wasNull()) + p50 = "null"; + String p75 = String.format("%.2f", rs.getDouble(11 + i*cc)); + if (rs.wasNull()) + p75 = "null"; + String p95 = String.format("%.2f", rs.getDouble(12 + i*cc)); + if (rs.wasNull()) + p95 = "null"; + System.out.print(String.format(fmt, c.getName(), c.getType(), cntRows, rs.getString(2 + i*cc), + rs.getString(3 + i*cc), avg, rs.getString(5 + i*cc), rs.getString(6 + i*cc), + stddev, p05, p25, p50, p75, p95)); + i++; + } + } + } + } + catch (SQLException e) { + exec.signal(e); + exec.closeQuery(query, conn); + return 1; + } + exec.closeQuery(query, conn); + return 0; + } + + // Summary for top column values + public Integer summaryTop(HplsqlParser.Summary_stmtContext ctx, String table, String select, + Row row, String conn, Conn.Type connType) { + StringBuilder sql = new StringBuilder("SELECT id, col, cnt FROM (" + + "SELECT id, col, cnt, ROW_NUMBER() OVER (PARTITION BY id ORDER BY cnt DESC) rn " + + "FROM (SELECT CAST(GROUPING__ID AS DECIMAL) id, COALESCE("); // CAST AS INT does not work as expected (ID is still considered as STRING in ORDER BY for some reason) + int topNum = evalPop(ctx.expr(0)).intValue(); + StringBuilder colsList = new StringBuilder(); + StringBuilder colsGrList = new StringBuilder(); + int i = 0; + for(Column c : row.getColumns()) { + String col = c.getName(); + if (connType == Conn.Type.HIVE) { + col = '`' + col + '`'; + } + if (i != 0) { + colsList.append(","); + colsGrList.append(","); + } + colsList.append(col); + colsGrList.append("(" + col + ")"); + i++; + } + sql.append(colsList); + sql.append(") col, COUNT(*) cnt"); + if (table != null) { + sql.append(" FROM (SELECT * FROM " + table); + if (ctx.where_clause() != null) { + sql.append(" " + evalPop(ctx.where_clause()).toString()); + } + if (ctx.T_LIMIT() != null) { + sql.append(" LIMIT " + evalPop(ctx.expr(1)).toString()); + } + sql.append(") t"); + } + else { + sql.append(" FROM (" + select + ") t"); + } + sql.append(" GROUP BY "); + sql.append(colsList); + sql.append(" GROUPING SETS ("); + sql.append(colsGrList); + sql.append(")) t) t WHERE rn <= " + topNum + " ORDER BY id, cnt DESC"); + // Add LIMIT as Order by-s without limit can disabled for safety reasons + sql.append(" LIMIT " + topNum * row.size()); + Query query = exec.executeQuery(ctx, sql.toString(), conn); + if (query.error()) { + exec.signal(query); + return 1; + } + exec.setSqlSuccess(); + try { + ResultSet rs = query.getResultSet(); + if (rs != null) { + int prevId = -1; + int grRow = 0; + int colNum = 0; + int maxLen = row.getColumn(colNum).getName().length(); + ArrayList<String> outCols = new ArrayList<String>(); + ArrayList<Integer> outCnts = new ArrayList<Integer>(); + ArrayList<Integer> outLens = new ArrayList<Integer>(); + while (rs.next()) { + int id = rs.getInt(1); + String value = rs.getString(2); + int cnt = rs.getInt(3); + if (prevId == -1) { + prevId = id; + } + // Still the same column + if (id == prevId) { + outCols.add(value); + outCnts.add(cnt); + if (value != null && value.length() > maxLen) { + maxLen = value.length() < 300 ? value.length() : 300; + } + grRow++; + } + // First value for next column + else { + // Pad with empty rows if the number of values in group is less than TOP num + for (int j = grRow; j < topNum; j++) { + outCols.add(""); + outCnts.add(0); + grRow++; + } + outCols.add(value); + outCnts.add(cnt); + outLens.add(maxLen); + colNum++; + maxLen = row.getColumn(colNum).getName().length(); + if (value != null && value.length() > maxLen) { + maxLen = value.length() < 300 ? value.length() : 300; + } + grRow = 1; + prevId = id; + } + } + for (int j = grRow; j < topNum; j++) { + outCols.add(""); + outCnts.add(0); + grRow++; + } + if (maxLen != 0) { + outLens.add(maxLen); + } + System.out.print("\n"); + // Output header + i = 0; + for(Column c : row.getColumns()) { + if (i != 0) { + System.out.print("\t"); + } + String fmt = String.format("%%-%ds", outLens.get(i) + 11 + 3); + System.out.print(String.format(fmt, c.getName())); + i++; + } + System.out.print("\n"); + // Output top values + for (int j = 0; j < topNum; j++) { + for(int k = 0; k < row.size(); k++) { + if (k != 0) { + System.out.print("\t"); + } + int cnt = outCnts.get(j + k * topNum); + if (cnt != 0) { // skip padded values + String fmt = String.format("%%-%ds", outLens.get(k)); + System.out.print(String.format(fmt, outCols.get(j + k * topNum))); + System.out.print(String.format(" %-11d", cnt)); + } + else { + String fmt = String.format("%%-%ds", outLens.get(k) + 11 + 3); + System.out.print(String.format(fmt, "")); + } + } + System.out.print("\n"); + } + } + } + catch (SQLException e) { + exec.signal(e); + exec.closeQuery(query, conn); + return 1; + } + exec.closeQuery(query, conn); + return 0; + } + /** * RESIGNAL statement */ http://git-wip-us.apache.org/repos/asf/hive/blob/86755f6d/hplsql/src/main/resources/hplsql-site.xml ---------------------------------------------------------------------- diff --git a/hplsql/src/main/resources/hplsql-site.xml b/hplsql/src/main/resources/hplsql-site.xml index 96843dc..aca54e2 100644 --- a/hplsql/src/main/resources/hplsql-site.xml +++ b/hplsql/src/main/resources/hplsql-site.xml @@ -35,8 +35,6 @@ <property> <name>hplsql.conn.init.hive2conn</name> <value> - set hive.execution.engine=mr; - use default; </value> <description>Statements for execute after connection to the database</description> </property> http://git-wip-us.apache.org/repos/asf/hive/blob/86755f6d/hplsql/src/test/queries/db/summary.sql ---------------------------------------------------------------------- diff --git a/hplsql/src/test/queries/db/summary.sql b/hplsql/src/test/queries/db/summary.sql new file mode 100644 index 0000000..20b04bd --- /dev/null +++ b/hplsql/src/test/queries/db/summary.sql @@ -0,0 +1,15 @@ +summary for src; + +summary for src where value like 'val%'; + +summary for src limit 100; + +summary for sample_07; + +summary for select code, total_emp, salary from sample_07; + +summary top 4 for src; + +summary top 3 for sample_07; + +summary top 3 for select * from sample_07; http://git-wip-us.apache.org/repos/asf/hive/blob/86755f6d/hplsql/src/test/results/db/summary.out.txt ---------------------------------------------------------------------- diff --git a/hplsql/src/test/results/db/summary.out.txt b/hplsql/src/test/results/db/summary.out.txt new file mode 100644 index 0000000..5967da4 --- /dev/null +++ b/hplsql/src/test/results/db/summary.out.txt @@ -0,0 +1,47 @@ +Ln:1 SUMMARY + +Column Type Rows NonNull Unique Avg Min Max StdDev p05 p25 p50 p75 p95 +KEY string 500 500 309 260.18 0 98 143.07 26.00 146.00 255.50 395.00 479.00 +VALUE string 500 500 309 null val_0 val_98 null null null null null null +Ln:3 SUMMARY + +Column Type Rows NonNull Unique Avg Min Max StdDev p05 p25 p50 p75 p95 +KEY string 500 500 309 260.18 0 98 143.07 26.00 145.50 255.50 394.50 479.00 +VALUE string 500 500 309 null val_0 val_98 null null null null null null +Ln:5 SUMMARY + +Column Type Rows NonNull Unique Avg Min Max StdDev p05 p25 p50 p75 p95 +KEY string 100 100 94 272.11 0 98 135.01 27.00 165.00 266.00 396.00 482.00 +VALUE string 100 100 94 null val_0 val_98 null null null null null null +Ln:7 SUMMARY + +Column Type Rows NonNull Unique Avg Min Max StdDev p05 p25 p50 p75 p95 +CODE string 823 823 823 null 00-0000 53-7199 null null null null null null +DESCRIPTION string 823 823 823 null Accountants and auditors Zoologists and wildlife biologists null null null null null null +TOTAL_EMP int 823 823 806 489748.24 340 134354250 4858790.94 4054.50 17270.00 49335.00 162662.50 1238941.00 +SALARY int 823 819 759 47963.63 16700 192780 25706.09 21860.00 30547.50 40700.00 58747.50 92025.50 +Ln:9 SUMMARY + +Column Type Rows NonNull Unique Avg Min Max StdDev p05 p25 p50 p75 p95 +code string 823 823 823 null 00-0000 53-7199 null null null null null null +total_emp int 823 823 806 489748.24 340 134354250 4858790.94 4054.50 17270.00 49335.00 162662.50 1238941.00 +salary int 823 819 759 47963.63 16700 192780 25706.09 21860.00 30547.50 40700.00 58747.50 92025.50 +Ln:11 SUMMARY + +KEY VALUE +348 5 val_230 5 +469 5 val_348 5 +401 5 val_401 5 +230 5 val_469 5 +Ln:13 SUMMARY + +CODE DESCRIPTION TOTAL_EMP SALARY +53-7199 1 Aircraft mechanics and service technicians 1 25500 2 null 4 +00-0000 1 Aircraft cargo handling supervisors 1 9910 2 34220 3 +11-0000 1 Agricultural workers, all other 1 112300 2 35470 3 +Ln:15 SUMMARY + +code description total_emp salary +53-7199 1 Aircraft mechanics and service technicians 1 25500 2 null 4 +00-0000 1 Aircraft cargo handling supervisors 1 9910 2 34220 3 +11-0000 1 Agricultural workers, all other 1 112300 2 35470 3 \ No newline at end of file
