This is an automated email from the ASF dual-hosted git repository.
dengzh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 72fd26d207a HIVE-27994: Optimize renaming the partitioned table (#4995) (Zhihua Deng, reviewed by Butao Zhang, Sai Hemanth Gantasala)
72fd26d207a is described below
commit 72fd26d207a2943f0535fa96330bedc244fbe10a
Author: dengzh <[email protected]>
AuthorDate: Sat Jan 20 10:02:24 2024 +0800
HIVE-27994: Optimize renaming the partitioned table (#4995) (Zhihua Deng, reviewed by Butao Zhang, Sai Hemanth Gantasala)
---
ql/src/test/queries/clientpositive/rename_table.q | 40 +++
.../results/clientpositive/llap/rename_table.q.out | 378 +++++++++++++++++++++
.../hadoop/hive/metastore/DirectSqlUpdatePart.java | 75 ++--
.../hadoop/hive/metastore/HiveAlterHandler.java | 76 +++--
.../hadoop/hive/metastore/StatObjectConverter.java | 9 +-
5 files changed, 519 insertions(+), 59 deletions(-)
diff --git a/ql/src/test/queries/clientpositive/rename_table.q b/ql/src/test/queries/clientpositive/rename_table.q
new file mode 100644
index 00000000000..1e4f2fcdd55
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/rename_table.q
@@ -0,0 +1,40 @@
+--! qt:dataset:src
+--! qt:dataset:part
+-- This test verifies that the column statistics can still be fetched after the table is renamed
+set hive.stats.kll.enable=true;
+set metastore.stats.fetch.bitvector=true;
+set metastore.stats.fetch.kll=true;
+set hive.stats.autogather=true;
+set hive.stats.column.autogather=true;
+
+CREATE TABLE rename_partition_table0 (key STRING, value STRING) PARTITIONED BY (part STRING)
+STORED AS ORC;
+
+INSERT OVERWRITE TABLE rename_partition_table0 PARTITION (part = '1') SELECT * FROM src where rand(1) < 0.5;
+ALTER TABLE rename_partition_table0 ADD COLUMNS (new_col INT);
+INSERT OVERWRITE TABLE rename_partition_table0 PARTITION (part = '2') SELECT src.*, 1 FROM src;
+
+ALTER TABLE rename_partition_table0 RENAME TO rename_partition_table1;
+DESCRIBE FORMATTED rename_partition_table1;
+DESCRIBE FORMATTED rename_partition_table1 PARTITION (part='1') key;
+DESCRIBE FORMATTED rename_partition_table1 PARTITION (part='1') value;
+DESCRIBE FORMATTED rename_partition_table1 PARTITION (part='2') key;
+DESCRIBE FORMATTED rename_partition_table1 PARTITION (part='2') value;
+DESCRIBE FORMATTED rename_partition_table1 PARTITION (part='2') new_col;
+
+CREATE EXTERNAL TABLE rename_partition_table_ext0 (key STRING, value STRING) PARTITIONED BY (part STRING)
+STORED AS ORC;
+
+INSERT OVERWRITE TABLE rename_partition_table_ext0 PARTITION (part = '1') SELECT * FROM src where rand(1) < 0.5;
+ALTER TABLE rename_partition_table_ext0 CHANGE COLUMN value val STRING CASCADE;
+INSERT OVERWRITE TABLE rename_partition_table_ext0 PARTITION (part = '2') SELECT * FROM src;
+
+ALTER TABLE rename_partition_table_ext0 RENAME TO rename_partition_table_ext1;
+DESCRIBE FORMATTED rename_partition_table_ext1;
+DESCRIBE FORMATTED rename_partition_table_ext1 PARTITION (part='1') key;
+DESCRIBE FORMATTED rename_partition_table_ext1 PARTITION (part='1') val;
+DESCRIBE FORMATTED rename_partition_table_ext1 PARTITION (part='2') key;
+DESCRIBE FORMATTED rename_partition_table_ext1 PARTITION (part='2') val;
+
+DROP TABLE rename_partition_table1;
+DROP TABLE rename_partition_table_ext1;
diff --git a/ql/src/test/results/clientpositive/llap/rename_table.q.out b/ql/src/test/results/clientpositive/llap/rename_table.q.out
new file mode 100644
index 00000000000..014d629ea86
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/rename_table.q.out
@@ -0,0 +1,378 @@
+PREHOOK: query: CREATE TABLE rename_partition_table0 (key STRING, value STRING) PARTITIONED BY (part STRING)
+STORED AS ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@rename_partition_table0
+POSTHOOK: query: CREATE TABLE rename_partition_table0 (key STRING, value STRING) PARTITIONED BY (part STRING)
+STORED AS ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@rename_partition_table0
+PREHOOK: query: INSERT OVERWRITE TABLE rename_partition_table0 PARTITION (part = '1') SELECT * FROM src where rand(1) < 0.5
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@rename_partition_table0@part=1
+POSTHOOK: query: INSERT OVERWRITE TABLE rename_partition_table0 PARTITION (part = '1') SELECT * FROM src where rand(1) < 0.5
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@rename_partition_table0@part=1
+POSTHOOK: Lineage: rename_partition_table0 PARTITION(part=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: rename_partition_table0 PARTITION(part=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: ALTER TABLE rename_partition_table0 ADD COLUMNS (new_col INT)
+PREHOOK: type: ALTERTABLE_ADDCOLS
+PREHOOK: Input: default@rename_partition_table0
+PREHOOK: Output: default@rename_partition_table0
+POSTHOOK: query: ALTER TABLE rename_partition_table0 ADD COLUMNS (new_col INT)
+POSTHOOK: type: ALTERTABLE_ADDCOLS
+POSTHOOK: Input: default@rename_partition_table0
+POSTHOOK: Output: default@rename_partition_table0
+PREHOOK: query: INSERT OVERWRITE TABLE rename_partition_table0 PARTITION (part = '2') SELECT src.*, 1 FROM src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@rename_partition_table0@part=2
+POSTHOOK: query: INSERT OVERWRITE TABLE rename_partition_table0 PARTITION (part = '2') SELECT src.*, 1 FROM src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@rename_partition_table0@part=2
+POSTHOOK: Lineage: rename_partition_table0 PARTITION(part=2).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: rename_partition_table0 PARTITION(part=2).new_col SIMPLE []
+POSTHOOK: Lineage: rename_partition_table0 PARTITION(part=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: ALTER TABLE rename_partition_table0 RENAME TO rename_partition_table1
+PREHOOK: type: ALTERTABLE_RENAME
+PREHOOK: Input: default@rename_partition_table0
+PREHOOK: Output: database:default
+PREHOOK: Output: default@rename_partition_table0
+PREHOOK: Output: default@rename_partition_table1
+POSTHOOK: query: ALTER TABLE rename_partition_table0 RENAME TO rename_partition_table1
+POSTHOOK: type: ALTERTABLE_RENAME
+POSTHOOK: Input: default@rename_partition_table0
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@rename_partition_table0
+POSTHOOK: Output: default@rename_partition_table1
+PREHOOK: query: DESCRIBE FORMATTED rename_partition_table1
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@rename_partition_table1
+POSTHOOK: query: DESCRIBE FORMATTED rename_partition_table1
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@rename_partition_table1
+# col_name data_type comment
+key string
+value string
+new_col int
+
+# Partition Information
+# col_name data_type comment
+part string
+
+# Detailed Table Information
+Database: default
+#### A masked pattern was here ####
+Retention: 0
+#### A masked pattern was here ####
+Table Type: MANAGED_TABLE
+Table Parameters:
+ COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
+ bucketing_version 2
+#### A masked pattern was here ####
+ numFiles 2
+ numPartitions 2
+ numRows 746
+ rawDataSize 133296
+ totalSize 4760
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+Compressed: No
+Num Buckets: -1
+Bucket Columns: []
+Sort Columns: []
+Storage Desc Params:
+ serialization.format 1
+PREHOOK: query: DESCRIBE FORMATTED rename_partition_table1 PARTITION (part='1') key
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@rename_partition_table1
+POSTHOOK: query: DESCRIBE FORMATTED rename_partition_table1 PARTITION (part='1') key
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@rename_partition_table1
+col_name key
+data_type string
+min
+max
+num_nulls 0
+distinct_count 198
+avg_col_len 2.8089430894308944
+max_col_len 3
+num_trues
+num_falses
+bit_vector HL
+histogram
+comment from deserializer
+PREHOOK: query: DESCRIBE FORMATTED rename_partition_table1 PARTITION (part='1') value
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@rename_partition_table1
+POSTHOOK: query: DESCRIBE FORMATTED rename_partition_table1 PARTITION (part='1') value
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@rename_partition_table1
+col_name value
+data_type string
+min
+max
+num_nulls 0
+distinct_count 191
+avg_col_len 6.808943089430894
+max_col_len 7
+num_trues
+num_falses
+bit_vector HL
+histogram
+comment from deserializer
+PREHOOK: query: DESCRIBE FORMATTED rename_partition_table1 PARTITION (part='2') key
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@rename_partition_table1
+POSTHOOK: query: DESCRIBE FORMATTED rename_partition_table1 PARTITION (part='2') key
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@rename_partition_table1
+col_name key
+data_type string
+min
+max
+num_nulls 0
+distinct_count 316
+avg_col_len 2.812
+max_col_len 3
+num_trues
+num_falses
+bit_vector HL
+histogram
+comment from deserializer
+PREHOOK: query: DESCRIBE FORMATTED rename_partition_table1 PARTITION (part='2') value
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@rename_partition_table1
+POSTHOOK: query: DESCRIBE FORMATTED rename_partition_table1 PARTITION (part='2') value
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@rename_partition_table1
+col_name value
+data_type string
+min
+max
+num_nulls 0
+distinct_count 307
+avg_col_len 6.812
+max_col_len 7
+num_trues
+num_falses
+bit_vector HL
+histogram
+comment from deserializer
+PREHOOK: query: DESCRIBE FORMATTED rename_partition_table1 PARTITION (part='2') new_col
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@rename_partition_table1
+POSTHOOK: query: DESCRIBE FORMATTED rename_partition_table1 PARTITION (part='2') new_col
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@rename_partition_table1
+col_name new_col
+data_type int
+min 1
+max 1
+num_nulls 0
+distinct_count 1
+avg_col_len
+max_col_len
+num_trues
+num_falses
+bit_vector HL
+histogram Q1: 1, Q2: 1, Q3: 1
+comment from deserializer
+PREHOOK: query: CREATE EXTERNAL TABLE rename_partition_table_ext0 (key STRING, value STRING) PARTITIONED BY (part STRING)
+STORED AS ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@rename_partition_table_ext0
+POSTHOOK: query: CREATE EXTERNAL TABLE rename_partition_table_ext0 (key STRING, value STRING) PARTITIONED BY (part STRING)
+STORED AS ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@rename_partition_table_ext0
+PREHOOK: query: INSERT OVERWRITE TABLE rename_partition_table_ext0 PARTITION (part = '1') SELECT * FROM src where rand(1) < 0.5
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@rename_partition_table_ext0@part=1
+POSTHOOK: query: INSERT OVERWRITE TABLE rename_partition_table_ext0 PARTITION (part = '1') SELECT * FROM src where rand(1) < 0.5
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@rename_partition_table_ext0@part=1
+POSTHOOK: Lineage: rename_partition_table_ext0 PARTITION(part=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: rename_partition_table_ext0 PARTITION(part=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: ALTER TABLE rename_partition_table_ext0 CHANGE COLUMN value val STRING CASCADE
+PREHOOK: type: ALTERTABLE_RENAMECOL
+PREHOOK: Input: default@rename_partition_table_ext0
+PREHOOK: Output: default@rename_partition_table_ext0
+PREHOOK: Output: default@rename_partition_table_ext0@part=1
+POSTHOOK: query: ALTER TABLE rename_partition_table_ext0 CHANGE COLUMN value val STRING CASCADE
+POSTHOOK: type: ALTERTABLE_RENAMECOL
+POSTHOOK: Input: default@rename_partition_table_ext0
+POSTHOOK: Output: default@rename_partition_table_ext0
+POSTHOOK: Output: default@rename_partition_table_ext0@part=1
+PREHOOK: query: INSERT OVERWRITE TABLE rename_partition_table_ext0 PARTITION (part = '2') SELECT * FROM src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@rename_partition_table_ext0@part=2
+POSTHOOK: query: INSERT OVERWRITE TABLE rename_partition_table_ext0 PARTITION (part = '2') SELECT * FROM src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@rename_partition_table_ext0@part=2
+POSTHOOK: Lineage: rename_partition_table_ext0 PARTITION(part=2).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: rename_partition_table_ext0 PARTITION(part=2).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: ALTER TABLE rename_partition_table_ext0 RENAME TO rename_partition_table_ext1
+PREHOOK: type: ALTERTABLE_RENAME
+PREHOOK: Input: default@rename_partition_table_ext0
+PREHOOK: Output: database:default
+PREHOOK: Output: default@rename_partition_table_ext0
+PREHOOK: Output: default@rename_partition_table_ext1
+POSTHOOK: query: ALTER TABLE rename_partition_table_ext0 RENAME TO rename_partition_table_ext1
+POSTHOOK: type: ALTERTABLE_RENAME
+POSTHOOK: Input: default@rename_partition_table_ext0
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@rename_partition_table_ext0
+POSTHOOK: Output: default@rename_partition_table_ext1
+PREHOOK: query: DESCRIBE FORMATTED rename_partition_table_ext1
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@rename_partition_table_ext1
+POSTHOOK: query: DESCRIBE FORMATTED rename_partition_table_ext1
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@rename_partition_table_ext1
+# col_name data_type comment
+key string
+val string
+
+# Partition Information
+# col_name data_type comment
+part string
+
+# Detailed Table Information
+Database: default
+#### A masked pattern was here ####
+Retention: 0
+#### A masked pattern was here ####
+Table Type: EXTERNAL_TABLE
+Table Parameters:
+ COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
+ EXTERNAL TRUE
+ bucketing_version 2
+#### A masked pattern was here ####
+ numFiles 2
+ numPartitions 2
+ numRows 746
+ rawDataSize 131296
+ totalSize 4669
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+Compressed: No
+Num Buckets: -1
+Bucket Columns: []
+Sort Columns: []
+Storage Desc Params:
+ serialization.format 1
+PREHOOK: query: DESCRIBE FORMATTED rename_partition_table_ext1 PARTITION (part='1') key
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@rename_partition_table_ext1
+POSTHOOK: query: DESCRIBE FORMATTED rename_partition_table_ext1 PARTITION (part='1') key
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@rename_partition_table_ext1
+col_name key
+data_type string
+min
+max
+num_nulls 0
+distinct_count 198
+avg_col_len 2.8089430894308944
+max_col_len 3
+num_trues
+num_falses
+bit_vector HL
+histogram
+comment from deserializer
+PREHOOK: query: DESCRIBE FORMATTED rename_partition_table_ext1 PARTITION (part='1') val
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@rename_partition_table_ext1
+POSTHOOK: query: DESCRIBE FORMATTED rename_partition_table_ext1 PARTITION (part='1') val
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@rename_partition_table_ext1
+col_name val
+data_type string
+min
+max
+num_nulls
+distinct_count
+avg_col_len
+max_col_len
+num_trues
+num_falses
+bit_vector
+histogram
+comment from deserializer
+PREHOOK: query: DESCRIBE FORMATTED rename_partition_table_ext1 PARTITION (part='2') key
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@rename_partition_table_ext1
+POSTHOOK: query: DESCRIBE FORMATTED rename_partition_table_ext1 PARTITION (part='2') key
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@rename_partition_table_ext1
+col_name key
+data_type string
+min
+max
+num_nulls 0
+distinct_count 316
+avg_col_len 2.812
+max_col_len 3
+num_trues
+num_falses
+bit_vector HL
+histogram
+comment from deserializer
+PREHOOK: query: DESCRIBE FORMATTED rename_partition_table_ext1 PARTITION (part='2') val
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@rename_partition_table_ext1
+POSTHOOK: query: DESCRIBE FORMATTED rename_partition_table_ext1 PARTITION (part='2') val
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@rename_partition_table_ext1
+col_name val
+data_type string
+min
+max
+num_nulls 0
+distinct_count 307
+avg_col_len 6.812
+max_col_len 7
+num_trues
+num_falses
+bit_vector HL
+histogram
+comment from deserializer
+PREHOOK: query: DROP TABLE rename_partition_table1
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@rename_partition_table1
+PREHOOK: Output: database:default
+PREHOOK: Output: default@rename_partition_table1
+POSTHOOK: query: DROP TABLE rename_partition_table1
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@rename_partition_table1
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@rename_partition_table1
+PREHOOK: query: DROP TABLE rename_partition_table_ext1
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@rename_partition_table_ext1
+PREHOOK: Output: database:default
+PREHOOK: Output: default@rename_partition_table_ext1
+POSTHOOK: query: DROP TABLE rename_partition_table_ext1
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@rename_partition_table_ext1
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@rename_partition_table_ext1
diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/DirectSqlUpdatePart.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/DirectSqlUpdatePart.java
index f6e41f09094..67c293ee64f 100644
--- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/DirectSqlUpdatePart.java
+++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/DirectSqlUpdatePart.java
@@ -64,6 +64,7 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.locks.ReentrantLock;
@@ -181,7 +182,7 @@ class DirectSqlUpdatePart {
e -> e.partitionId).collect(Collectors.toList()
);
- prefix.append("select \"PART_ID\", \"COLUMN_NAME\" from \"PART_COL_STATS\" WHERE ");
+ prefix.append("select \"PART_ID\", \"COLUMN_NAME\", \"ENGINE\" from \"PART_COL_STATS\" WHERE ");
TxnUtils.buildQueryWithINClause(conf, queries, prefix, suffix,
partIdList, "\"PART_ID\"", true, false);
@@ -191,7 +192,7 @@ class DirectSqlUpdatePart {
LOG.debug("Going to execute query " + query);
rs = statement.executeQuery(query);
while (rs.next()) {
- selectedParts.add(new PartColNameInfo(rs.getLong(1), rs.getString(2)));
+ selectedParts.add(new PartColNameInfo(rs.getLong(1), rs.getString(2), rs.getString(3)));
}
} finally {
close(rs, statement, null);
@@ -207,7 +208,8 @@ class DirectSqlUpdatePart {
statsDesc.setCatName(tbl.getCatName());
}
for (ColumnStatisticsObj statisticsObj : colStats.getStatsObj()) {
- PartColNameInfo temp = new PartColNameInfo(partId, statisticsObj.getColName());
+ PartColNameInfo temp = new PartColNameInfo(partId, statisticsObj.getColName(),
+ colStats.getEngine());
if (selectedParts.contains(temp)) {
updateMap.put(temp, StatObjectConverter.
convertToMPartitionColumnStatistics(null, statsDesc, statisticsObj, colStats.getEngine()));
@@ -221,25 +223,46 @@ class DirectSqlUpdatePart {
private void updatePartColStatTable(Map<PartColNameInfo, MPartitionColumnStatistics> updateMap,
Connection dbConn) throws SQLException, MetaException, NoSuchObjectException {
- PreparedStatement pst = null;
- for (Map.Entry entry : updateMap.entrySet()) {
- PartColNameInfo partColNameInfo = (PartColNameInfo) entry.getKey();
- Long partId = partColNameInfo.partitionId;
- MPartitionColumnStatistics mPartitionColumnStatistics = (MPartitionColumnStatistics) entry.getValue();
- String update = "UPDATE \"PART_COL_STATS\" SET ";
- update += StatObjectConverter.getUpdatedColumnSql(mPartitionColumnStatistics);
- update += " WHERE \"PART_ID\" = " + partId + " AND "
- + " \"COLUMN_NAME\" = " +
quoteString(mPartitionColumnStatistics.getColName());
- try {
- pst = dbConn.prepareStatement(update);
- StatObjectConverter.initUpdatedColumnStatement(mPartitionColumnStatistics, pst);
- LOG.debug("Going to execute update " + update);
- int numUpdate = pst.executeUpdate();
- if (numUpdate != 1) {
- throw new MetaException("Invalid state of PART_COL_STATS for
PART_ID " + partId);
+ Map<String, List<Map.Entry<PartColNameInfo, MPartitionColumnStatistics>>>
updates = new HashMap<>();
+ for (Map.Entry<PartColNameInfo, MPartitionColumnStatistics> entry :
updateMap.entrySet()) {
+ MPartitionColumnStatistics mPartitionColumnStatistics = entry.getValue();
+ StringBuilder update = new StringBuilder("UPDATE \"PART_COL_STATS\" SET ")
+ .append(StatObjectConverter.getUpdatedColumnSql(mPartitionColumnStatistics))
+ .append(" WHERE \"PART_ID\" = ? AND \"COLUMN_NAME\" = ? AND \"ENGINE\" = ?");
+ updates.computeIfAbsent(update.toString(), k -> new ArrayList<>()).add(entry);
+ }
+
+ for (Map.Entry<String, List<Map.Entry<PartColNameInfo,
MPartitionColumnStatistics>>> entry : updates.entrySet()) {
+ List<Long> partIds = new ArrayList<>();
+ try (PreparedStatement pst = dbConn.prepareStatement(entry.getKey())) {
+ List<Map.Entry<PartColNameInfo, MPartitionColumnStatistics>> entries =
entry.getValue();
+ for (Map.Entry<PartColNameInfo, MPartitionColumnStatistics> partStats
: entries) {
+ PartColNameInfo partColNameInfo = partStats.getKey();
+ MPartitionColumnStatistics mPartitionColumnStatistics =
partStats.getValue();
+ int colIdx =
StatObjectConverter.initUpdatedColumnStatement(mPartitionColumnStatistics, pst);
+ pst.setLong(colIdx++, partColNameInfo.partitionId);
+ pst.setString(colIdx++, mPartitionColumnStatistics.getColName());
+ pst.setString(colIdx++, mPartitionColumnStatistics.getEngine());
+ partIds.add(partColNameInfo.partitionId);
+ pst.addBatch();
+ if (partIds.size() == maxBatchSize) {
+ LOG.debug("Going to execute updates on part: {}", partIds);
+ verifyUpdates(pst.executeBatch(), partIds);
+ partIds = new ArrayList<>();
+ }
+ }
+ if (!partIds.isEmpty()) {
+ LOG.debug("Going to execute updates on part: {}", partIds);
+ verifyUpdates(pst.executeBatch(), partIds);
}
- } finally {
- closeStmt(pst);
+ }
+ }
+ }
+
+ private void verifyUpdates(int[] numUpdates, List<Long> partIds) throws
MetaException {
+ for (int i = 0; i < numUpdates.length; i++) {
+ if (numUpdates[i] != 1) {
+ throw new MetaException("Invalid state of PART_COL_STATS for PART_ID "
+ partIds.get(i));
}
}
}
@@ -1501,9 +1524,11 @@ class DirectSqlUpdatePart {
private static final class PartColNameInfo {
long partitionId;
String colName;
- public PartColNameInfo(long partitionId, String colName) {
+ String engine;
+ public PartColNameInfo(long partitionId, String colName, String engine) {
this.partitionId = partitionId;
this.colName = colName;
+ this.engine = engine;
}
@Override
@@ -1527,10 +1552,10 @@ class DirectSqlUpdatePart {
if (this.partitionId != other.partitionId) {
return false;
}
- if (this.colName.equalsIgnoreCase(other.colName)) {
- return true;
+ if (!this.colName.equalsIgnoreCase(other.colName)) {
+ return false;
}
- return false;
+ return Objects.equals(this.engine, other.engine);
}
}
}
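
The heart of this change is updatePartColStatTable above: instead of building and executing one UPDATE per partition column, it groups the statistics rows by the shape of their UPDATE statement and pushes each group through a single PreparedStatement, flushing every maxBatchSize rows and checking the returned update counts. Below is a minimal, self-contained sketch of that JDBC batching pattern; the sql-to-rows map and the helper names are illustrative stand-ins, not Hive code.

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.List;
import java.util.Map;

class BatchedUpdateSketch {
  // One PreparedStatement per distinct SQL string; each addBatch() binds one row.
  static void runBatchedUpdates(Connection conn, Map<String, List<Object[]>> rowsBySql,
      int maxBatchSize) throws SQLException {
    for (Map.Entry<String, List<Object[]>> e : rowsBySql.entrySet()) {
      try (PreparedStatement pst = conn.prepareStatement(e.getKey())) {
        int pending = 0;
        for (Object[] row : e.getValue()) {
          for (int i = 0; i < row.length; i++) {
            pst.setObject(i + 1, row[i]); // JDBC parameter indexes are 1-based
          }
          pst.addBatch();
          if (++pending == maxBatchSize) { // flush in bounded chunks
            verify(pst.executeBatch());
            pending = 0;
          }
        }
        if (pending > 0) {
          verify(pst.executeBatch()); // flush the tail of the batch
        }
      }
    }
  }

  // Mirrors verifyUpdates above: every batched UPDATE must hit exactly one row.
  private static void verify(int[] counts) throws SQLException {
    for (int c : counts) {
      if (c != 1) {
        throw new SQLException("unexpected update count: " + c);
      }
    }
  }
}

The single round trip per batch is what removes the per-column statement overhead that made renaming a heavily partitioned table slow.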
diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java
index 1c18631e1cc..a2807961b75 100644
--- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java
+++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HiveAlterHandler.java
@@ -18,10 +18,8 @@
package org.apache.hadoop.hive.metastore;
import com.google.common.annotations.VisibleForTesting;
-import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Lists;
-import com.google.common.collect.Multimap;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hive.common.AcidMetaDataFile.DataFormat;
@@ -63,6 +61,7 @@ import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -352,7 +351,7 @@ public class HiveAlterHandler implements AlterHandler {
// also the location field in partition
parts = msdb.getPartitions(catName, dbname, name, -1);
- Multimap<Partition, ColumnStatistics> columnStatsNeedUpdated =
ArrayListMultimap.create();
+ Map<List<FieldSchema>, List<Partition>> partsByCols = new
HashMap<>();
for (Partition part : parts) {
String oldPartLoc = part.getSd().getLocation();
if (dataWasMoved && oldPartLoc.contains(oldTblLocPath)) {
@@ -363,44 +362,57 @@ public class HiveAlterHandler implements AlterHandler {
}
part.setDbName(newDbName);
part.setTableName(newTblName);
- List<ColumnStatistics> multiColStats =
updateOrGetPartitionColumnStats(msdb, catName, dbname, name,
- part.getValues(), part.getSd().getCols(), oldt, part, null,
null);
- for (ColumnStatistics colStats : multiColStats) {
- columnStatsNeedUpdated.put(part, colStats);
+ partsByCols.computeIfAbsent(part.getSd().getCols(), k -> new ArrayList<>()).add(part);
+ }
+ Map<String, Map<String, ColumnStatistics>> engineToColStats = new
HashMap<>();
+ if (rename) {
+ // If this is the table rename, get the partition column statistics first
+ for (Map.Entry<List<FieldSchema>, List<Partition>> entry : partsByCols.entrySet()) {
+ List<String> colNames = entry.getKey().stream().map(fs -> fs.getName()).collect(Collectors.toList());
+ List<String> partNames = new ArrayList<>();
+ for (Partition part : entry.getValue()) {
+ partNames.add(Warehouse.makePartName(oldt.getPartitionKeys(), part.getValues()));
+ }
+ List<List<ColumnStatistics>> colStats =
+ msdb.getPartitionColumnStatistics(catName, dbname, name, partNames, colNames);
+ for (List<ColumnStatistics> cs : colStats) {
+ if (cs != null && !cs.isEmpty()) {
+ String engine = cs.get(0).getEngine();
+ cs.stream().forEach(stats -> {
+ stats.getStatsDesc().setDbName(newDbName);
+ stats.getStatsDesc().setTableName(newTblName);
+ String partName = stats.getStatsDesc().getPartName();
+ engineToColStats.computeIfAbsent(engine, key -> new
HashMap<>()).put(partName, stats);
+ });
+ }
+ }
}
}
// Do not verify stats parameters on a partitioned table.
msdb.alterTable(catName, dbname, name, newt, null);
+ int partitionBatchSize = MetastoreConf.getIntVar(handler.getConf(),
+ MetastoreConf.ConfVars.BATCH_RETRIEVE_MAX);
+ String catalogName = catName;
// alterPartition is only for changing the partition location in the table rename
if (dataWasMoved) {
-
- int partsToProcess = parts.size();
- int partitionBatchSize = MetastoreConf.getIntVar(handler.getConf(),
- MetastoreConf.ConfVars.BATCH_RETRIEVE_MAX);
- int batchStart = 0;
- while (partsToProcess > 0) {
- int batchEnd = Math.min(batchStart + partitionBatchSize, parts.size());
- List<Partition> partBatch = parts.subList(batchStart, batchEnd);
- int partBatchSize = partBatch.size();
- partsToProcess -= partBatchSize;
- batchStart += partBatchSize;
- List<List<String>> partValues = new ArrayList<>(partBatchSize);
- for (Partition part : partBatch) {
- partValues.add(part.getValues());
+ Batchable.runBatched(partitionBatchSize, parts, new
Batchable<Partition, Void>() {
+ @Override
+ public List<Void> run(List<Partition> input) throws Exception {
+ msdb.alterPartitions(catalogName, newDbName, newTblName,
+ input.stream().map(Partition::getValues).collect(Collectors.toList()),
+ input, newt.getWriteId(), writeIdList);
+ return Collections.emptyList();
}
- msdb.alterPartitions(catName, newDbName, newTblName, partValues,
- partBatch, newt.getWriteId(), writeIdList);
- }
+ });
}
Deadline.checkTimeout();
- Table table = msdb.getTable(catName, newDbName, newTblName);
- MTable mTable = msdb.ensureGetMTable(catName, newDbName, newTblName);
- for (Entry<Partition, ColumnStatistics> partColStats :
columnStatsNeedUpdated.entries()) {
- ColumnStatistics newPartColStats = partColStats.getValue();
- newPartColStats.getStatsDesc().setDbName(newDbName);
- newPartColStats.getStatsDesc().setTableName(newTblName);
- msdb.updatePartitionColumnStatistics(table, mTable, newPartColStats,
- partColStats.getKey().getValues(), writeIdList, newt.getWriteId());
+ if (rename) {
+ for (Entry<String, Map<String, ColumnStatistics>> entry :
engineToColStats.entrySet()) {
+ // We will send ALTER_TABLE event after the db change, set listeners to null so that no extra
+ // event that could pollute the replication will be sent.
+ msdb.updatePartitionColumnStatisticsInBatch(entry.getValue(), oldt,
+ null, writeIdList, newt.getWriteId());
+ }
}
} else {
msdb.alterTable(catName, dbname, name, newt, writeIdList);
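
HiveAlterHandler applies the same idea one level up: partitions are grouped by their column schema so the statistics for a whole group come back from one getPartitionColumnStatistics call, and the hand-rolled while loop over sublists is replaced by Batchable.runBatched. A generic sketch of that batching helper follows, written from scratch for illustration; Hive's actual Batchable class differs in detail.

import java.util.Arrays;
import java.util.List;
import java.util.function.Consumer;

class RunBatchedSketch {
  // Applies op to fixed-size slices of all, so no single call grows unbounded.
  static <T> void runBatched(int batchSize, List<T> all, Consumer<List<T>> op) {
    for (int start = 0; start < all.size(); start += batchSize) {
      op.accept(all.subList(start, Math.min(start + batchSize, all.size())));
    }
  }

  public static void main(String[] args) {
    // 10 items with batchSize 3 yield slices of 3, 3, 3 and 1.
    runBatched(3, Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
        batch -> System.out.println(batch));
  }
}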
diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java
index 5848abd2064..163c855833e 100644
--- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java
+++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java
@@ -288,11 +288,13 @@ public class StatObjectConverter {
if (mStatsObj.getNumNulls() != null) {
setStmt.append("\"NUM_NULLS\" = ? ,");
}
- setStmt.append("\"ENGINE\" = ? ");
+ setStmt.append("\"ENGINE\" = ? ,");
+ setStmt.append("\"DB_NAME\" = ? ,");
+ setStmt.append("\"TABLE_NAME\" = ? ");
return setStmt.toString();
}
- public static void initUpdatedColumnStatement(MPartitionColumnStatistics mStatsObj,
+ public static int initUpdatedColumnStatement(MPartitionColumnStatistics mStatsObj,
PreparedStatement pst) throws SQLException {
int colIdx = 1;
if (mStatsObj.getAvgColLen() != null) {
@@ -339,6 +341,9 @@ public class StatObjectConverter {
pst.setObject(colIdx++, mStatsObj.getNumNulls());
}
pst.setString(colIdx++, mStatsObj.getEngine());
+ pst.setString(colIdx++, mStatsObj.getDbName());
+ pst.setString(colIdx++, mStatsObj.getTableName());
+ return colIdx;
}
public static ColumnStatisticsObj getTableColumnStatisticsObj(
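
The StatObjectConverter changes tie the two files together: the generated SET clause now also rewrites DB_NAME and TABLE_NAME, so a renamed table keeps its PART_COL_STATS rows, and initUpdatedColumnStatement returns the next free parameter index instead of void, letting the caller keep binding the WHERE clause (PART_ID, COLUMN_NAME, ENGINE) on the same statement. A small sketch of that index-handoff convention, with the SET clause reduced to just the three columns added here:

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;

class IndexHandoffSketch {
  // Binds the SET-clause values and returns the next unbound parameter index.
  static int bindSetClause(PreparedStatement pst, String engine, String dbName,
      String tableName) throws SQLException {
    int colIdx = 1;
    pst.setString(colIdx++, engine);
    pst.setString(colIdx++, dbName);
    pst.setString(colIdx++, tableName);
    return colIdx;
  }

  static void updateOneRow(Connection conn, long partId, String colName, String engine,
      String dbName, String tableName) throws SQLException {
    String sql = "UPDATE \"PART_COL_STATS\" SET \"ENGINE\" = ?, \"DB_NAME\" = ?, \"TABLE_NAME\" = ?"
        + " WHERE \"PART_ID\" = ? AND \"COLUMN_NAME\" = ? AND \"ENGINE\" = ?";
    try (PreparedStatement pst = conn.prepareStatement(sql)) {
      int colIdx = bindSetClause(pst, engine, dbName, tableName);
      pst.setLong(colIdx++, partId);    // caller resumes at the returned index
      pst.setString(colIdx++, colName);
      pst.setString(colIdx++, engine);
      pst.executeUpdate();
    }
  }
}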