HIVE-18986: Table rename will run into java.lang.StackOverflowError in DataNucleus if the table contains a large number of columns (Aihua Xu, reviewed by Yongzhi Chen)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/f30efbeb Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/f30efbeb Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/f30efbeb Branch: refs/heads/master Commit: f30efbebf2ff85c55a5d9e3e2f86e0a51341df78 Parents: 11b0d85 Author: Aihua Xu <[email protected]> Authored: Wed Apr 18 17:05:08 2018 -0700 Committer: Aihua Xu <[email protected]> Committed: Wed Apr 25 16:10:30 2018 -0700 ---------------------------------------------------------------------- .../queries/clientpositive/alter_rename_table.q | 12 ++- .../clientpositive/alter_rename_table.q.out | 88 ++++++++++++++++++++ .../apache/hadoop/hive/metastore/Batchable.java | 86 +++++++++++++++++++ .../hive/metastore/MetaStoreDirectSql.java | 61 ++------------ .../hadoop/hive/metastore/ObjectStore.java | 45 ++++++---- .../hive/metastore/conf/MetastoreConf.java | 5 ++ 6 files changed, 227 insertions(+), 70 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/f30efbeb/ql/src/test/queries/clientpositive/alter_rename_table.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/alter_rename_table.q b/ql/src/test/queries/clientpositive/alter_rename_table.q index 53fb230..bcf6ad5 100644 --- a/ql/src/test/queries/clientpositive/alter_rename_table.q +++ b/ql/src/test/queries/clientpositive/alter_rename_table.q @@ -36,4 +36,14 @@ create table source.src1 like default.src; load data local inpath '../../data/files/kv1.txt' overwrite into table source.src; ALTER TABLE source.src RENAME TO target.src1; -select * from target.src1 tablesample (10 rows); \ No newline at end of file +select * from target.src1 tablesample (10 rows); + +set metastore.rawstore.batch.size=1; +set metastore.try.direct.sql=false; + +create table source.src2 like default.src; +load data local 
inpath '../../data/files/kv1.txt' overwrite into table source.src2; +ANALYZE TABlE source.src2 COMPUTE STATISTICS FOR COLUMNS; +ALTER TABLE source.src2 RENAME TO target.src3; +DESC FORMATTED target.src3; +select * from target.src3 tablesample (10 rows); http://git-wip-us.apache.org/repos/asf/hive/blob/f30efbeb/ql/src/test/results/clientpositive/alter_rename_table.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/alter_rename_table.q.out b/ql/src/test/results/clientpositive/alter_rename_table.q.out index 732d8a2..9ac8fd2 100644 --- a/ql/src/test/results/clientpositive/alter_rename_table.q.out +++ b/ql/src/test/results/clientpositive/alter_rename_table.q.out @@ -261,3 +261,91 @@ POSTHOOK: Input: target@src1 278 val_278 98 val_98 484 val_484 +PREHOOK: query: create table source.src2 like default.src +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:source +PREHOOK: Output: source@src2 +POSTHOOK: query: create table source.src2 like default.src +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:source +POSTHOOK: Output: source@src2 +PREHOOK: query: load data local inpath '../../data/files/kv1.txt' overwrite into table source.src2 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: source@src2 +POSTHOOK: query: load data local inpath '../../data/files/kv1.txt' overwrite into table source.src2 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: source@src2 +PREHOOK: query: ANALYZE TABlE source.src2 COMPUTE STATISTICS FOR COLUMNS +PREHOOK: type: QUERY +PREHOOK: Input: source@src2 +#### A masked pattern was here #### +PREHOOK: Output: source@src2 +POSTHOOK: query: ANALYZE TABlE source.src2 COMPUTE STATISTICS FOR COLUMNS +POSTHOOK: type: QUERY +POSTHOOK: Input: source@src2 +#### A masked pattern was here #### +POSTHOOK: Output: source@src2 +PREHOOK: query: ALTER TABLE source.src2 RENAME TO target.src3 +PREHOOK: type: ALTERTABLE_RENAME 
+PREHOOK: Input: source@src2 +PREHOOK: Output: source@src2 +POSTHOOK: query: ALTER TABLE source.src2 RENAME TO target.src3 +POSTHOOK: type: ALTERTABLE_RENAME +POSTHOOK: Input: source@src2 +POSTHOOK: Output: source@src2 +POSTHOOK: Output: target@src3 +PREHOOK: query: DESC FORMATTED target.src3 +PREHOOK: type: DESCTABLE +PREHOOK: Input: target@src3 +POSTHOOK: query: DESC FORMATTED target.src3 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: target@src3 +# col_name data_type comment +key string default +value string default + +# Detailed Table Information +Database: target +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} +#### A masked pattern was here #### + numFiles 1 + numRows 500 + rawDataSize 5312 + totalSize 5812 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from target.src3 tablesample (10 rows) +PREHOOK: type: QUERY +PREHOOK: Input: target@src3 +#### A masked pattern was here #### +POSTHOOK: query: select * from target.src3 tablesample (10 rows) +POSTHOOK: type: QUERY +POSTHOOK: Input: target@src3 +#### A masked pattern was here #### +238 val_238 +86 val_86 +311 val_311 +27 val_27 +165 val_165 +409 val_409 +255 val_255 +278 val_278 +98 val_98 +484 val_484 http://git-wip-us.apache.org/repos/asf/hive/blob/f30efbeb/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/Batchable.java ---------------------------------------------------------------------- diff --git 
a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/Batchable.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/Batchable.java new file mode 100644 index 0000000..7e488a5 --- /dev/null +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/Batchable.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.metastore; + +import java.util.ArrayList; +import java.util.List; +import javax.jdo.Query; + +import org.apache.hadoop.hive.metastore.api.MetaException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Base class to add the batch process for DirectSQL or RawStore queries. + * 1. Provide the implementation of run() to process one batch + * 2. 
Call Batchable.runBatched() to process the whole dataset + * + * I: input type, R: result type + */ +public abstract class Batchable<I, R> { + private static final Logger LOG = LoggerFactory.getLogger(Batchable.class); + public static final int NO_BATCHING = -1; + + private List<Query> queries = null; + public abstract List<R> run(List<I> input) throws MetaException; + + public void addQueryAfterUse(Query query) { + if (queries == null) { + queries = new ArrayList<Query>(1); + } + queries.add(query); + } + protected void addQueryAfterUse(Batchable<?, ?> b) { + if (b.queries == null) { + return; + } + if (queries == null) { + queries = new ArrayList<Query>(1); + } + queries.addAll(b.queries); + } + public void closeAllQueries() { + for (Query q : queries) { + try { + q.closeAll(); + } catch (Throwable t) { + LOG.error("Failed to close a query", t); + } + } + } + + public static <I, R> List<R> runBatched( + final int batchSize, + List<I> input, + Batchable<I, R> runnable) throws MetaException { + if (batchSize == NO_BATCHING || batchSize >= input.size()) { + return runnable.run(input); + } + List<R> result = new ArrayList<R>(input.size()); + for (int fromIndex = 0, toIndex = 0; toIndex < input.size(); fromIndex = toIndex) { + toIndex = Math.min(fromIndex + batchSize, input.size()); + List<I> batchedInput = input.subList(fromIndex, toIndex); + List<R> batchedOutput = runnable.run(batchedInput); + if (batchedOutput != null) { + result.addAll(batchedOutput); + } + } + return result; + } +} http://git-wip-us.apache.org/repos/asf/hive/blob/f30efbeb/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java index 997f5fd..4e0e887 100644 --- 
a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java @@ -455,7 +455,7 @@ class MetaStoreDirectSql { if (partNames.isEmpty()) { return Collections.emptyList(); } - return runBatched(partNames, new Batchable<String, Partition>() { + return Batchable.runBatched(batchSize, partNames, new Batchable<String, Partition>() { @Override public List<Partition> run(List<String> input) throws MetaException { String filter = "" + PARTITIONS + ".\"PART_NAME\" in (" + makeParams(input.size()) + ")"; @@ -596,7 +596,7 @@ class MetaStoreDirectSql { } // Get full objects. For Oracle/etc. do it in batches. - List<Partition> result = runBatched(sqlResult, new Batchable<Object, Partition>() { + List<Partition> result = Batchable.runBatched(batchSize, sqlResult, new Batchable<Object, Partition>() { @Override public List<Partition> run(List<Object> input) throws MetaException { return getPartitionsFromPartitionIds(catNameLcase, dbNameLcase, tblNameLcase, isView, @@ -1374,7 +1374,7 @@ class MetaStoreDirectSql { return ensureList(qResult); } }; - List<Object[]> list = runBatched(colNames, b); + List<Object[]> list = Batchable.runBatched(batchSize, colNames, b); if (list.isEmpty()) { return null; } @@ -1460,10 +1460,10 @@ class MetaStoreDirectSql { + " where \"CAT_NAME\" = ? and \"DB_NAME\" = ? and \"TABLE_NAME\" = ? 
" + " and \"COLUMN_NAME\" in (%1$s) and \"PARTITION_NAME\" in (%2$s)" + " group by \"PARTITION_NAME\""; - List<Long> allCounts = runBatched(colNames, new Batchable<String, Long>() { + List<Long> allCounts = Batchable.runBatched(batchSize, colNames, new Batchable<String, Long>() { @Override public List<Long> run(final List<String> inputColName) throws MetaException { - return runBatched(partNames, new Batchable<String, Long>() { + return Batchable.runBatched(batchSize, partNames, new Batchable<String, Long>() { @Override public List<Long> run(List<String> inputPartNames) throws MetaException { long partsFound = 0; @@ -1503,10 +1503,10 @@ class MetaStoreDirectSql { final String tableName, final List<String> partNames, List<String> colNames, long partsFound, final boolean useDensityFunctionForNDVEstimation, final double ndvTuner, final boolean enableBitVector) throws MetaException { final boolean areAllPartsFound = (partsFound == partNames.size()); - return runBatched(colNames, new Batchable<String, ColumnStatisticsObj>() { + return Batchable.runBatched(batchSize, colNames, new Batchable<String, ColumnStatisticsObj>() { @Override public List<ColumnStatisticsObj> run(final List<String> inputColNames) throws MetaException { - return runBatched(partNames, new Batchable<String, ColumnStatisticsObj>() { + return Batchable.runBatched(batchSize, partNames, new Batchable<String, ColumnStatisticsObj>() { @Override public List<ColumnStatisticsObj> run(List<String> inputPartNames) throws MetaException { return columnStatisticsObjForPartitionsBatch(catName, dbName, tableName, inputPartNames, @@ -1918,13 +1918,13 @@ class MetaStoreDirectSql { } }; try { - return runBatched(partNames, b2); + return Batchable.runBatched(batchSize, partNames, b2); } finally { addQueryAfterUse(b2); } } }; - List<Object[]> list = runBatched(colNames, b); + List<Object[]> list = Batchable.runBatched(batchSize, colNames, b); List<ColumnStatistics> result = new ArrayList<ColumnStatistics>( 
Math.min(list.size(), partNames.size())); @@ -2027,49 +2027,6 @@ class MetaStoreDirectSql { } - private static abstract class Batchable<I, R> { - private List<Query> queries = null; - public abstract List<R> run(List<I> input) throws MetaException; - public void addQueryAfterUse(Query query) { - if (queries == null) { - queries = new ArrayList<Query>(1); - } - queries.add(query); - } - protected void addQueryAfterUse(Batchable<?, ?> b) { - if (b.queries == null) return; - if (queries == null) { - queries = new ArrayList<Query>(1); - } - queries.addAll(b.queries); - } - public void closeAllQueries() { - for (Query q : queries) { - try { - q.closeAll(); - } catch (Throwable t) { - LOG.error("Failed to close a query", t); - } - } - } - } - - private <I,R> List<R> runBatched(List<I> input, Batchable<I, R> runnable) throws MetaException { - if (batchSize == NO_BATCHING || batchSize >= input.size()) { - return runnable.run(input); - } - List<R> result = new ArrayList<R>(input.size()); - for (int fromIndex = 0, toIndex = 0; toIndex < input.size(); fromIndex = toIndex) { - toIndex = Math.min(fromIndex + batchSize, input.size()); - List<I> batchedInput = input.subList(fromIndex, toIndex); - List<R> batchedOutput = runnable.run(batchedInput); - if (batchedOutput != null) { - result.addAll(batchedOutput); - } - } - return result; - } - public List<SQLForeignKey> getForeignKeys(String catName, String parent_db_name, String parent_tbl_name, String foreign_db_name, String foreign_tbl_name) throws MetaException { http://git-wip-us.apache.org/repos/asf/hive/blob/f30efbeb/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/ObjectStore.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/ObjectStore.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/ObjectStore.java index 184ecb6..1abd99d 100644 --- 
a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/ObjectStore.java +++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/ObjectStore.java @@ -244,6 +244,7 @@ public class ObjectStore implements RawStore, Configurable { private static Properties prop = null; private static PersistenceManagerFactory pmf = null; private static boolean forTwoMetastoreTesting = false; + private int batchSize = Batchable.NO_BATCHING; private static final DateTimeFormatter YMDHMS_FORMAT = DateTimeFormatter.ofPattern( "yyyy_MM_dd_HH_mm_ss"); @@ -385,6 +386,8 @@ public class ObjectStore implements RawStore, Configurable { directSqlErrors = Metrics.getOrCreateCounter(MetricsConstants.DIRECTSQL_ERRORS); } + this.batchSize = MetastoreConf.getIntVar(conf, ConfVars.RAWSTORE_PARTITION_BATCH_SIZE); + if (!isInitialized) { throw new RuntimeException( "Unable to create persistence manager. Check dss.log for details"); @@ -8028,25 +8031,33 @@ public class ObjectStore implements RawStore, Configurable { try { openTransaction(); - List<MTableColumnStatistics> result = null; validateTableCols(table, colNames); Query query = queryWrapper.query = pm.newQuery(MTableColumnStatistics.class); - String filter = "tableName == t1 && dbName == t2 && catName == t3 && ("; - String paramStr = "java.lang.String t1, java.lang.String t2, java.lang.String t3"; - Object[] params = new Object[colNames.size() + 3]; - params[0] = table.getTableName(); - params[1] = table.getDbName(); - params[2] = table.getCatName(); - for (int i = 0; i < colNames.size(); ++i) { - filter += ((i == 0) ? 
"" : " || ") + "colName == c" + i; - paramStr += ", java.lang.String c" + i; - params[i + 3] = colNames.get(i); - } - filter += ")"; - query.setFilter(filter); - query.declareParameters(paramStr); - result = (List<MTableColumnStatistics>) query.executeWithArray(params); - pm.retrieveAll(result); + List<MTableColumnStatistics> result = + Batchable.runBatched(batchSize, colNames, new Batchable<String, MTableColumnStatistics>() { + @Override + public List<MTableColumnStatistics> run(List<String> input) + throws MetaException { + String filter = "tableName == t1 && dbName == t2 && catName == t3 && ("; + String paramStr = "java.lang.String t1, java.lang.String t2, java.lang.String t3"; + Object[] params = new Object[input.size() + 3]; + params[0] = table.getTableName(); + params[1] = table.getDbName(); + params[2] = table.getCatName(); + for (int i = 0; i < input.size(); ++i) { + filter += ((i == 0) ? "" : " || ") + "colName == c" + i; + paramStr += ", java.lang.String c" + i; + params[i + 3] = input.get(i); + } + filter += ")"; + query.setFilter(filter); + query.declareParameters(paramStr); + List<MTableColumnStatistics> paritial = (List<MTableColumnStatistics>) query.executeWithArray(params); + pm.retrieveAll(paritial); + return paritial; + } + }); + if (result.size() > colNames.size()) { throw new MetaException("Unexpected " + result.size() + " statistics for " + colNames.size() + " columns"); http://git-wip-us.apache.org/repos/asf/hive/blob/f30efbeb/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java index 552eeca..35aa40c 100644 --- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java +++ 
b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java @@ -856,6 +856,11 @@ public class MetastoreConf { "hive.metastore.wm.default.pool.size", 4, "The size of a default pool to create when creating an empty resource plan;\n" + "If not positive, no default pool will be created."), + RAWSTORE_PARTITION_BATCH_SIZE("metastore.rawstore.batch.size", + "metastore.rawstore.batch.size", -1, + "Batch size for partition and other object retrieval from the underlying DB in JDO.\n" + + "The JDO implementation such as DataNucleus may run into issues when the generated queries are\n" + + "too large. Use this parameter to break the query into multiple batches. -1 means no batching."), // Hive values we have copied and use as is // These two are used to indicate that we are running tests
