AMBARI-13525 Exception in collector logs for JOIN queries (dsen)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/646de575
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/646de575
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/646de575

Branch: refs/heads/branch-dev-patch-upgrade
Commit: 646de5754fd031270ed265cf7332a925ada24df2
Parents: 2232471
Author: Dmytro Sen <[email protected]>
Authored: Mon Nov 2 18:08:47 2015 +0200
Committer: Dmytro Sen <[email protected]>
Committed: Mon Nov 2 18:08:47 2015 +0200

----------------------------------------------------------------------
 .../metrics/timeline/PhoenixHBaseAccessor.java  | 22 ++++++++
 .../timeline/query/PhoenixTransactSQL.java      | 20 ++++++-
 .../timeline/PhoenixHBaseAccessorTest.java      | 58 ++++++++++++++++++--
 .../timeline/TestPhoenixTransactSQL.java        | 24 ++++++++
 .../0.1.0/configuration/ams-hbase-site.xml      | 22 +++++++-
 .../stacks/HDP/2.0.6/services/stack_advisor.py  |  4 ++
 .../stacks/2.2/common/test_stack_advisor.py     |  1 +
 7 files changed, 145 insertions(+), 6 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/ambari/blob/646de575/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java
----------------------------------------------------------------------
diff --git a/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java
index 214cf1d..3ce30fd 100644
--- a/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java
+++ b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java
@@ -21,6 +21,7 @@ import org.apache.commons.lang.StringUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.DoNotRetryIOException;
 import org.apache.hadoop.hbase.util.RetryCounter;
 import org.apache.hadoop.hbase.util.RetryCounterFactory;
 import org.apache.hadoop.metrics2.sink.timeline.Precision;
@@ -39,6 +40,7 @@ import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.
 import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.query.PhoenixTransactSQL;
 import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.query.SplitByMetricNamesCondition;
 import org.apache.hadoop.yarn.util.timeline.TimelineUtils;
+import org.apache.phoenix.exception.PhoenixIOException;
 import org.apache.phoenix.exception.SQLExceptionCode;
 import org.codehaus.jackson.map.ObjectMapper;
 import org.codehaus.jackson.type.TypeReference;
@@ -463,6 +465,26 @@ public class PhoenixHBaseAccessor {
         }
       }
+    } catch (PhoenixIOException pioe) {
+      Throwable pioe2 = pioe.getCause();
+      // Need to find out if this is exception "Could not find hash cache
+      // for joinId" or another PhoenixIOException
+      if (pioe2 instanceof PhoenixIOException &&
+          pioe2.getCause() instanceof DoNotRetryIOException) {
+        String className = null;
+        for (StackTraceElement ste : pioe2.getCause().getStackTrace()) {
+          className = ste.getClassName();
+        }
+
+        if (className != null && className.equals("HashJoinRegionScanner")) {
+          LOG.error("The cache might have expired and have been removed. Try to" +
+            " increase the cache size by setting bigger value for " +
+            "phoenix.coprocessor.maxMetaDataCacheSize in ams-hbase-site config." +
+            " Falling back to sort-merge join algorithm.");
+          PhoenixTransactSQL.setSortMergeJoinEnabled(true);
+        }
+      }
+      throw pioe;
     } catch (RuntimeException ex) {
       // We need to find out if this is a real IO exception
       // or exception "maxStamp is smaller than minStamp"
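
The new catch block keys its fallback on the class name of the last frame in the DoNotRetryIOException's stack trace. Below is a minimal, self-contained sketch of that detection idea using only JDK types; the helper name and fabricated frames are illustrative, not from the patch. Note that StackTraceElement.getClassName() returns a fully qualified name, so the comparison only matches a frame whose declaring-class string is the bare "HashJoinRegionScanner", which is exactly what the unit test below fabricates.

// Illustrative sketch only: mirrors the detection logic in the catch block above.
public final class HashJoinFailureDetectorSketch {

  // Walk the stack trace, keeping the class name of the LAST frame,
  // exactly as the patch's for-loop does.
  static boolean lastFrameIsHashJoinRegionScanner(Throwable rootCause) {
    String className = null;
    for (StackTraceElement ste : rootCause.getStackTrace()) {
      className = ste.getClassName();
    }
    return className != null && className.equals("HashJoinRegionScanner");
  }

  public static void main(String[] args) {
    // Fabricated exception standing in for the server-side
    // "Could not find hash cache for joinId" failure.
    Exception cause = new Exception("Could not find hash cache for joinId");
    cause.setStackTrace(new StackTraceElement[]{
      new StackTraceElement("SomeCoprocessorCaller", "next", "SomeCoprocessorCaller.java", 10),
      new StackTraceElement("HashJoinRegionScanner", "nextRaw", "HashJoinRegionScanner.java", 42)
    });
    System.out.println(lastFrameIsHashJoinRegionScanner(cause)); // prints: true
  }
}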
http://git-wip-us.apache.org/repos/asf/ambari/blob/646de575/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/query/PhoenixTransactSQL.java
----------------------------------------------------------------------
diff --git a/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/query/PhoenixTransactSQL.java b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/query/PhoenixTransactSQL.java
index 1ab92a0..092c983 100644
--- a/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/query/PhoenixTransactSQL.java
+++ b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/query/PhoenixTransactSQL.java
@@ -165,7 +165,7 @@ public class PhoenixTransactSQL {
    * Different queries for a number and a single hosts are used due to bug
    * in Apache Phoenix
    */
-  public static final String GET_LATEST_METRIC_SQL = "SELECT " +
+  public static final String GET_LATEST_METRIC_SQL = "SELECT %s " +
     "E.METRIC_NAME AS METRIC_NAME, E.HOSTNAME AS HOSTNAME, " +
     "E.APP_ID AS APP_ID, E.INSTANCE_ID AS INSTANCE_ID, " +
     "E.SERVER_TIME AS SERVER_TIME, E.START_TIME AS START_TIME, " +
@@ -257,6 +257,7 @@ public class PhoenixTransactSQL {
   public static final long NATIVE_TIME_RANGE_DELTA = 120000; // 2 minutes
   public static final long HOUR = 3600000; // 1 hour
   public static final long DAY = 86400000; // 1 day
+  private static boolean sortMergeJoinEnabled = false;

   /**
    * Filter to optimize HBase scan by using file timestamps. This prevents
@@ -268,6 +269,22 @@ public class PhoenixTransactSQL {
     return String.format("/*+ NATIVE_TIME_RANGE(%s) */", (startTime - delta));
   }

+  /**
+   * Falling back to sort merge join algorithm if default queries fail.
+   *
+   * @return Phoenix Hint String
+   */
+  public static String getLatestMetricsHints() {
+    if (sortMergeJoinEnabled) {
+      return "/*+ USE_SORT_MERGE_JOIN NO_CACHE */";
+    }
+    return "";
+  }
+
+  public static void setSortMergeJoinEnabled(boolean sortMergeJoinEnabled) {
+    PhoenixTransactSQL.sortMergeJoinEnabled = sortMergeJoinEnabled;
+  }
+
   public static PreparedStatement prepareGetMetricsSqlStmt(Connection connection,
     Condition condition) throws SQLException {

@@ -448,6 +465,7 @@ public class PhoenixTransactSQL {
       stmtStr = condition.getStatement();
     } else {
       stmtStr = String.format(GET_LATEST_METRIC_SQL,
+        getLatestMetricsHints(),
         METRICS_RECORD_TABLE_NAME,
         METRICS_RECORD_TABLE_NAME,
         condition.getConditionClause());
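
The %s that now leads GET_LATEST_METRIC_SQL is filled by getLatestMetricsHints(), so the sort-merge-join hint only appears in generated SQL after the fallback has been tripped. A short sketch of how the formatted statement comes out; the abbreviated SQL constant and sample condition clause here are stand-ins, not the real GET_LATEST_METRIC_SQL:

// Sketch only: GET_LATEST_METRIC_SQL_ABBREV is a shortened stand-in for the
// real constant, which selects many more columns.
public class LatestMetricHintSketch {
  static final String GET_LATEST_METRIC_SQL_ABBREV =
    "SELECT %s E.METRIC_NAME AS METRIC_NAME FROM %s AS E WHERE %s";

  public static void main(String[] args) {
    // Before the fallback trips, getLatestMetricsHints() returns "" and the
    // query is unchanged; afterwards it returns the sort-merge-join hint.
    String hint = "/*+ USE_SORT_MERGE_JOIN NO_CACHE */";
    System.out.println(String.format(GET_LATEST_METRIC_SQL_ABBREV,
      hint, "METRIC_RECORD", "METRIC_NAME IN (?, ?)"));
    // -> SELECT /*+ USE_SORT_MERGE_JOIN NO_CACHE */ E.METRIC_NAME ...
  }
}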
http://git-wip-us.apache.org/repos/asf/ambari/blob/646de575/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessorTest.java
----------------------------------------------------------------------
diff --git a/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessorTest.java b/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessorTest.java
index 2d48a58..0a6f120 100644
--- a/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessorTest.java
+++ b/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessorTest.java
@@ -18,6 +18,7 @@
 package org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline;

 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.DoNotRetryIOException;
 import org.apache.hadoop.metrics2.sink.timeline.Precision;
 import org.apache.hadoop.metrics2.sink.timeline.TimelineMetrics;
 import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.aggregators.Function;
@@ -25,6 +26,7 @@
 import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.query.ConnectionProvider;
 import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.query.DefaultCondition;
 import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.query.PhoenixTransactSQL;
+import org.apache.phoenix.exception.PhoenixIOException;
 import org.easymock.EasyMock;
 import org.junit.Test;
 import org.junit.runner.RunWith;
@@ -43,10 +45,8 @@ import java.util.List;
 import java.util.Map;

 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;

-/**
- * Created by user on 9/7/15.
- */
 @RunWith(PowerMockRunner.class)
 @PrepareForTest(PhoenixTransactSQL.class)
 public class PhoenixHBaseAccessorTest {
@@ -96,7 +96,8 @@ public class PhoenixHBaseAccessorTest {
   }

   @Test
-  public void testGetMetricRecordsException() throws SQLException, IOException {
+  public void testGetMetricRecordsIOException()
+    throws SQLException, IOException {
     Configuration hbaseConf = new Configuration();
     hbaseConf.setStrings(ZOOKEEPER_QUORUM, "quorum");

@@ -139,4 +140,53 @@ public class PhoenixHBaseAccessorTest {
     EasyMock.verify(preparedStatementMock, rsMock, io, runtimeException);
   }

+  @Test
+  public void testGetMetricRecordsPhoenixIOExceptionDoNotRetryException()
+    throws SQLException, IOException {
+
+    Configuration hbaseConf = new Configuration();
+    hbaseConf.setStrings(ZOOKEEPER_QUORUM, "quorum");
+    Configuration metricsConf = new Configuration();
+
+    ConnectionProvider connectionProvider = new ConnectionProvider() {
+      @Override
+      public Connection getConnection() throws SQLException {
+        return null;
+      }
+    };
+
+    PhoenixHBaseAccessor accessor = new PhoenixHBaseAccessor(hbaseConf, metricsConf, connectionProvider);
+
+    List<String> metricNames = new LinkedList<>();
+    List<String> hostnames = new LinkedList<>();
+    Map<String, List<Function>> metricFunctions = new HashMap<>();
+
+    PowerMock.mockStatic(PhoenixTransactSQL.class);
+    PreparedStatement preparedStatementMock = EasyMock.createNiceMock(PreparedStatement.class);
+    Condition condition = new DefaultCondition(metricNames, hostnames, "appid", "instanceid", null, null, Precision.SECONDS, 10, true);
+    EasyMock.expect(PhoenixTransactSQL.prepareGetLatestMetricSqlStmt(null, condition)).andReturn(preparedStatementMock).once();
+    PhoenixTransactSQL.setSortMergeJoinEnabled(true);
+    EasyMock.expectLastCall();
+    ResultSet rsMock = EasyMock.createNiceMock(ResultSet.class);
+    PhoenixIOException pioe1 = EasyMock.createNiceMock(PhoenixIOException.class);
+    PhoenixIOException pioe2 = EasyMock.createNiceMock(PhoenixIOException.class);
+    DoNotRetryIOException dnrioe = EasyMock.createNiceMock(DoNotRetryIOException.class);
+    EasyMock.expect(preparedStatementMock.executeQuery()).andThrow(pioe1);
+    EasyMock.expect(pioe1.getCause()).andReturn(pioe2).atLeastOnce();
+    EasyMock.expect(pioe2.getCause()).andReturn(dnrioe).atLeastOnce();
+    StackTraceElement stackTrace[] = new StackTraceElement[]{new StackTraceElement("HashJoinRegionScanner","method","file",1)};
+    EasyMock.expect(dnrioe.getStackTrace()).andReturn(stackTrace).atLeastOnce();
+
+
+    PowerMock.replayAll();
+    EasyMock.replay(preparedStatementMock, rsMock, pioe1, pioe2, dnrioe);
+    try {
+      TimelineMetrics tml = accessor.getMetricRecords(condition, metricFunctions);
+      fail();
+    } catch (Exception e) {
+      //NOP
+    }
+    PowerMock.verifyAll();
+  }
+
 }
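
The test above assembles the PhoenixIOException → PhoenixIOException → DoNotRetryIOException cause chain entirely out of EasyMock mocks. For readers who find the mock wiring hard to follow, here is a sketch of the same fixture built from plain exception objects; it assumes PhoenixIOException's public Throwable constructor, and the stack-frame details are fabricated:

import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.phoenix.exception.PhoenixIOException;

// Sketch only: builds the same cause chain the mocks above simulate.
public class HashJoinFailureFixture {
  public static PhoenixIOException hashJoinCacheMiss() {
    DoNotRetryIOException dnrioe =
      new DoNotRetryIOException("Could not find hash cache for joinId");
    // The accessor matches on the class name of the last stack frame.
    dnrioe.setStackTrace(new StackTraceElement[]{
      new StackTraceElement("HashJoinRegionScanner", "nextRaw", "HashJoinRegionScanner.java", 1)
    });
    PhoenixIOException inner = new PhoenixIOException(dnrioe);
    return new PhoenixIOException(inner); // outer exception seen by the accessor
  }
}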
http://git-wip-us.apache.org/repos/asf/ambari/blob/646de575/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/TestPhoenixTransactSQL.java
----------------------------------------------------------------------
diff --git a/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/TestPhoenixTransactSQL.java b/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/TestPhoenixTransactSQL.java
index ec6a472..6bf15c7 100644
--- a/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/TestPhoenixTransactSQL.java
+++ b/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/TestPhoenixTransactSQL.java
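
One thing to watch with the new static flag: once a test calls setSortMergeJoinEnabled(true), the hint leaks into the SQL generated by every later test in the same JVM. A small hedged addition (not part of this patch) that a test class could use to restore the default:

import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.query.PhoenixTransactSQL;
import org.junit.After;

// Not in the commit: resets the static fallback flag so one test's fallback
// does not change the statements generated for subsequent tests.
public class ResetSortMergeJoinFlag {
  @After
  public void resetSortMergeJoin() {
    PhoenixTransactSQL.setSortMergeJoinEnabled(false);
  }
}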
@@ -382,6 +382,30 @@ public class TestPhoenixTransactSQL {
   }

   @Test
+  public void testPrepareGetLatestMetricSqlStmtSortMergeJoinAlgorithm()
+    throws SQLException {
+    Condition condition = new DefaultCondition(
+      Arrays.asList("cpu_user", "mem_free"), Arrays.asList("h1"),
+      "a1", "i1", null, null, null, null, false);
+    Connection connection = createNiceMock(Connection.class);
+    PreparedStatement preparedStatement = createNiceMock(PreparedStatement.class);
+    ParameterMetaData parameterMetaData = createNiceMock(ParameterMetaData.class);
+    Capture<String> stmtCapture = new Capture<String>();
+    expect(connection.prepareStatement(EasyMock.and(EasyMock.anyString(), EasyMock.capture(stmtCapture))))
+      .andReturn(preparedStatement);
+    expect(preparedStatement.getParameterMetaData())
+      .andReturn(parameterMetaData).anyTimes();
+    expect(parameterMetaData.getParameterCount())
+      .andReturn(6).anyTimes();
+
+    replay(connection, preparedStatement, parameterMetaData);
+    PhoenixTransactSQL.setSortMergeJoinEnabled(true);
+    PhoenixTransactSQL.prepareGetLatestMetricSqlStmt(connection, condition);
+    String stmt = stmtCapture.getValue();
+    Assert.assertTrue(stmt.contains("/*+ USE_SORT_MERGE_JOIN NO_CACHE */"));
+  }
+
+  @Test
   public void testPrepareGetMetricsPrecisionHours() throws SQLException {
     Condition condition = new DefaultCondition(
       Arrays.asList("cpu_user", "mem_free"), Collections.singletonList("h1"),

http://git-wip-us.apache.org/repos/asf/ambari/blob/646de575/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/configuration/ams-hbase-site.xml
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/configuration/ams-hbase-site.xml b/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/configuration/ams-hbase-site.xml
index dfc2878..3f4a9d4 100644
--- a/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/configuration/ams-hbase-site.xml
+++ b/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/configuration/ams-hbase-site.xml
@@ -333,7 +333,27 @@
       a normal table and would return items in rowkey order for scans
     </description>
   </property>
-
+  <property>
+    <name>phoenix.coprocessor.maxServerCacheTimeToLiveMs</name>
+    <value>60000</value>
+    <description>
+      Maximum living time (in milliseconds) of server caches. A cache entry
+      expires after this amount of time has passed since last access. Consider
+      adjusting this parameter when a server-side IOException(
+      “Could not find hash cache for joinId”) happens. Getting warnings like
+      “Earlier hash cache(s) might have expired on servers” might also be a
+      sign that this number should be increased.
+    </description>
+  </property>
+  <property>
+    <name>phoenix.coprocessor.maxMetaDataCacheSize</name>
+    <value>20480000</value>
+    <description>
+      Max size in bytes of total server-side metadata cache after which
+      evictions will begin to occur based on least recent access time.
+      Default is 20Mb
+    </description>
+  </property>
   <property>
     <name>dfs.client.read.shortcircuit</name>
     <value>true</value>
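
The maxServerCacheTimeToLiveMs description above is about expiry measured from the last access, not from insertion: each read refreshes the entry's lease, and an entry that goes unread for longer than the TTL disappears, which is what surfaces as "Could not find hash cache for joinId". A toy illustration of that semantic follows; it is emphatically not Phoenix's cache implementation:

import java.util.HashMap;
import java.util.Map;

// Illustrative only: shows "expire after last access" semantics.
public class LastAccessTtlCache<K, V> {
  private final long ttlMs;
  private final Map<K, V> values = new HashMap<>();
  private final Map<K, Long> lastAccess = new HashMap<>();

  public LastAccessTtlCache(long ttlMs) { this.ttlMs = ttlMs; }

  public void put(K key, V value) {
    values.put(key, value);
    lastAccess.put(key, System.currentTimeMillis());
  }

  /** Returns null if the entry was never cached or has expired. */
  public V get(K key) {
    Long last = lastAccess.get(key);
    if (last == null || System.currentTimeMillis() - last > ttlMs) {
      values.remove(key);
      lastAccess.remove(key);
      return null; // expired: analogous to the missing hash cache for a joinId
    }
    lastAccess.put(key, System.currentTimeMillis()); // access refreshes the TTL
    return values.get(key);
  }
}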
http://git-wip-us.apache.org/repos/asf/ambari/blob/646de575/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py
index 68522bf..7181122 100644
--- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py
@@ -480,11 +480,15 @@ class HDP206StackAdvisor(DefaultStackAdvisor):
       putAmsHbaseSiteProperty("hbase.regionserver.global.memstore.lowerLimit", 0.25)
       putAmsHbaseSiteProperty("phoenix.query.maxGlobalMemoryPercentage", 20)
       putTimelineServiceProperty("phoenix.query.maxGlobalMemoryPercentage", 30)
+      putAmsHbaseSiteProperty("phoenix.coprocessor.maxMetaDataCacheSize", 81920000)
     elif total_sinks_count >= 500:
       putAmsHbaseSiteProperty("hbase.regionserver.handler.count", 60)
       putAmsHbaseSiteProperty("hbase.regionserver.hlog.blocksize", 134217728)
       putAmsHbaseSiteProperty("hbase.regionserver.maxlogs", 64)
       putAmsHbaseSiteProperty("hbase.hregion.memstore.flush.size", 268435456)
+      putAmsHbaseSiteProperty("phoenix.coprocessor.maxMetaDataCacheSize", 40960000)
+    else:
+      putAmsHbaseSiteProperty("phoenix.coprocessor.maxMetaDataCacheSize", 20480000)
     pass

     # Distributed mode heap size

http://git-wip-us.apache.org/repos/asf/ambari/blob/646de575/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py b/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py
index b38af78..a5a588f 100644
--- a/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py
+++ b/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py
@@ -1999,6 +1999,7 @@ class TestHDP22StackAdvisor(TestCase):
       },
       "ams-hbase-site": {
         "properties": {
+          "phoenix.coprocessor.maxMetaDataCacheSize": "20480000",
           "hbase.regionserver.global.memstore.lowerLimit": "0.3",
           "hbase.regionserver.global.memstore.upperLimit": "0.35",
           "hbase.hregion.memstore.flush.size": "134217728",
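
Taken together, the stack_advisor hunk above tiers phoenix.coprocessor.maxMetaDataCacheSize by the number of metric sinks. A sketch of that sizing rule in Java; the cutoff for the top tier is not visible in this hunk (only its branch body is shown), so the constant used here is an assumption for illustration:

// Sketch of the advisor's new sizing rule. TOP_TIER_SINKS is assumed; the
// hunk only shows the top branch body, the >= 500 branch, and the default.
public class MetaDataCacheSizeRule {
  static final int TOP_TIER_SINKS = 2000; // assumption: real cutoff elided in the hunk

  static long maxMetaDataCacheSize(int totalSinksCount) {
    if (totalSinksCount >= TOP_TIER_SINKS) {
      return 81920000L;  // ~80 MB for the largest clusters
    } else if (totalSinksCount >= 500) {
      return 40960000L;  // ~40 MB
    }
    return 20480000L;    // ~20 MB, matching the ams-hbase-site.xml default
  }
}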
