Repository: ambari Updated Branches: refs/heads/branch-2.4 495ad2a1a -> f6450ab75
AMBARI-17695 : AMS Split point calculation not optimal for large clusters. (avijayan) Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/f6450ab7 Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/f6450ab7 Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/f6450ab7 Branch: refs/heads/branch-2.4 Commit: f6450ab75fc9c55687229c81391f201f5b380267 Parents: 495ad2a Author: Aravindan Vijayan <[email protected]> Authored: Fri Jul 15 13:11:40 2016 -0700 Committer: Aravindan Vijayan <[email protected]> Committed: Fri Jul 15 13:11:40 2016 -0700 ---------------------------------------------------------------------- .../metrics/timeline/PhoenixHBaseAccessor.java | 11 ++++++-- .../server/upgrade/UpgradeCatalog240.java | 9 +++++++ .../0.1.0/configuration/ams-hbase-site.xml | 2 +- .../0.1.0/package/scripts/split_points.py | 19 ++++++------- .../server/upgrade/UpgradeCatalog240Test.java | 3 +++ .../stacks/2.2/common/test_stack_advisor.py | 28 +++++++++++--------- 6 files changed, 48 insertions(+), 24 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/f6450ab7/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java ---------------------------------------------------------------------- diff --git a/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java index b86f97a..16ebcd9 100644 --- a/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java +++ b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java @@ -418,10 +418,17 @@ public class PhoenixHBaseAccessor { precisionSql += getSplitPointsStr(splitPoints); } stmt.executeUpdate(precisionSql); - stmt.executeUpdate(String.format(CREATE_METRICS_AGGREGATE_TABLE_SQL, + + String hostMinuteAggregrateSql = String.format(CREATE_METRICS_AGGREGATE_TABLE_SQL, METRICS_AGGREGATE_MINUTE_TABLE_NAME, encoding, tableTTL.get(METRICS_AGGREGATE_MINUTE_TABLE_NAME), - compression)); + compression); + splitPoints = metricsConf.get(AGGREGATE_TABLE_SPLIT_POINTS); + if (!StringUtils.isEmpty(splitPoints)) { + hostMinuteAggregrateSql += getSplitPointsStr(splitPoints); + } + stmt.executeUpdate(hostMinuteAggregrateSql); + stmt.executeUpdate(String.format(CREATE_METRICS_AGGREGATE_TABLE_SQL, METRICS_AGGREGATE_HOURLY_TABLE_NAME, encoding, tableTTL.get(METRICS_AGGREGATE_HOURLY_TABLE_NAME), http://git-wip-us.apache.org/repos/asf/ambari/blob/f6450ab7/ambari-server/src/main/java/org/apache/ambari/server/upgrade/UpgradeCatalog240.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/upgrade/UpgradeCatalog240.java b/ambari-server/src/main/java/org/apache/ambari/server/upgrade/UpgradeCatalog240.java index a3e5beb..73d61c6 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/upgrade/UpgradeCatalog240.java +++ b/ambari-server/src/main/java/org/apache/ambari/server/upgrade/UpgradeCatalog240.java @@ -200,6 +200,7 @@ public class UpgradeCatalog240 extends AbstractUpgradeCatalog { public static final String MAPRED_SITE_QUEUE_NAME = "mapreduce.job.queuename"; private static final String AMS_HBASE_SITE = "ams-hbase-site"; private static final String HBASE_RPC_TIMEOUT_PROPERTY = "hbase.rpc.timeout"; + private static final String AMS_HBASE_SITE_NORMALIZER_ENABLED_PROPERTY = "hbase.normalizer.enabled"; static { // Manually create role order since there really isn't any mechanism for this @@ -1995,6 +1996,14 @@ public class UpgradeCatalog240 extends AbstractUpgradeCatalog { "30000".equals(amsHbaseSiteProperties.get(HBASE_RPC_TIMEOUT_PROPERTY))) { newProperties.put(HBASE_RPC_TIMEOUT_PROPERTY, String.valueOf(300000)); } + + if(amsHbaseSiteProperties.containsKey(AMS_HBASE_SITE_NORMALIZER_ENABLED_PROPERTY) && + "true".equals(amsHbaseSiteProperties.get(AMS_HBASE_SITE_NORMALIZER_ENABLED_PROPERTY))) { + LOG.info("Disabling " + AMS_HBASE_SITE_NORMALIZER_ENABLED_PROPERTY); + newProperties.put(AMS_HBASE_SITE_NORMALIZER_ENABLED_PROPERTY, String.valueOf(false)); + } + + updateConfigurationPropertiesForCluster(cluster, AMS_HBASE_SITE, newProperties, true, true); } http://git-wip-us.apache.org/repos/asf/ambari/blob/f6450ab7/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/configuration/ams-hbase-site.xml ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/configuration/ams-hbase-site.xml b/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/configuration/ams-hbase-site.xml index a7d8228..bf62b8e 100644 --- a/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/configuration/ams-hbase-site.xml +++ b/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/configuration/ams-hbase-site.xml @@ -168,7 +168,7 @@ </property> <property> <name>hbase.normalizer.enabled</name> - <value>true</value> + <value>false</value> <description>If set to true, Master will try to keep region size within each table approximately the same.</description> <on-ambari-upgrade add="true"/> http://git-wip-us.apache.org/repos/asf/ambari/blob/f6450ab7/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/scripts/split_points.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/scripts/split_points.py b/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/scripts/split_points.py index fa4deaf..aa03d197 100644 --- a/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/scripts/split_points.py +++ b/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/scripts/split_points.py @@ -27,7 +27,7 @@ import ast metric_filename_ext = '.txt' # 5 regions for higher order aggregate tables -other_region_static_count = 6 +other_region_static_count = 5 # Max equidistant points to return per service max_equidistant_points = 50 @@ -96,21 +96,22 @@ class FindSplitPointsForAMSRegions(): if self.mode == 'distributed': xmx_bytes = xmx_region_bytes - memstore_max_mem = float(self.ams_hbase_site['hbase.regionserver.global.memstore.lowerLimit']) * xmx_bytes + memstore_max_mem = float(self.ams_hbase_site['hbase.regionserver.global.memstore.upperLimit']) * xmx_bytes memstore_flush_size = format_Xmx_size_to_bytes(self.ams_hbase_site['hbase.hregion.memstore.flush.size']) max_inmemory_regions = (memstore_max_mem / memstore_flush_size) - other_region_static_count print 'max_inmemory_regions: %s' % max_inmemory_regions if max_inmemory_regions > 2: - # Lets say total = 12, so we have 7 regions to allocate between - # METRIC_RECORD and METRIC_AGGREGATE tables, desired = (5, 2) - self.desired_precision_region_count = int(math.floor(0.8 * max_inmemory_regions)) - self.desired_aggregate_region_count = int(max_inmemory_regions - self.desired_precision_region_count) + # Lets say total = 25, so we have 20 regions to allocate between + # METRIC_RECORD, METRIC_AGGREGATE & METRIC_RECORD_MINUTE tables, desired = (14, 3, 3) + # 70 % to METRIC_RECORD + self.desired_precision_region_count = max(2, int(math.floor(0.70 * max_inmemory_regions))) + # 15% each to METRIC_AGGREGATE & METRIC_RECORD_MINUTE + self.desired_aggregate_region_count = max(2, int(math.floor(0.15 * max_inmemory_regions))) else: - self.desired_precision_region_count = 1 - self.desired_aggregate_region_count = 1 - + self.desired_precision_region_count = 2 + self.desired_aggregate_region_count = 2 except: print('Bad config settings, could not calculate max regions available.') pass http://git-wip-us.apache.org/repos/asf/ambari/blob/f6450ab7/ambari-server/src/test/java/org/apache/ambari/server/upgrade/UpgradeCatalog240Test.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/java/org/apache/ambari/server/upgrade/UpgradeCatalog240Test.java b/ambari-server/src/test/java/org/apache/ambari/server/upgrade/UpgradeCatalog240Test.java index a2d6287..a4683ed 100644 --- a/ambari-server/src/test/java/org/apache/ambari/server/upgrade/UpgradeCatalog240Test.java +++ b/ambari-server/src/test/java/org/apache/ambari/server/upgrade/UpgradeCatalog240Test.java @@ -55,6 +55,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Random; import javax.persistence.EntityManager; @@ -1387,11 +1388,13 @@ public class UpgradeCatalog240Test { Map<String, String> oldPropertiesAmsHbaseSite = new HashMap<String, String>() { { put("hbase.rpc.timeout", "30000"); + put("hbase.normalizer.enabled", String.valueOf(true)); } }; Map<String, String> newPropertiesAmsHbaseSite = new HashMap<String, String>() { { put("hbase.rpc.timeout", "300000"); + put("hbase.normalizer.enabled", String.valueOf(false)); } }; EasyMockSupport easyMockSupport = new EasyMockSupport(); http://git-wip-us.apache.org/repos/asf/ambari/blob/f6450ab7/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py b/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py index cf5918a..c6c27c2 100644 --- a/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py +++ b/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py @@ -2235,8 +2235,8 @@ class TestHDP22StackAdvisor(TestCase): }, "ams-site": { "properties": { - "timeline.metrics.cluster.aggregate.splitpoints": " ", - "timeline.metrics.host.aggregate.splitpoints": " ", + "timeline.metrics.cluster.aggregate.splitpoints": "master.FileSystem.MetaHlogSplitTime_75th_percentile", + "timeline.metrics.host.aggregate.splitpoints": "master.FileSystem.MetaHlogSplitTime_75th_percentile", "timeline.metrics.host.aggregator.ttl": "86400", "timeline.metrics.service.handler.thread.count": "20", 'timeline.metrics.service.webapp.address': 'host1:6188', @@ -2344,6 +2344,8 @@ class TestHDP22StackAdvisor(TestCase): } ] + expected["ams-site"]['properties']['timeline.metrics.host.aggregate.splitpoints'] = 'master.Server.numDeadRegionServers' + expected["ams-site"]['properties']['timeline.metrics.cluster.aggregate.splitpoints'] = 'master.Server.numDeadRegionServers' expected["ams-hbase-env"]['properties']['hbase_master_heapsize'] = '2432' expected["ams-hbase-env"]['properties']['hbase_master_xmn_size'] = '512' expected["ams-env"]['properties']['metrics_collector_heapsize'] = '640' @@ -2369,11 +2371,11 @@ class TestHDP22StackAdvisor(TestCase): # Embedded mode, 512m master heapsize, no splitpoints recommended services["configurations"]['ams-hbase-env']['properties']['hbase_master_heapsize'] = '512' - services["configurations"]['ams-hbase-site']['properties']['hbase.regionserver.global.memstore.lowerLimit'] = '0.3' + services["configurations"]['ams-hbase-site']['properties']['hbase.regionserver.global.memstore.upperLimit'] = '0.4' services["configurations"]['ams-hbase-site']['properties']['hbase.hregion.memstore.flush.size'] = '134217728' - expected['ams-site']['properties']['timeline.metrics.host.aggregate.splitpoints'] = ' ' - expected['ams-site']['properties']['timeline.metrics.cluster.aggregate.splitpoints'] = ' ' + expected['ams-site']['properties']['timeline.metrics.host.aggregate.splitpoints'] = 'master.Server.numDeadRegionServers' + expected['ams-site']['properties']['timeline.metrics.cluster.aggregate.splitpoints'] = 'master.Server.numDeadRegionServers' expected['ams-hbase-env']['properties']['hbase_master_heapsize'] = '512' self.stackAdvisor.recommendAmsConfigurations(configurations, clusterData, services, hosts) @@ -2381,9 +2383,11 @@ class TestHDP22StackAdvisor(TestCase): # Embedded mode, 4096m master heapsize, some splitpoints recommended services["configurations"]['ams-hbase-env']['properties']['hbase_master_heapsize'] = '4096' - expected['ams-site']['properties']['timeline.metrics.host.aggregate.splitpoints'] = \ - 'master.Server.numDeadRegionServers' - expected['ams-site']['properties']['timeline.metrics.cluster.aggregate.splitpoints'] = ' ' + expected['ams-site']['properties']['timeline.metrics.host.aggregate.splitpoints'] = 'dfs.datanode.WriteBlockOpNumOps,' \ + 'mapred.ShuffleMetrics.ShuffleOutputsFailed,' \ + 'read_bps,' \ + 'rpcdetailed.rpcdetailed.GetContainerStatusesAvgTime' + expected['ams-site']['properties']['timeline.metrics.cluster.aggregate.splitpoints'] = 'master.Server.numDeadRegionServers' expected['ams-hbase-env']['properties']['hbase_master_heapsize'] = '4096' self.stackAdvisor.recommendAmsConfigurations(configurations, clusterData, services, hosts) self.assertEquals(configurations, expected) @@ -2392,7 +2396,7 @@ class TestHDP22StackAdvisor(TestCase): services["configurations"]['ams-hbase-env']['properties']['hbase_master_heapsize'] = '8192' expected['ams-hbase-env']['properties']['hbase_master_heapsize'] = '8192' self.stackAdvisor.recommendAmsConfigurations(configurations, clusterData, services, hosts) - self.assertEquals(len(configurations['ams-site']['properties']['timeline.metrics.host.aggregate.splitpoints'].split(',')), 9) + self.assertEquals(len(configurations['ams-site']['properties']['timeline.metrics.host.aggregate.splitpoints'].split(',')), 13) self.assertEquals(len(configurations['ams-site']['properties']['timeline.metrics.cluster.aggregate.splitpoints'].split(',')), 2) # Test splitpoints, AMS distributed mode @@ -2413,8 +2417,8 @@ class TestHDP22StackAdvisor(TestCase): # Distributed mode, low memory, no splitpoints recommended services["configurations"]['ams-hbase-env']['properties']['hbase_regionserver_heapsize'] = '512' - expected['ams-site']['properties']['timeline.metrics.host.aggregate.splitpoints'] = ' ' - expected['ams-site']['properties']['timeline.metrics.cluster.aggregate.splitpoints'] = ' ' + expected['ams-site']['properties']['timeline.metrics.host.aggregate.splitpoints'] = 'master.Server.numDeadRegionServers' + expected['ams-site']['properties']['timeline.metrics.cluster.aggregate.splitpoints'] = 'master.Server.numDeadRegionServers' expected['ams-hbase-env']['properties']['hbase_regionserver_heapsize'] = '512' expected["ams-hbase-env"]['properties']['hbase_master_xmn_size'] = '102' expected['ams-hbase-env']['properties']['regionserver_xmn_size'] = '384' @@ -2427,7 +2431,7 @@ class TestHDP22StackAdvisor(TestCase): services["configurations"]['ams-hbase-env']['properties']['hbase_regionserver_heapsize'] = '8192' expected['ams-hbase-env']['properties']['hbase_regionserver_heapsize'] = '8192' self.stackAdvisor.recommendAmsConfigurations(configurations, clusterData, services, hosts) - self.assertEquals(len(configurations['ams-site']['properties']['timeline.metrics.host.aggregate.splitpoints'].split(',')), 9) + self.assertEquals(len(configurations['ams-site']['properties']['timeline.metrics.host.aggregate.splitpoints'].split(',')), 13) self.assertEquals(len(configurations['ams-site']['properties']['timeline.metrics.cluster.aggregate.splitpoints'].split(',')), 2) def test_recommendHbaseConfigurations(self):
