Hi Andrew, Lars,

Andrew Beekhof wrote:
> I'd like to see the while-block from native_color() be a function that
> is called from native_assign_node().
It seems that native_assign_node() is too late a point to filter out the nodes
without enough capacity. I wrote a have_enough_capacity() function, which is
called from native_choose_node(), to achieve that.

> And instead of a limit-utilization option, we'd have
> placement-strategy=(default|utilization|minimal)
Done. And added a "balanced" option as Lars advised.

> 
> Default ::= what we do now
> Utilization ::= what you've implemented
> Minimal ::= what you've implemented _without_ the load balancing we
> currently do.
> 
> (Maybe the names could be improved, but hopefully you get the idea).
> 
> The last one is interesting because it allows us to concentrate
> services on the minimum number of required nodes (and potentially
> power some of the others down).
Done.

Minimal:
Consider the utilization of nodes and resources. However, if a resource has
the same score for several available nodes, do _not_ balance the load.
This means that the resources will be concentrated on a minimal number of
nodes.

Balanced:
Consider the utilization of nodes and resources. If a resource has
the same score for several available nodes:
* First, balance the load according to the remaining capacity of nodes
(implemented in compare_capacity()).
* If the nodes still have equal remaining capacity, then balance
the load according to the number of resources that the nodes will run.

The strategies are implemented mainly in sort_node_weight(), so I changed the
prototypes of some functions a bit (to pass data_set through to it).

Please help review and test it. Any comments and suggestions are welcome :-)

Thanks,
  Yan

-- 
y...@novell.com
Software Engineer
China Server Team, OPS Engineering

Novell, Inc.
Making IT Work As One™
diff -r f49a0cab20aa include/crm/msg_xml.h
--- a/include/crm/msg_xml.h	Thu Nov 12 12:18:10 2009 +0100
+++ b/include/crm/msg_xml.h	Fri Nov 13 14:08:16 2009 +0800
@@ -130,6 +130,7 @@
 #define XML_TAG_ATTRS			"attributes"
 #define XML_TAG_PARAMS			"parameters"
 #define XML_TAG_PARAM			"param"
+#define XML_TAG_UTILIZATION		"utilization"
 
 #define XML_TAG_RESOURCE_REF		"resource_ref"
 #define XML_CIB_TAG_RESOURCE	  	"primitive"
diff -r f49a0cab20aa include/crm/pengine/status.h
--- a/include/crm/pengine/status.h	Thu Nov 12 12:18:10 2009 +0100
+++ b/include/crm/pengine/status.h	Fri Nov 13 14:08:16 2009 +0800
@@ -68,6 +68,7 @@
 		char *dc_uuid;
 		node_t *dc_node;
 		const char *stonith_action;
+		const char *placement_strategy;
 
 		unsigned long long flags;
 
@@ -116,6 +117,8 @@
 		
 		GHashTable *attrs;	/* char* => char* */
 		enum node_type type;
+
+		GHashTable *utilization;
 }; 
 
 struct node_s { 
@@ -186,6 +189,7 @@
 
 		GHashTable *meta;	   
 		GHashTable *parameters;
+		GHashTable *utilization;
 
 		GListPtr children;	  /* resource_t* */	
 };
diff -r f49a0cab20aa lib/pengine/common.c
--- a/lib/pengine/common.c	Thu Nov 12 12:18:10 2009 +0100
+++ b/lib/pengine/common.c	Fri Nov 13 14:08:16 2009 +0800
@@ -80,6 +80,24 @@
 	return FALSE;
 }
 
+static gboolean
+check_placement_strategy(const char *value) /* validator for "placement-strategy" */
+{
+	if(safe_str_eq(value, "default")) {
+		return TRUE;
+
+	} else if(safe_str_eq(value, "utilization")) {
+		return TRUE;
+
+	} else if(safe_str_eq(value, "minimal")) {
+		return TRUE;
+
+	} else if(safe_str_eq(value, "balanced")) {
+		return TRUE;
+	}
+	return FALSE; /* any other value is not a known strategy */
+}
+
 pe_cluster_option pe_opts[] = {
 	/* name, old-name, validate, default, description */
 	{ "no-quorum-policy", "no_quorum_policy", "enum", "stop, freeze, ignore, suicide", "stop", &check_quorum,
@@ -147,6 +165,10 @@
 	{ "node-health-red", NULL, "integer", NULL, "-INFINITY", &check_number,
 	  "The score 'red' translates to in rsc_location constraints",
 	  "Only used when node-health-strategy is set to custom or progressive." },
+
+	/*Placement Strategy*/
+	{ "placement-strategy", NULL, "enum", "default, utilization, minimal, balanced", "default", &check_placement_strategy,
+	  "The strategy to determine resource placement", NULL},
 };
 
 void
diff -r f49a0cab20aa lib/pengine/complex.c
--- a/lib/pengine/complex.c	Thu Nov 12 12:18:10 2009 +0100
+++ b/lib/pengine/complex.c	Fri Nov 13 14:08:16 2009 +0800
@@ -371,6 +371,12 @@
 	if(safe_str_eq(class, "stonith")) {
 	    set_bit_inplace(data_set->flags, pe_flag_have_stonith_resource);
 	}
+
+	(*rsc)->utilization = g_hash_table_new_full(
+		g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str);
+
+	unpack_instance_attributes(data_set->input, (*rsc)->xml, XML_TAG_UTILIZATION, NULL,
+				   (*rsc)->utilization, NULL, FALSE, data_set->now);
 	
 /* 	data_set->resources = g_list_append(data_set->resources, (*rsc)); */
 	return TRUE;
@@ -451,6 +457,9 @@
 	if(rsc->meta != NULL) {
 		g_hash_table_destroy(rsc->meta);
 	}
+	if(rsc->utilization != NULL) {
+		g_hash_table_destroy(rsc->utilization);
+	}
 	if(rsc->parent == NULL && is_set(rsc->flags, pe_rsc_orphan)) {
 		free_xml(rsc->xml);
 	}
diff -r f49a0cab20aa lib/pengine/status.c
--- a/lib/pengine/status.c	Thu Nov 12 12:18:10 2009 +0100
+++ b/lib/pengine/status.c	Fri Nov 13 14:08:16 2009 +0800
@@ -159,6 +159,9 @@
 			if(details->attrs != NULL) {
 				g_hash_table_destroy(details->attrs);
 			}
+			if(details->utilization != NULL) {
+				g_hash_table_destroy(details->utilization);
+			}
 			pe_free_shallow_adv(details->running_rsc, FALSE);
 			pe_free_shallow_adv(details->allocated_rsc, FALSE);
 			crm_free(details);
diff -r f49a0cab20aa lib/pengine/unpack.c
--- a/lib/pengine/unpack.c	Thu Nov 12 12:18:10 2009 +0100
+++ b/lib/pengine/unpack.c	Fri Nov 13 14:08:16 2009 +0800
@@ -165,6 +165,9 @@
 	crm_info("Node scores: 'red' = %s, 'yellow' = %s, 'green' = %s",
 		 score2char(node_score_red),score2char(node_score_yellow),
 		 score2char(node_score_green));
+
+	data_set->placement_strategy = pe_pref(data_set->config_hash, "placement-strategy");
+	crm_debug_2("Placement strategy: %s", data_set->placement_strategy);	
 	
 	return TRUE;
 }
@@ -233,6 +236,9 @@
 		new_node->details->attrs        = g_hash_table_new_full(
 			g_str_hash, g_str_equal,
 			g_hash_destroy_str, g_hash_destroy_str);
+		new_node->details->utilization  = g_hash_table_new_full(
+			g_str_hash, g_str_equal,
+			g_hash_destroy_str, g_hash_destroy_str);
 		
 /* 		if(data_set->have_quorum == FALSE */
 /* 		   && data_set->no_quorum_policy == no_quorum_stop) { */
@@ -258,6 +264,10 @@
 		}
 
 		add_node_attrs(xml_obj, new_node, FALSE, data_set);
+		unpack_instance_attributes(
+			data_set->input, xml_obj, XML_TAG_UTILIZATION, NULL,
+			new_node->details->utilization, NULL, FALSE, data_set->now);
+
 		data_set->nodes = g_list_append(data_set->nodes, new_node);    
 		crm_debug_3("Done with node %s",
 			    crm_element_value(xml_obj, XML_ATTR_UNAME));
diff -r f49a0cab20aa pengine/clone.c
--- a/pengine/clone.c	Thu Nov 12 12:18:10 2009 +0100
+++ b/pengine/clone.c	Fri Nov 13 14:08:16 2009 +0800
@@ -26,7 +26,7 @@
 #define VARIANT_CLONE 1
 #include <lib/pengine/variant.h>
 
-gint sort_clone_instance(gconstpointer a, gconstpointer b);
+gint sort_clone_instance(gconstpointer a, gconstpointer b, gpointer data_set);
 
 void child_stopping_constraints(
 	clone_variant_data_t *clone_data, 
@@ -65,7 +65,7 @@
 }
 
 
-gint sort_clone_instance(gconstpointer a, gconstpointer b)
+gint sort_clone_instance(gconstpointer a, gconstpointer b, gpointer data_set)
 {
 	int level = LOG_DEBUG_3;
 	node_t *node1 = NULL;
@@ -201,8 +201,8 @@
 	    GListPtr list1 = node_list_dup(resource1->allowed_nodes, FALSE, FALSE);
 	    GListPtr list2 = node_list_dup(resource2->allowed_nodes, FALSE, FALSE);
 	    
-	    list1 = g_list_sort(list1, sort_node_weight);
-	    list2 = g_list_sort(list2, sort_node_weight);
+	    list1 = g_list_sort_with_data(list1, sort_node_weight, data_set);
+	    list2 = g_list_sort_with_data(list2, sort_node_weight, data_set);
 	    max = g_list_length(list1);
 	    if(max < g_list_length(list2)) {
 		max = g_list_length(list2);
@@ -275,8 +275,8 @@
 		    constraint->score/INFINITY, FALSE);
 		);    
 
-	    list1 = g_list_sort(list1, sort_node_weight);
-	    list2 = g_list_sort(list2, sort_node_weight);
+	    list1 = g_list_sort_with_data(list1, sort_node_weight, data_set);
+	    list2 = g_list_sort_with_data(list2, sort_node_weight, data_set);
 	    max = g_list_length(list1);
 	    if(max < g_list_length(list2)) {
 		max = g_list_length(list2);
@@ -457,15 +457,15 @@
 		   }
 		);
 	
-	rsc->children = g_list_sort(rsc->children, sort_clone_instance);
+	rsc->children = g_list_sort_with_data(rsc->children, sort_clone_instance, data_set);
 
 	/* count now tracks the number of clones we have allocated */
 	slist_iter(node, node_t, rsc->allowed_nodes, lpc,
 		   node->count = 0;
 		);
 
-	rsc->allowed_nodes = g_list_sort(
-		rsc->allowed_nodes, sort_node_weight);
+	rsc->allowed_nodes = g_list_sort_with_data(
+		rsc->allowed_nodes, sort_node_weight, data_set);
 
 	slist_iter(node, node_t, rsc->allowed_nodes, lpc,
 		   if(can_run_resources(node)) {
diff -r f49a0cab20aa pengine/master.c
--- a/pengine/master.c	Thu Nov 12 12:18:10 2009 +0100
+++ b/pengine/master.c	Fri Nov 13 14:08:16 2009 +0800
@@ -26,7 +26,7 @@
 #define VARIANT_CLONE 1
 #include <lib/pengine/variant.h>
 
-extern gint sort_clone_instance(gconstpointer a, gconstpointer b);
+extern gint sort_clone_instance(gconstpointer a, gconstpointer b, gpointer data_set);
 
 extern int master_score(resource_t *rsc, node_t *node, int not_set_value);
 
@@ -227,7 +227,7 @@
 	return NULL;
 }
 
-static gint sort_master_instance(gconstpointer a, gconstpointer b)
+static gint sort_master_instance(gconstpointer a, gconstpointer b, gpointer data_set)
 {
 	int rc;
 	enum rsc_role_e role1 = RSC_ROLE_UNKNOWN;
@@ -254,10 +254,10 @@
 		return 1;
 	}
 	
-	return sort_clone_instance(a, b);
+	return sort_clone_instance(a, b, data_set);
 }
 
-static void master_promotion_order(resource_t *rsc) 
+static void master_promotion_order(resource_t *rsc, pe_working_set_t *data_set) 
 {
     node_t *node = NULL;
     node_t *chosen = NULL;
@@ -340,7 +340,7 @@
 	crm_debug_2("%s: %d", child->id, child->sort_index);
 	);
 
-    rsc->children = g_list_sort(rsc->children, sort_master_instance);
+    rsc->children = g_list_sort_with_data(rsc->children, sort_master_instance, data_set);
 }
 
 int
@@ -591,7 +591,7 @@
 
 	    );
 
-	master_promotion_order(rsc);
+	master_promotion_order(rsc, data_set);
 
 	/* mark the first N as masters */
 	slist_iter(
diff -r f49a0cab20aa pengine/native.c
--- a/pengine/native.c	Thu Nov 12 12:18:10 2009 +0100
+++ b/pengine/native.c	Fri Nov 13 14:08:16 2009 +0800
@@ -75,7 +75,31 @@
 
 
 static gboolean
-native_choose_node(resource_t *rsc)
+have_enough_capacity(node_t *node, resource_t *rsc) /* FALSE if rsc requires more of any attribute than node has remaining */
+{
+	GHashTableIter iter;
+	const char *key = NULL;
+	const char *value = NULL;
+	int required = 0;
+	int remaining = 0;
+	int rc = TRUE;
+
+	g_hash_table_iter_init(&iter, rsc->utilization); /* walk every utilization requirement of the resource */
+	while (g_hash_table_iter_next(&iter, (gpointer)&key, (gpointer)&value)) {
+		required = crm_parse_int(value, "0");
+		remaining = crm_parse_int(g_hash_table_lookup(node->details->utilization, key), "0"); /* NOTE(review): an attribute absent on the node presumably parses as 0 - confirm */
+		
+		if (required > remaining) {
+			crm_debug("Node %s has no enough %s for resource %s: required=%d remaining=%d",
+				node->details->uname, key, rsc->id, required, remaining);
+			rc = FALSE; /* no break: keep iterating so every shortfall is logged */
+		}
+	}
+	return rc;
+}
+
+static gboolean
+native_choose_node(resource_t *rsc, pe_working_set_t *data_set)
 {
 	/*
 	  1. Sort by weight
@@ -83,12 +107,28 @@
 				   with the fewest resources
 	  3. remove color.chosen_node from all other colors
 	*/
+	int alloc_details = scores_log_level+1;
+
 	GListPtr nodes = NULL;
 	node_t *chosen = NULL;
 
 	int lpc = 0;
 	int multiple = 0;
-	int length = g_list_length(rsc->allowed_nodes);
+	int length = 0;
+
+	if (safe_str_neq(data_set->placement_strategy, "default")) {
+		slist_iter(
+			node, node_t, data_set->nodes, lpc,
+			if (have_enough_capacity(node, rsc) == FALSE) {
+				crm_debug("Resource %s cannot be allocated to node %s: none of enough capacity",
+					rsc->id, node->details->uname);
+				resource_location(rsc, node, -INFINITY, "__limit_utilization_", data_set);
+			}
+	    	    );
+		dump_node_scores(alloc_details, rsc, "Post-utilization", rsc->allowed_nodes);
+	}
+	
+	length = g_list_length(rsc->allowed_nodes);
 
 	if(is_not_set(rsc->flags, pe_rsc_provisional)) {
 		return rsc->allocated_to?TRUE:FALSE;
@@ -98,7 +138,7 @@
 		    rsc->id, length);
 
 	if(rsc->allowed_nodes) {
-	    rsc->allowed_nodes = g_list_sort(rsc->allowed_nodes, sort_node_weight);
+	    rsc->allowed_nodes = g_list_sort_with_data(rsc->allowed_nodes, sort_node_weight, data_set);
 	    nodes = rsc->allowed_nodes;
 	    chosen = g_list_nth_data(nodes, 0);
 
@@ -327,7 +367,7 @@
 	    native_assign_node(rsc, NULL, NULL, TRUE);
 
 	} else if(is_set(rsc->flags, pe_rsc_provisional)
-	   && native_choose_node(rsc) ) {
+	   && native_choose_node(rsc, data_set) ) {
 		crm_debug_3("Allocated resource %s to %s",
 			    rsc->id, rsc->allocated_to->details->uname);
 
diff -r f49a0cab20aa pengine/utils.c
--- a/pengine/utils.c	Thu Nov 12 12:18:10 2009 +0100
+++ b/pengine/utils.c	Fri Nov 13 14:08:16 2009 +0800
@@ -189,17 +189,65 @@
 	return TRUE;
 }
 
+
+/* Per-attribute tally of remaining capacity (a count of wins, not a sum of amounts):
+ * rc < 0 if 'node1' has more remaining on more attributes, rc > 0 if 'node2' does, 0 on a tie
+ */
+static int
+compare_capacity(const node_t *node1, const node_t *node2)
+{
+	GHashTableIter iter;
+	const char *key = NULL;
+	const char *value = NULL;
+	int node1_capacity = 0;
+	int node2_capacity = 0;
+	int result = 0;
+
+	g_hash_table_iter_init(&iter, node1->details->utilization); /* pass 1: every attribute node1 defines */
+	while (g_hash_table_iter_next(&iter, (gpointer)&key, (gpointer)&value)) {
+		node1_capacity = crm_parse_int(value, "0");
+		node2_capacity = crm_parse_int(g_hash_table_lookup(node2->details->utilization, key), "0"); /* NOTE(review): absent on node2 presumably parses as 0 - confirm */
+
+		if (node1_capacity > node2_capacity) {
+			result += -1;
+		} else if (node1_capacity < node2_capacity) {
+			result += 1;
+		}
+	}
+
+	g_hash_table_iter_init(&iter, node2->details->utilization); /* pass 2: attributes only node2 defines */
+	while (g_hash_table_iter_next(&iter, (gpointer)&key, (gpointer)&value)) {
+		if (g_hash_table_lookup_extended(node1->details->utilization, key, NULL, NULL)) {
+			continue; /* already counted in pass 1 */
+		}
+
+		node1_capacity = 0; /* node1 lacks this attribute, so it is treated as 0 */
+		node2_capacity = crm_parse_int(value, "0");
+
+		if (node1_capacity > node2_capacity) {
+			result += -1;
+		} else if (node1_capacity < node2_capacity) {
+			result += 1;
+		}
+	}
+
+	return result;
+}
+
 /* return -1 if 'a' is more preferred
  * return  1 if 'b' is more preferred
  */
-gint sort_node_weight(gconstpointer a, gconstpointer b)
+gint sort_node_weight(gconstpointer a, gconstpointer b, gpointer data)
 {
 	int level = LOG_DEBUG_3;
 	const node_t *node1 = (const node_t*)a;
 	const node_t *node2 = (const node_t*)b;
+	const pe_working_set_t *data_set = (const pe_working_set_t*)data;
 
 	int node1_weight = 0;
 	int node2_weight = 0;
+
+	int result = 0;
 	
 	if(a == NULL) { return 1; }
 	if(b == NULL) { return -1; }
@@ -231,6 +279,17 @@
 	do_crm_log_unlikely(level, "%s (%d) == %s (%d) : weight",
 		    node1->details->uname, node1_weight,
 		    node2->details->uname, node2_weight);
+
+	if (safe_str_eq(data_set->placement_strategy, "minimal")) {
+		goto equal;
+	}
+
+	if (safe_str_eq(data_set->placement_strategy, "balanced")) {
+		result = compare_capacity(node1, node2);
+		if (result != 0) {
+			return result;
+		}
+	}
 	
 	/* now try to balance resources across the cluster */
 	if(node1->details->num_resources
@@ -248,10 +307,36 @@
 		return 1;
 	}
 	
+equal:	
 	do_crm_log_unlikely(level, "%s = %s", node1->details->uname, node2->details->uname);
 	return 0;
 }
 
+/* Adjust the remaining capacity of 'node' by the utilization requirements of 'rsc':
+ * 'allocate' == TRUE subtracts them (allocating), FALSE adds them back (deallocating)
+ */
+static void
+calculate_utilization(node_t *node, resource_t *rsc, gboolean allocate)
+{
+	GHashTableIter iter;
+	const char *key = NULL;
+	const char *value = NULL;
+	const char *capacity = NULL;
+	char *remain_capacity = NULL;
+
+	g_hash_table_iter_init(&iter, rsc->utilization); /* iterates the resource's table; only the node's table is modified below */
+	while (g_hash_table_iter_next(&iter, (gpointer)&key, (gpointer)&value)) {
+		capacity = g_hash_table_lookup(node->details->utilization, key);
+		if (capacity) { /* attributes the node does not define are left untouched */
+			if (allocate) {
+				remain_capacity = crm_itoa(crm_parse_int(capacity, "0") - crm_parse_int(value, "0"));
+			} else {
+				remain_capacity = crm_itoa(crm_parse_int(capacity, "0") + crm_parse_int(value, "0"));
+			}
+			g_hash_table_replace(node->details->utilization, crm_strdup(key), remain_capacity); /* NOTE(review): presumably the table takes ownership of both strings - confirm its destroy functions */
+		}
+	}
+}
 
 gboolean
 native_assign_node(resource_t *rsc, GListPtr nodes, node_t *chosen, gboolean force)
@@ -284,6 +369,7 @@
 			old->details->allocated_rsc, rsc);
 		old->details->num_resources--;
 		old->count--;
+		calculate_utilization(old, rsc, FALSE);
 	}
 	
 	crm_debug("Assigning %s to %s", chosen->details->uname, rsc->id);
@@ -293,6 +379,7 @@
 	chosen->details->allocated_rsc = g_list_append(chosen->details->allocated_rsc, rsc);
 	chosen->details->num_resources++;
 	chosen->count++;
+	calculate_utilization(chosen, rsc, TRUE);
 
 	return TRUE;
 }
diff -r f49a0cab20aa pengine/utils.h
--- a/pengine/utils.h	Thu Nov 12 12:18:10 2009 +0100
+++ b/pengine/utils.h	Fri Nov 13 14:08:16 2009 +0800
@@ -47,7 +47,7 @@
 extern rsc_to_node_t *generate_location_rule(
 	resource_t *rsc, xmlNode *location_rule, pe_working_set_t *data_set);
 
-extern gint sort_node_weight(gconstpointer a, gconstpointer b);
+extern gint sort_node_weight(gconstpointer a, gconstpointer b, gpointer data_set);
 
 extern gboolean can_run_resources(const node_t *node);
 extern gboolean native_assign_node(resource_t *rsc, GListPtr candidates, node_t *chosen, gboolean force);
diff -r f49a0cab20aa xml/pacemaker.rng.in
--- a/xml/pacemaker.rng.in	Thu Nov 12 12:18:10 2009 +0100
+++ b/xml/pacemaker.rng.in	Fri Nov 13 14:08:16 2009 +0800
@@ -104,9 +104,14 @@
 	    <attribute name="description"><text/></attribute>
 	  </optional>
 	  <zeroOrMore>
-	    <element name="instance_attributes">
-	      <externalRef href="nvset-@CRM_DTD_VERSION@.rng"/>
-	    </element>
+	    <choice>
+	      <element name="instance_attributes">
+	        <externalRef href="nvset-@CRM_DTD_VERSION@.rng"/>
+	      </element>
+	      <element name="utilization">
+	        <externalRef href="nvset-@CRM_DTD_VERSION@.rng"/>
+	      </element>
+	    </choice>
 	  </zeroOrMore>
 	</element>
       </zeroOrMore>
diff -r f49a0cab20aa xml/resources.rng.in
--- a/xml/resources.rng.in	Thu Nov 12 12:18:10 2009 +0100
+++ b/xml/resources.rng.in	Fri Nov 13 14:08:16 2009 +0800
@@ -39,6 +39,11 @@
 	</optional>
 	<ref name="element-resource-extra"/>
 	<ref name="element-operations"/>
+      	<zeroOrMore>
+	  <element name="utilization">
+	    <externalRef href="nvset-@CRM_DTD_VERSION@.rng"/>
+	  </element>
+      	</zeroOrMore>
       </interleave>
     </element>
   </define>
_______________________________________________
Pacemaker mailing list
Pacemaker@oss.clusterlabs.org
http://oss.clusterlabs.org/mailman/listinfo/pacemaker

Reply via email to