Under overload conditions with thousands of checkpoints and an exiting
process, it is possible that many simultaneous checkpoint expirations
will trigger an assertion failure.

This patch resolves that issue.

Regards
-steve
Index: exec/ckpt.c
===================================================================
--- exec/ckpt.c	(revision 1641)
+++ exec/ckpt.c	(working copy)
@@ -159,6 +159,7 @@
 
 struct checkpoint {
 	struct list_head list;
+	struct list_head expiry_list;
 	mar_name_t name;
 	mar_uint32_t ckpt_id;
 	mar_ckpt_checkpoint_creation_attributes_t checkpoint_creation_attributes;
@@ -370,6 +371,8 @@
 
 DECLARE_LIST_INIT(checkpoint_recovery_list_head);
 
+DECLARE_LIST_INIT(my_checkpoint_expiry_list_head);
+
 static mar_uint32_t global_ckpt_id = 0;
 
 static enum sync_state my_sync_state;
@@ -386,6 +389,10 @@
 
 static unsigned int my_should_sync = 0;
 
+static unsigned int my_token_callback_active = 0;
+
+static void * my_token_callback_handle;
+
 struct checkpoint_cleanup {
 	struct list_head list;
 	mar_name_t checkpoint_name;
@@ -1017,9 +1024,7 @@
 	iovec.iov_base = (char *)&req_exec_ckpt_checkpointclose;
 	iovec.iov_len = sizeof (req_exec_ckpt_checkpointclose);
 
-	assert (totempg_groups_mcast_joined (openais_group_handle, &iovec, 1, TOTEMPG_AGREED) == 0);
-
-	return (-1);
+	return (totempg_groups_mcast_joined (openais_group_handle, &iovec, 1,			TOTEMPG_AGREED));
 }
 
 static int ckpt_exec_init_fn (struct objdb_iface_ver0 *objdb)
@@ -1261,6 +1266,7 @@
 		checkpoint->unlinked = 0;
 		list_init (&checkpoint->list);
 		list_init (&checkpoint->sections_list_head);
+		list_init (&checkpoint->expiry_list);
 		list_add (&checkpoint->list, &checkpoint_list_head);
 		checkpoint->reference_count = 1;
 		checkpoint->retention_timer = 0;
@@ -1473,31 +1479,69 @@
 
 }
 
-void timer_function_retention (void *data)
+int callback_expiry (enum totem_callback_token_type type, void *data)
 {
 	struct checkpoint *checkpoint = (struct checkpoint *)data;
-	struct req_exec_ckpt_checkpointretentiondurationexpire req_exec_ckpt_checkpointretentiondurationexpire;
+	struct req_exec_ckpt_checkpointunlink req_exec_ckpt_checkpointunlink;
 	struct iovec iovec;
+	unsigned int res;
+	struct list_head *list;
 
-	checkpoint->retention_timer = 0;
-	req_exec_ckpt_checkpointretentiondurationexpire.header.size =
-		sizeof (struct req_exec_ckpt_checkpointretentiondurationexpire);
-	req_exec_ckpt_checkpointretentiondurationexpire.header.id =
-		SERVICE_ID_MAKE (CKPT_SERVICE,
-			MESSAGE_REQ_EXEC_CKPT_CHECKPOINTRETENTIONDURATIONEXPIRE);
+	list = my_checkpoint_expiry_list_head.next;
+	while (!list_empty(&my_checkpoint_expiry_list_head)) {
+		checkpoint = list_entry (list,
+			struct checkpoint, expiry_list);
 
-	memcpy (&req_exec_ckpt_checkpointretentiondurationexpire.checkpoint_name,
-		&checkpoint->name,
-		sizeof (mar_name_t));
-	req_exec_ckpt_checkpointretentiondurationexpire.ckpt_id =
-		checkpoint->ckpt_id;
+		if (checkpoint->reference_count == 0) {
+			req_exec_ckpt_checkpointunlink.header.size =
+				sizeof (struct req_exec_ckpt_checkpointunlink);
+			req_exec_ckpt_checkpointunlink.header.id =
+				SERVICE_ID_MAKE (CKPT_SERVICE,
+					MESSAGE_REQ_EXEC_CKPT_CHECKPOINTUNLINK);
 
-	iovec.iov_base = (char *)&req_exec_ckpt_checkpointretentiondurationexpire;
-	iovec.iov_len = sizeof (req_exec_ckpt_checkpointretentiondurationexpire);
+			req_exec_ckpt_checkpointunlink.source.conn = 0;
+			req_exec_ckpt_checkpointunlink.source.nodeid = 0;
 
-	assert (totempg_groups_mcast_joined (openais_group_handle, &iovec, 1, TOTEMPG_AGREED) == 0);
+			memcpy (&req_exec_ckpt_checkpointunlink.checkpoint_name,
+				&checkpoint->name,
+				sizeof (mar_name_t));
+
+			iovec.iov_base = (char *)&req_exec_ckpt_checkpointunlink;
+			iovec.iov_len = sizeof (req_exec_ckpt_checkpointunlink);
+
+			res = totempg_groups_mcast_joined (openais_group_handle, &iovec, 1, TOTEMPG_AGREED);
+			if (res == -1) {
+				return (-1);
+			}
+			log_printf (LOG_LEVEL_NOTICE,
+				"Expiring checkpoint %s\n",
+				get_mar_name_t (&checkpoint->name));
+		}
+
+		list_del (&checkpoint->expiry_list);
+		list = my_checkpoint_expiry_list_head.next;
+	}
+	my_token_callback_active = 0;
+	return (0);
 }
 
+void timer_function_retention (void *data)
+{
+	struct checkpoint *checkpoint = (struct checkpoint *)data;
+	checkpoint->retention_timer = 0;
+	list_add (&checkpoint->expiry_list, &my_checkpoint_expiry_list_head);
+
+	if (my_token_callback_active == 0) {
+		totempg_callback_token_create (
+			&my_token_callback_handle,
+			TOTEM_CALLBACK_TOKEN_SENT,
+			1,
+			callback_expiry,
+			NULL);
+		my_token_callback_active = 1;
+	}
+}
+
 static void message_handler_req_exec_ckpt_checkpointclose (
 	void *message,
 	unsigned int nodeid)
_______________________________________________
Openais mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/openais

Reply via email to