In my first mail I had the patch attached but I got the message "A non-text 
attachment was scrubbed..." so that's why afterwards I inserted the lines into 
the mail message.
So now, I made the patch with git format-patch. I will still insert the patch 
in the message and also add it as attachment.
Please excuse me for any inconvenience, but it's my first patch for the 
open-source community.

>From 4d0c7dc439430f592e5c6251d91e162d2191277d Mon Sep 17 00:00:00 2001
From: Cristian Patrascu <[email protected]>
Date: Tue, 5 Jul 2011 11:11:34 +0300
Subject: [PATCH] systemd restart retries on failure

---
 src/dbus-service.c  |    4 ++++
 src/load-fragment.c |    1 +
 src/service.c       |   50 +++++++++++++++++++++++++++++++++++++++++++++++++-
 src/service.h       |    3 +++
 src/unit.c          |    4 ++++
 5 files changed, 61 insertions(+), 1 deletions(-)

diff --git a/src/dbus-service.c b/src/dbus-service.c
index 3486623..40e1efe 100644
--- a/src/dbus-service.c
+++ b/src/dbus-service.c
@@ -42,6 +42,8 @@
         "  <property name=\"PIDFile\" type=\"s\" access=\"read\"/>\n"   \
         "  <property name=\"NotifyAccess\" type=\"s\" access=\"read\"/>\n" \
         "  <property name=\"RestartUSec\" type=\"t\" access=\"read\"/>\n" \
+        "  <property name=\"MaxRestartRetries\" type=\"u\" 
access=\"read\"/>\n" \
+        "  <property name=\"RestartRetry\" type=\"u\" access=\"read\"/>\n" \
         "  <property name=\"TimeoutUSec\" type=\"t\" access=\"read\"/>\n" \
         BUS_EXEC_COMMAND_INTERFACE("ExecStartPre")                      \
         BUS_EXEC_COMMAND_INTERFACE("ExecStart")                         \
@@ -103,6 +105,8 @@ DBusHandlerResult bus_service_message_handler(Unit *u, 
DBusConnection *connectio
                 { "org.freedesktop.systemd1.Service", "PIDFile",               
 bus_property_append_string, "s", u->service.pid_file                   },
                 { "org.freedesktop.systemd1.Service", "NotifyAccess",          
 bus_service_append_notify_access, "s", &u->service.notify_access       },
                 { "org.freedesktop.systemd1.Service", "RestartUSec",           
 bus_property_append_usec,   "t", &u->service.restart_usec              },
+                { "org.freedesktop.systemd1.Service", "MaxRestartRetries",     
 bus_property_append_unsigned, "u", &u->service.restart_retries           },
+                { "org.freedesktop.systemd1.Service", "RestartRetry",          
 bus_property_append_unsigned, "u", &u->service.restart_current_retry         },
                 { "org.freedesktop.systemd1.Service", "TimeoutUSec",           
 bus_property_append_usec,   "t", &u->service.timeout_usec              },
                 BUS_EXEC_COMMAND_PROPERTY("org.freedesktop.systemd1.Service", 
u->service.exec_command[SERVICE_EXEC_START_PRE],  "ExecStartPre"),
                 BUS_EXEC_COMMAND_PROPERTY("org.freedesktop.systemd1.Service", 
u->service.exec_command[SERVICE_EXEC_START],      "ExecStart"),
diff --git a/src/load-fragment.c b/src/load-fragment.c
index 56eaed9..7e9efaa 100644
--- a/src/load-fragment.c
+++ b/src/load-fragment.c
@@ -1951,6 +1951,7 @@ static int load_from_path(Unit *u, const char *path) {
                 { "TimeoutSec",             config_parse_usec,            0, 
&u->service.timeout_usec,                        "Service" },
                 { "Type",                   config_parse_service_type,    0, 
&u->service.type,                                "Service" },
                 { "Restart",                config_parse_service_restart, 0, 
&u->service.restart,                             "Service" },
+                { "MaxRestartRetries",      config_parse_unsigned,        0, 
&u->service.restart_retries,                     "Service" },
                 { "PermissionsStartOnly",   config_parse_bool,            0, 
&u->service.permissions_start_only,              "Service" },
                 { "RootDirectoryStartOnly", config_parse_bool,            0, 
&u->service.root_directory_start_only,           "Service" },
                 { "RemainAfterExit",        config_parse_bool,            0, 
&u->service.remain_after_exit,                   "Service" },
diff --git a/src/service.c b/src/service.c
index d59c4cb..b5159a6 100644
--- a/src/service.c
+++ b/src/service.c
@@ -1151,6 +1151,20 @@ static int service_load(Unit *u) {
                 if (s->meta.default_dependencies)
                         if ((r = service_add_default_dependencies(s)) < 0)
                                 return r;
+
+                /* If the option "RestartRetries=" is set then the service 
will be "revived" */
+                if (s->restart_retries > 0) {
+                        /* If the option "Restart=" is not set, then for the 
service to be "revived"
+                           we must set "Restart=" to "on-failure" */
+                        if (s->restart == SERVICE_RESTART_NO)
+                                s->restart = SERVICE_RESTART_ON_FAILURE;
+
+                        /* If the option "Restart=" is already set, then for 
the service to be "revived"
+                           and be restarted on failure we must set "Restart=" 
to "always"
+                           only if it's "on-success" */
+                        else if (s->restart == SERVICE_RESTART_ON_SUCCESS)
+                                s->restart = SERVICE_RESTART_ALWAYS;
+                }
         }
 
         return service_verify(s);
@@ -1488,6 +1502,16 @@ static void service_set_state(Service *s, ServiceState 
state) {
                 log_debug("%s changed %s -> %s", s->meta.id, 
service_state_to_string(old_state), service_state_to_string(state));
 
         unit_notify(UNIT(s), state_translation_table[old_state], 
state_translation_table[state], !s->reload_failure);
+
+        /* If service finished, reset restart-retry counter, if applicable */
+        if (old_state != state &&
+            (state == SERVICE_DEAD || state == SERVICE_FAILED) &&
+            s->restart_current_retry > s->restart_retries) {
+                    log_debug("Service %s entered dead or failed, restart 
retries at max, reset counter (%u -> 0)!",
+                              s->meta.id, s->restart_current_retry);
+                    s->restart_current_retry = 0;
+        }
+
         s->reload_failure = false;
 }
 
@@ -2291,6 +2315,7 @@ static int service_start(Unit *u) {
 
         /* Make sure we don't enter a busy loop of some kind. */
         if (!ratelimit_test(&s->ratelimit)) {
+                s->restart_current_retry = 0;
                 log_warning("%s start request repeated too quickly, refusing 
to start.", u->meta.id);
                 return -ECANCELED;
         }
@@ -2311,6 +2336,7 @@ static int service_stop(Unit *u) {
 
         /* This is a user request, so don't do restarts on this
          * shutdown. */
+        s->restart_current_retry = 0;
         s->forbid_restart = true;
 
         /* Already on it */
@@ -2411,6 +2437,8 @@ static int service_serialize(Unit *u, FILE *f, FDSet 
*fds) {
                 }
         }
 
+        unit_serialize_item_format(u, f, "restart-current-retry", "%u", 
s->restart_current_retry);
+
         return 0;
 }
 
@@ -2510,6 +2538,13 @@ static int service_deserialize_item(Unit *u, const char 
*key, const char *value,
                 dual_timestamp_deserialize(value, 
&s->main_exec_status.start_timestamp);
         else if (streq(key, "main-exec-status-exit"))
                 dual_timestamp_deserialize(value, 
&s->main_exec_status.exit_timestamp);
+        else if (streq(key, "restart-current-retry")) {
+                unsigned i;
+                if (safe_atou(value, &i) < 0)
+                        log_debug("Failed to parse restart-current-retry %s", 
value);
+                else
+                        s->restart_current_retry = i;
+        }
         else
                 log_debug("Unknown serialization key '%s'", key);
 
@@ -2604,6 +2639,11 @@ static void service_sigchld_event(Unit *u, pid_t pid, 
int code, int status) {
                          * gone. */
                         s->main_command = NULL;
 
+                        if (success && s->restart_current_retry) {
+                                log_debug("%s changed restart_crt_retry from 
%u to 0", s->meta.id, s->restart_current_retry);
+                                s->restart_current_retry = 0;
+                        }
+
                         switch (s->state) {
 
                         case SERVICE_START_POST:
@@ -2832,7 +2872,15 @@ static void service_timer_event(Unit *u, uint64_t 
elapsed, Watch* w) {
                 break;
 
         case SERVICE_AUTO_RESTART:
-                log_info("%s holdoff time over, scheduling restart.", 
u->meta.id);
+                if (s->restart_retries > 0) {
+                        s->restart_current_retry ++;
+                        if (s->restart_current_retry > s->restart_retries) {
+                                log_warning("%s maximum number of restarting 
retries reached. Stopping.", s->meta.id);
+                                service_enter_dead(s, true, false);
+                                break;
+                        }
+                }
+                log_info("%s holdoff time over, scheduling restart (retry %u 
of %u).", u->meta.id, s->restart_current_retry, s->restart_retries);
                 service_enter_restart(s);
                 break;
 
diff --git a/src/service.h b/src/service.h
index 55b9513..1d6c42c 100644
--- a/src/service.h
+++ b/src/service.h
@@ -92,6 +92,9 @@ struct Service {
         ServiceType type;
         ServiceRestart restart;
 
+        unsigned restart_retries;
+        unsigned restart_current_retry;
+
         /* If set we'll read the main daemon PID from this file */
         char *pid_file;
 
diff --git a/src/unit.c b/src/unit.c
index 9bb4e56..2ae63d1 100644
--- a/src/unit.c
+++ b/src/unit.c
@@ -1114,6 +1114,10 @@ void unit_trigger_on_failure(Unit *u) {
         if (set_size(u->meta.dependencies[UNIT_ON_FAILURE]) <= 0)
                 return;
 
+        if (u->meta.type == UNIT_SERVICE &&
+                u->service.restart_current_retry <= u->service.restart_retries)
+                return;
+
         log_info("Triggering OnFailure= dependencies of %s.", u->meta.id);
 
         SET_FOREACH(other, u->meta.dependencies[UNIT_ON_FAILURE], i) {
-- 
1.7.0.4


________________________________________
From: Lennart Poettering [[email protected]]
Sent: Monday, July 04, 2011 8:35 PM
To: Patrascu, Cristian
Cc: Zaharia, Adrian; [email protected]
Subject: Re: [systemd-devel] [PATCH] systemd restart patch

On Mon, 04.07.11 18:18, Cristian Patrascu ([email protected]) 
wrote:

>
> Thank you for your observations and interest!
>
> So I made some modifications:
> - changed restart variables to unsigned
> - changed to 0 as default (not -1, also dropped the define)
> - moved counter reset from unit.c to service.c (after unit_notify call)
> - because the "OnFailure" unit is triggered in unit.c, I still left
> the checking before that.
> - updated coding style
> - added restart counter (de)serialization

heya,

Can you please post this patch properly formatted with git's
format-patch command and not with line breaking added by your mailer?
Otherwise I have trouble reviewing and applying your patch!

Thanks,

Lennart

>
> The patch is for the same branch
> (3661ac04b4f2840d3345605aa35963bbde3c469d) as before:
>
> Index: systemd-29/src/dbus-service.c
> ===================================================================
> --- systemd-29.orig/src/dbus-service.c    2011-07-04
> 16:19:37.000000000 +0300
> +++ systemd-29/src/dbus-service.c    2011-07-04 17:07:09.971651549 +0300
> @@ -42,6 +42,8 @@
>          " <property name=\"PIDFile\" type=\"s\" access=\"read\"/>\n"   \
>          " <property name=\"NotifyAccess\" type=\"s\"
> access=\"read\"/>\n" \
>          " <property name=\"RestartUSec\" type=\"t\" access=\"read\"/>\n" \
> +        " <property name=\"MaxRestartRetries\" type=\"u\"
> access=\"read\"/>\n" \
> +        " <property name=\"RestartRetry\" type=\"u\" access=\"read\"/>\n" \
>          " <property name=\"TimeoutUSec\" type=\"t\" access=\"read\"/>\n" \
>          BUS_EXEC_COMMAND_INTERFACE("ExecStartPre")                      \
>          BUS_EXEC_COMMAND_INTERFACE("ExecStart")                         \
> @@ -103,6 +105,8 @@
>                  { "org.freedesktop.systemd1.Service", "PIDFile",
> bus_property_append_string, "s", u->service.pid_file
> },
>                  { "org.freedesktop.systemd1.Service",
> "NotifyAccess",           bus_service_append_notify_access, "s",
> &u->service.notify_access       },
>                  { "org.freedesktop.systemd1.Service",
> "RestartUSec",            bus_property_append_usec,   "t",
> &u->service.restart_usec              },
> +                { "org.freedesktop.systemd1.Service",
> "MaxRestartRetries",      bus_property_append_unsigned, "u",
> &u->service.restart_retries           },
> +                { "org.freedesktop.systemd1.Service",
> "RestartRetry",           bus_property_append_unsigned, "u",
> &u->service.restart_current_retry         },
>                  { "org.freedesktop.systemd1.Service",
> "TimeoutUSec",            bus_property_append_usec,   "t",
> &u->service.timeout_usec              },

> BUS_EXEC_COMMAND_PROPERTY("org.freedesktop.systemd1.Service",
> u->service.exec_command[SERVICE_EXEC_START_PRE],  "ExecStartPre"),

> BUS_EXEC_COMMAND_PROPERTY("org.freedesktop.systemd1.Service",
> u->service.exec_command[SERVICE_EXEC_START],      "ExecStart"),
> Index: systemd-29/src/load-fragment.c
> ===================================================================
> --- systemd-29.orig/src/load-fragment.c    2011-07-04
> 16:19:37.000000000 +0300
> +++ systemd-29/src/load-fragment.c    2011-07-04 17:07:09.975651196 +0300
> @@ -1951,6 +1951,7 @@
>                  { "TimeoutSec",             config_parse_usec,
> 0, &u->service.timeout_usec,                        "Service" },
>                  { "Type",
> config_parse_service_type,    0, &u->service.type,
> "Service" },
>                  { "Restart",
> config_parse_service_restart, 0, &u->service.restart,
> "Service" },
> +                { "MaxRestartRetries",      config_parse_unsigned,
> 0, &u->service.restart_retries,                     "Service" },
>                  { "PermissionsStartOnly",   config_parse_bool,
> 0, &u->service.permissions_start_only,              "Service" },
>                  { "RootDirectoryStartOnly", config_parse_bool,
> 0, &u->service.root_directory_start_only,           "Service" },
>                  { "RemainAfterExit",        config_parse_bool,
> 0, &u->service.remain_after_exit,                   "Service" },
> Index: systemd-29/src/service.c
> ===================================================================
> --- systemd-29.orig/src/service.c    2011-07-04 16:19:37.000000000 +0300
> +++ systemd-29/src/service.c    2011-07-04 17:14:32.823465057 +0300
> @@ -1151,6 +1166,20 @@
>                  if (s->meta.default_dependencies)
>                          if ((r = service_add_default_dependencies(s)) < 0)
>                                  return r;
> +
> +                /* If the option "RestartRetries=" is set then the
> service will be "revived" */
> +                if (s->restart_retries > 0) {
> +                        /* If the option "Restart=" is not set,
> then for the service to be "revived"
> +                           we must set "Restart=" to "on-failure" */
> +                        if (s->restart == SERVICE_RESTART_NO)
> +                                s->restart = SERVICE_RESTART_ON_FAILURE;
> +
> +                        /* If the option "Restart=" is already set,
> then for the service to be "revived"
> +                           and be restarted on failure we must set
> "Restart=" to "always"
> +                           only if it's "on-success" */
> +                        else if (s->restart == SERVICE_RESTART_ON_SUCCESS)
> +                                s->restart = SERVICE_RESTART_ALWAYS;
> +                }
>          }
>
>          return service_verify(s);
> @@ -1488,6 +1517,16 @@
>                  log_debug("%s changed %s -> %s", s->meta.id,
> service_state_to_string(old_state), service_state_to_string(state));
>
>          unit_notify(UNIT(s), state_translation_table[old_state],
> state_translation_table[state], !s->reload_failure);
> +
> +        /* If service finished, reset restart-retry counter, if
> applicable */
> +        if (old_state != state &&
> +            (state == SERVICE_DEAD || state == SERVICE_FAILED) &&
> +            s->restart_current_retry > s->restart_retries) {
> +                    log_debug("Service %s entered dead or failed,
> restart retries at max, reset counter (%u -> 0)!",
> +                              s->meta.id, s->restart_current_retry);
> +                    s->restart_current_retry = 0;
> +        }
> +
>          s->reload_failure = false;
>  }
>
> @@ -2291,6 +2330,7 @@
>
>          /* Make sure we don't enter a busy loop of some kind. */
>          if (!ratelimit_test(&s->ratelimit)) {
> +                s->restart_current_retry = 0;
>                  log_warning("%s start request repeated too quickly,
> refusing to start.", u->meta.id);
>                  return -ECANCELED;
>          }
> @@ -2311,6 +2351,7 @@
>
>          /* This is a user request, so don't do restarts on this
>           * shutdown. */
> +        s->restart_current_retry = 0;
>          s->forbid_restart = true;
>
>          /* Already on it */
> @@ -2411,6 +2452,8 @@
>                  }
>          }
>
> +        unit_serialize_item_format(u, f, "restart-current-retry",
> "%u", s->restart_current_retry);
> +
>          return 0;
>  }
>
> @@ -2510,6 +2553,13 @@
>                  dual_timestamp_deserialize(value,
> &s->main_exec_status.start_timestamp);
>          else if (streq(key, "main-exec-status-exit"))
>                  dual_timestamp_deserialize(value,
> &s->main_exec_status.exit_timestamp);
> +        else if (streq(key, "restart-current-retry")) {
> +                unsigned i;
> +                if (safe_atou(value, &i) < 0)
> +                        log_debug("Failed to parse
> restart-current-retry %s", value);
> +                else
> +                        s->restart_current_retry = i;
> +        }
>          else
>                  log_debug("Unknown serialization key '%s'", key);
>
> @@ -2604,6 +2654,11 @@
>                           * gone. */
>                          s->main_command = NULL;
>
> +                        if (success && s->restart_current_retry) {
> +                                log_debug("%s changed
> restart_crt_retry from %u to 0", s->meta.id,
> s->restart_current_retry);
> +                                s->restart_current_retry = 0;
> +                        }
> +
>                          switch (s->state) {
>
>                          case SERVICE_START_POST:
> @@ -2832,7 +2887,15 @@
>                  break;
>
>          case SERVICE_AUTO_RESTART:
> -                log_info("%s holdoff time over, scheduling
> restart.", u->meta.id);
> +                if (s->restart_retries > 0) {
> +                        s->restart_current_retry ++;
> +                        if (s->restart_current_retry >
> s->restart_retries) {
> +                                log_warning("%s maximum number of
> restarting retries reached. Stopping.", s->meta.id);
> +                                service_enter_dead(s, true, false);
> +                                break;
> +                        }
> +                }
> +                log_info("%s holdoff time over, scheduling restart
> (retry %u of %u).", u->meta.id, s->restart_current_retry,
> s->restart_retries);
>                  service_enter_restart(s);
>                  break;
>
> Index: systemd-29/src/service.h
> ===================================================================
> --- systemd-29.orig/src/service.h    2011-07-04 16:19:37.000000000 +0300
> +++ systemd-29/src/service.h    2011-07-04 17:07:09.975651196 +0300
> @@ -92,6 +92,9 @@
>          ServiceType type;
>          ServiceRestart restart;
>
> +        unsigned restart_retries;
> +        unsigned restart_current_retry;
> +
>          /* If set we'll read the main daemon PID from this file */
>          char *pid_file;
>
> Index: systemd-29/src/unit.c
> ===================================================================
> --- systemd-29.orig/src/unit.c    2011-07-04 16:19:37.000000000 +0300
> +++ systemd-29/src/unit.c    2011-07-04 17:07:09.979652060 +0300
> @@ -1114,6 +1114,10 @@
>          if (set_size(u->meta.dependencies[UNIT_ON_FAILURE]) <= 0)
>                  return;
>
> +        if (u->meta.type == UNIT_SERVICE &&
> +                u->service.restart_current_retry <=
> u->service.restart_retries)
> +                return;
> +
>          log_info("Triggering OnFailure= dependencies of %s.", u->meta.id);
>
>          SET_FOREACH(other, u->meta.dependencies[UNIT_ON_FAILURE], i) {
>
>
> On 07/02/2011 03:21 AM, Lennart Poettering wrote:
> >On Fri, 01.07.11 15:35, Cristian Patrascu ([email protected]) 
> >wrote:
> >
> >Heya!
> >
> >(BTW, got your first post of this patch too, just didn't find the time
> >to review the patch, sorry. It was still in my mail queue however.)
> >
> >>      After "x" restarts have happened, the "OnFailure" unit will start
> >>(if specified) but only after all unsuccessfull restarts (after "x"
> >>restarts reached).
> >Patch looks pretty good. A few comments though.
> >>This patch is made for systemd-29, on master branch with SHA1 ID:
> >>3661ac04b4f2840d3345605aa35963bbde3c469d
> >>
> >>________systemd-restart-on-fail.patch : _______________________________
> >>
> >>Index: systemd-29/src/dbus-service.c
> >>===================================================================
> >>--- systemd-29.orig/src/dbus-service.c      2011-06-22 10:47:14.000000000 
> >>+0300
> >>+++ systemd-29/src/dbus-service.c   2011-06-22 13:48:51.292321742 +0300
> >>@@ -42,6 +42,8 @@
> >>          "<property name=\"PIDFile\" type=\"s\" access=\"read\"/>\n"   \
> >>          "<property name=\"NotifyAccess\" type=\"s\" access=\"read\"/>\n" \
> >>          "<property name=\"RestartUSec\" type=\"t\" access=\"read\"/>\n" \
> >>+        "<property name=\"MaxRestartRetries\" type=\"i\" 
> >>access=\"read\"/>\n" \
> >>+        "<property name=\"RestartRetry\" type=\"i\"
> >>access=\"read\"/>\n" \
> >I Think it would make sense to make both of these unsigned, in order to
> >avoid confusion whether these actually ever can be negative.
> >
> >If I read your patch correctly right now MaxRestartRetries=-1 means that
> >there is no limit on the number of retries. I'd use 0 for that instead
> >(0 is not needed to indicate disabling, since we can indicate that with
> >Restart=no already.)
> >
> >>+        s->restart_retries = DEFAULT_RESTART_RETRIES;
> >>+        s->restart_crt_retry = 0;
> >We generally try not to abbrievate variables unnecessarily, so I'd suggest
> >to call this variable "restart_current_retry" or so.
> >
> >>  #ifdef HAVE_SYSV_COMPAT
> >>          s->sysv_start_priority = -1;
> >>          s->sysv_start_priority_from_rcnd = -1;
> >>@@ -1151,6 +1153,22 @@
> >>                  if (s->meta.default_dependencies)
> >>                          if ((r = service_add_default_dependencies(s))<   
> >> 0)
> >>                                  return r;
> >>+
> >>+                /* If the option "RestartRetries=" is set then the service 
> >>will be "revived" */
> >>+                if (s->restart_retries != DEFAULT_RESTART_RETRIES){
> >>+                        /* If the option "Restart=" is not set, then for 
> >>the service to be "revived"
> >>+                           we must set "Restart=" to "on-failure" */
> >>+                        if (s->restart == SERVICE_RESTART_NO){
> >>+                                s->restart = SERVICE_RESTART_ON_FAILURE;
> >>+                        }
> >Please follow coding style. We generally place no {} brackets around
> >single line blocks.
> >
> >>          case SERVICE_AUTO_RESTART:
> >>-                log_info("%s holdoff time over, scheduling restart.", 
> >>u->meta.id);
> >>+                if (0<= s->restart_retries){
> >Please follow coding style, compare variables with constants not
> >constants with variables. Also, please place spaces before the<= and
> >the {.
> >
> >>+        if (u->meta.type == UNIT_SERVICE&&
> >>+                u->service.restart_crt_retry<= u->service.restart_retries)
> >>+                return;
> >>+
> >Huhm, I wonder if we could fine a better place for this, not sure where 
> >though.
> >
> >>          log_info("Triggering OnFailure= dependencies of %s.", u->meta.id);
> >>
> >>          SET_FOREACH(other, u->meta.dependencies[UNIT_ON_FAILURE], i) {
> >>@@ -1245,6 +1249,17 @@
> >>                          log_notice("Unit %s entered failed state.", 
> >> u->meta.id);
> >>                          unit_trigger_on_failure(u);
> >>                  }
> >>+
> >>+                /* When a unit is of type service and has finished,
> >>+                   reset restart-retry counter, if applicable */
> >>+                if (ns != os&&
> >>+                    u->meta.type == UNIT_SERVICE&&
> >>+                    UNIT_IS_INACTIVE_OR_FAILED(ns)&&
> >>+                    u->service.restart_crt_retry>   
> >>u->service.restart_retries ){
> >>+                            log_debug("Unit %s entered inactive or failed, 
> >>restart retries at max, reset counter (%d ->   0)!",
> >>+                                      u->meta.id, 
> >>u->service.restart_crt_retry, u->service.restart_retries);
> >>+                            u->service.restart_crt_retry = 0;
> >Hmm, I think it would be nicer to reset this counter in service.c only, not
> >in the generic code. I'd like to avoid that we access too many
> >service-private fields from generic unit code.
> >
> >>  #include "execute.h"
> >>  #include "condition.h"
> >>
> >>+/* default = infinite retries (= systemd behaviour not patched) */
> >>+#define DEFAULT_RESTART_RETRIES -1
> >>+
> >Hee, if I commit this, then it won't be patched anymore, so the comment
> >should not refer to it being unpatched ;-)
> >
> >Patch looks quite OK in general.
> >
> >Thanks for the work,
> >
> >Lennart


Lennart

--
Lennart Poettering - Red Hat, Inc.
From 4d0c7dc439430f592e5c6251d91e162d2191277d Mon Sep 17 00:00:00 2001
From: Cristian Patrascu <[email protected]>
Date: Tue, 5 Jul 2011 11:11:34 +0300
Subject: [PATCH] systemd restart retries on failure

---
 src/dbus-service.c  |    4 ++++
 src/load-fragment.c |    1 +
 src/service.c       |   50 +++++++++++++++++++++++++++++++++++++++++++++++++-
 src/service.h       |    3 +++
 src/unit.c          |    4 ++++
 5 files changed, 61 insertions(+), 1 deletions(-)

diff --git a/src/dbus-service.c b/src/dbus-service.c
index 3486623..40e1efe 100644
--- a/src/dbus-service.c
+++ b/src/dbus-service.c
@@ -42,6 +42,8 @@
         "  <property name=\"PIDFile\" type=\"s\" access=\"read\"/>\n"   \
         "  <property name=\"NotifyAccess\" type=\"s\" access=\"read\"/>\n" \
         "  <property name=\"RestartUSec\" type=\"t\" access=\"read\"/>\n" \
+        "  <property name=\"MaxRestartRetries\" type=\"u\" access=\"read\"/>\n" \
+        "  <property name=\"RestartRetry\" type=\"u\" access=\"read\"/>\n" \
         "  <property name=\"TimeoutUSec\" type=\"t\" access=\"read\"/>\n" \
         BUS_EXEC_COMMAND_INTERFACE("ExecStartPre")                      \
         BUS_EXEC_COMMAND_INTERFACE("ExecStart")                         \
@@ -103,6 +105,8 @@ DBusHandlerResult bus_service_message_handler(Unit *u, DBusConnection *connectio
                 { "org.freedesktop.systemd1.Service", "PIDFile",                bus_property_append_string, "s", u->service.pid_file                   },
                 { "org.freedesktop.systemd1.Service", "NotifyAccess",           bus_service_append_notify_access, "s", &u->service.notify_access       },
                 { "org.freedesktop.systemd1.Service", "RestartUSec",            bus_property_append_usec,   "t", &u->service.restart_usec              },
+                { "org.freedesktop.systemd1.Service", "MaxRestartRetries",      bus_property_append_unsigned, "u", &u->service.restart_retries           },
+                { "org.freedesktop.systemd1.Service", "RestartRetry",           bus_property_append_unsigned, "u", &u->service.restart_current_retry         },
                 { "org.freedesktop.systemd1.Service", "TimeoutUSec",            bus_property_append_usec,   "t", &u->service.timeout_usec              },
                 BUS_EXEC_COMMAND_PROPERTY("org.freedesktop.systemd1.Service", u->service.exec_command[SERVICE_EXEC_START_PRE],  "ExecStartPre"),
                 BUS_EXEC_COMMAND_PROPERTY("org.freedesktop.systemd1.Service", u->service.exec_command[SERVICE_EXEC_START],      "ExecStart"),
diff --git a/src/load-fragment.c b/src/load-fragment.c
index 56eaed9..7e9efaa 100644
--- a/src/load-fragment.c
+++ b/src/load-fragment.c
@@ -1951,6 +1951,7 @@ static int load_from_path(Unit *u, const char *path) {
                 { "TimeoutSec",             config_parse_usec,            0, &u->service.timeout_usec,                        "Service" },
                 { "Type",                   config_parse_service_type,    0, &u->service.type,                                "Service" },
                 { "Restart",                config_parse_service_restart, 0, &u->service.restart,                             "Service" },
+                { "MaxRestartRetries",      config_parse_unsigned,        0, &u->service.restart_retries,                     "Service" },
                 { "PermissionsStartOnly",   config_parse_bool,            0, &u->service.permissions_start_only,              "Service" },
                 { "RootDirectoryStartOnly", config_parse_bool,            0, &u->service.root_directory_start_only,           "Service" },
                 { "RemainAfterExit",        config_parse_bool,            0, &u->service.remain_after_exit,                   "Service" },
diff --git a/src/service.c b/src/service.c
index d59c4cb..b5159a6 100644
--- a/src/service.c
+++ b/src/service.c
@@ -1151,6 +1151,20 @@ static int service_load(Unit *u) {
                 if (s->meta.default_dependencies)
                         if ((r = service_add_default_dependencies(s)) < 0)
                                 return r;
+
+                /* If the option "RestartRetries=" is set then the service will be "revived" */
+                if (s->restart_retries > 0) {
+                        /* If the option "Restart=" is not set, then for the service to be "revived"
+                           we must set "Restart=" to "on-failure" */
+                        if (s->restart == SERVICE_RESTART_NO)
+                                s->restart = SERVICE_RESTART_ON_FAILURE;
+
+                        /* If the option "Restart=" is already set, then for the service to be "revived"
+                           and be restarted on failure we must set "Restart=" to "always"
+                           only if it's "on-success" */
+                        else if (s->restart == SERVICE_RESTART_ON_SUCCESS)
+                                s->restart = SERVICE_RESTART_ALWAYS;
+                }
         }
 
         return service_verify(s);
@@ -1488,6 +1502,16 @@ static void service_set_state(Service *s, ServiceState state) {
                 log_debug("%s changed %s -> %s", s->meta.id, service_state_to_string(old_state), service_state_to_string(state));
 
         unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], !s->reload_failure);
+
+        /* If service finished, reset restart-retry counter, if applicable */
+        if (old_state != state &&
+            (state == SERVICE_DEAD || state == SERVICE_FAILED) &&
+            s->restart_current_retry > s->restart_retries) {
+                    log_debug("Service %s entered dead or failed, restart retries at max, reset counter (%u -> 0)!",
+                              s->meta.id, s->restart_current_retry);
+                    s->restart_current_retry = 0;
+        }
+
         s->reload_failure = false;
 }
 
@@ -2291,6 +2315,7 @@ static int service_start(Unit *u) {
 
         /* Make sure we don't enter a busy loop of some kind. */
         if (!ratelimit_test(&s->ratelimit)) {
+                s->restart_current_retry = 0;
                 log_warning("%s start request repeated too quickly, refusing to start.", u->meta.id);
                 return -ECANCELED;
         }
@@ -2311,6 +2336,7 @@ static int service_stop(Unit *u) {
 
         /* This is a user request, so don't do restarts on this
          * shutdown. */
+        s->restart_current_retry = 0;
         s->forbid_restart = true;
 
         /* Already on it */
@@ -2411,6 +2437,8 @@ static int service_serialize(Unit *u, FILE *f, FDSet *fds) {
                 }
         }
 
+        unit_serialize_item_format(u, f, "restart-current-retry", "%u", s->restart_current_retry);
+
         return 0;
 }
 
@@ -2510,6 +2538,13 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value,
                 dual_timestamp_deserialize(value, &s->main_exec_status.start_timestamp);
         else if (streq(key, "main-exec-status-exit"))
                 dual_timestamp_deserialize(value, &s->main_exec_status.exit_timestamp);
+        else if (streq(key, "restart-current-retry")) {
+                unsigned i;
+                if (safe_atou(value, &i) < 0)
+                        log_debug("Failed to parse restart-current-retry %s", value);
+                else
+                        s->restart_current_retry = i;
+        }
         else
                 log_debug("Unknown serialization key '%s'", key);
 
@@ -2604,6 +2639,11 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
                          * gone. */
                         s->main_command = NULL;
 
+                        if (success && s->restart_current_retry) {
+                                log_debug("%s changed restart_crt_retry from %u to 0", s->meta.id, s->restart_current_retry);
+                                s->restart_current_retry = 0;
+                        }
+
                         switch (s->state) {
 
                         case SERVICE_START_POST:
@@ -2832,7 +2872,15 @@ static void service_timer_event(Unit *u, uint64_t elapsed, Watch* w) {
                 break;
 
         case SERVICE_AUTO_RESTART:
-                log_info("%s holdoff time over, scheduling restart.", u->meta.id);
+                if (s->restart_retries > 0) {
+                        s->restart_current_retry ++;
+                        if (s->restart_current_retry > s->restart_retries) {
+                                log_warning("%s maximum number of restarting retries reached. Stopping.", s->meta.id);
+                                service_enter_dead(s, true, false);
+                                break;
+                        }
+                }
+                log_info("%s holdoff time over, scheduling restart (retry %u of %u).", u->meta.id, s->restart_current_retry, s->restart_retries);
                 service_enter_restart(s);
                 break;
 
diff --git a/src/service.h b/src/service.h
index 55b9513..1d6c42c 100644
--- a/src/service.h
+++ b/src/service.h
@@ -92,6 +92,9 @@ struct Service {
         ServiceType type;
         ServiceRestart restart;
 
+        unsigned restart_retries;
+        unsigned restart_current_retry;
+
         /* If set we'll read the main daemon PID from this file */
         char *pid_file;
 
diff --git a/src/unit.c b/src/unit.c
index 9bb4e56..2ae63d1 100644
--- a/src/unit.c
+++ b/src/unit.c
@@ -1114,6 +1114,10 @@ void unit_trigger_on_failure(Unit *u) {
         if (set_size(u->meta.dependencies[UNIT_ON_FAILURE]) <= 0)
                 return;
 
+        if (u->meta.type == UNIT_SERVICE &&
+                u->service.restart_current_retry <= u->service.restart_retries)
+                return;
+
         log_info("Triggering OnFailure= dependencies of %s.", u->meta.id);
 
         SET_FOREACH(other, u->meta.dependencies[UNIT_ON_FAILURE], i) {
-- 
1.7.0.4

_______________________________________________
systemd-devel mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/systemd-devel

Reply via email to