[svn] commit: r1233 - in /trunk/src/bin/bind10: TODO bind10.py.in

Tue Mar 9 13:45:00 UTC 2010

Author: shane
Date: Tue Mar  9 13:45:00 2010
New Revision: 1233

Log:
Added back-off when processes terminate too quickly.


Modified:
    trunk/src/bin/bind10/TODO
    trunk/src/bin/bind10/bind10.py.in

Modified: trunk/src/bin/bind10/TODO
==============================================================================

--- trunk/src/bin/bind10/TODO (original)
+++ trunk/src/bin/bind10/TODO Tue Mar  9 13:45:00 2010
@@ -8,7 +8,6 @@
   - Force-stop a component
 - Mechanism to wait for child to start before continuing
 - Way to ask a child to die politely 
-- Back-off mechanism for restarting failed processes
 - Start statistics daemon
 - Statistics interaction (?)
 - Use .spec file to define commands

Modified: trunk/src/bin/bind10/bind10.py.in
==============================================================================
--- trunk/src/bin/bind10/bind10.py.in (original)
+++ trunk/src/bin/bind10/bind10.py.in Tue Mar  9 13:45:00 2010
@@ -3,6 +3,7 @@
 import sys; sys.path.append ('@@PYTHONPATH@@')
 import os
 import time
+import random
 
 """\
 This file implements the Boss of Bind (BoB, or bob) program.
@@ -51,7 +52,7 @@
 import isc
 
 # This is the version that gets displayed to the user.
-__version__ = "v20100308"
+__version__ = "v20100309"
 
 # Nothing at all to do with the 1990-12-10 article here:
 # http://www.subgenius.com/subg-digest/v2/0056.html
@@ -70,7 +71,10 @@
   * If a process has been running for >=10 seconds, we restart it
     right away.
   * If a process was running for <10 seconds, we wait until 10 seconds
-    after it was started."""
+    after it was started.
+
+To avoid programs getting into lockstep, we draw the restart delay from
+a normal distribution, so restarts do not happen at exactly 10 seconds."""
 
     def __init__(self, restart_frequency=10.0):
         self.restart_frequency = restart_frequency
@@ -82,7 +86,9 @@
         if when is None:
             when = time.time()
         self.run_start_time = when
-        self.restart_time = when + self.restart_frequency
+        sigma = self.restart_frequency * 0.05
+        self.restart_time = when + random.normalvariate(self.restart_frequency, 
+                                                        sigma)
 
     def set_run_stop_time(self, when=None):
         if when is None:
@@ -121,15 +127,15 @@
                                         close_fds=True,
                                         env=spawn_env,)
         self.pid = self.process.pid
+        self.restart_schedule.set_run_start_time()
 
     def __init__(self, name, args, env={}, dev_null_stdout=False):
         self.name = name 
         self.args = args
         self.env = env
         self.dev_null_stdout = dev_null_stdout
+        self.restart_schedule = RestartSchedule()
         self._spawn()
-        self.last_spawn_time = time.time()
-#        self.respawn
 
     def respawn(self):
         self._spawn()
@@ -358,6 +364,7 @@
             if pid == 0: break
             if pid in self.processes:
                 proc_info = self.processes.pop(pid)
+                proc_info.restart_schedule.set_run_stop_time()
                 self.dead_processes[proc_info.pid] = proc_info
                 if self.verbose:
                     sys.stdout.write("Process %s (PID %d) died.\n" % 
@@ -427,26 +434,39 @@
 
     def restart_processes(self):
         """Restart any dead processes."""
-        # XXX: this needs a back-off algorithm
+        next_restart = None
         # if we're shutting down, then don't restart
         if not self.runnable:
-            return
+            return next_restart
         # otherwise look through each dead process and try to restart
         still_dead = {}
+        now = time.time()
         for proc_info in self.dead_processes.values():
-            if self.verbose:
-                sys.stdout.write("Resurrecting dead %s process...\n" % 
-                                 proc_info.name)
-            try:
-                proc_info.respawn()
-                self.processes[proc_info.pid] = proc_info
+            restart_time = proc_info.restart_schedule.get_restart_time(now)
+            if restart_time > now:
+#                if self.verbose:
+#                    sys.stdout.write("Dead %s process waiting %.1f seconds "\
+#                                     "for resurrection\n" % 
+#                                     (proc_info.name, (restart_time-now)))
+                if (next_restart is None) or (next_restart > restart_time):
+                    next_restart = restart_time
+                still_dead[proc_info.pid] = proc_info
+            else:
                 if self.verbose:
-                    sys.stdout.write("Resurrected %s (PID %d)\n" %
-                                     (proc_info.name, proc_info.pid))
-            except:
-                still_dead[proc_info.pid] = proc_info
+                    sys.stdout.write("Resurrecting dead %s process...\n" % 
+                                     proc_info.name)
+                try:
+                    proc_info.respawn()
+                    self.processes[proc_info.pid] = proc_info
+                    if self.verbose:
+                        sys.stdout.write("Resurrected %s (PID %d)\n" %
+                                         (proc_info.name, proc_info.pid))
+                except:
+                    still_dead[proc_info.pid] = proc_info
         # remember any processes that refuse to be resurrected
         self.dead_processes = still_dead
+        # return the time when the next process is ready to be restarted
+        return next_restart
 
 def reaper(signal_number, stack_frame):
     """A child process has died (SIGCHLD received)."""
@@ -525,15 +545,18 @@
     while boss_of_bind.runnable:
         # clean up any processes that exited
         boss_of_bind.reap_children()
-        boss_of_bind.restart_processes()
-
-        # XXX: get time for next restart for timeout
+        next_restart = boss_of_bind.restart_processes()
+        if next_restart is None:
+            wait_time = None
+        else:
+            wait_time = max(next_restart - time.time(), 0)
 
         # select() can raise EINTR when a signal arrives, 
         # even if they are resumable, so we have to catch
         # the exception
         try:
-            (rlist, wlist, xlist) = select.select([wakeup_fd, ccs_fd], [], [])
+            (rlist, wlist, xlist) = select.select([wakeup_fd, ccs_fd], [], [], 
+                                                  wait_time)
         except select.error as err:
             if err.args[0] == errno.EINTR:
                 (rlist, wlist, xlist) = ([], [], [])