BIND 10 master, updated. 13f108b25fbccc56b731bd5bcc505cdf48e91e91 Merge branch 'trac1342'
BIND 10 source code commits
bind10-changes at lists.isc.org
Mon Nov 21 09:35:39 UTC 2011
The branch, master has been updated
via 13f108b25fbccc56b731bd5bcc505cdf48e91e91 (commit)
via 295732d42d2b0a9641edfa352087033d8eff2794 (commit)
via 466a968426ed9062d86239560492edf7dc72ee02 (commit)
via a59f28758abdb92721e010956bd421148643377b (commit)
via e09910d37b783b182ae2dc83f6cb272bff68cbb6 (commit)
via 648a187c5d7181019dc19531a1057bc3e6f70e96 (commit)
via 16b7feca0339f67acae30eb67d913bd3ef0298be (commit)
from fc0a31681f7a8e4198068be0038eb9a4f8a74ec7 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
commit 13f108b25fbccc56b731bd5bcc505cdf48e91e91
Merge: fc0a31681f7a8e4198068be0038eb9a4f8a74ec7 295732d42d2b0a9641edfa352087033d8eff2794
Author: Jelte Jansen <jelte at isc.org>
Date: Mon Nov 21 10:15:58 2011 +0100
Merge branch 'trac1342'
-----------------------------------------------------------------------
Summary of changes:
src/bin/bind10/bind10_src.py.in | 105 ++++---------------
src/bin/bind10/tests/bind10_test.py.in | 2 -
src/lib/python/isc/bind10/component.py | 56 +++++++++-
src/lib/python/isc/bind10/tests/component_test.py | 119 +++++++++++++++++----
4 files changed, 174 insertions(+), 108 deletions(-)
-----------------------------------------------------------------------
diff --git a/src/bin/bind10/bind10_src.py.in b/src/bin/bind10/bind10_src.py.in
index 4b169c8..bf6079e 100755
--- a/src/bin/bind10/bind10_src.py.in
+++ b/src/bin/bind10/bind10_src.py.in
@@ -92,51 +92,6 @@ VERSION = "bind10 20110223 (BIND 10 @PACKAGE_VERSION@)"
# This is for boot_time of Boss
_BASETIME = time.gmtime()
-class RestartSchedule:
- """
-Keeps state when restarting something (in this case, a process).
-
-When a process dies unexpectedly, we need to restart it. However, if
-it fails to restart for some reason, then we should not simply keep
-restarting it at high speed.
-
-A more sophisticated algorithm can be developed, but for now we choose
-a simple set of rules:
-
- * If a process was been running for >=10 seconds, we restart it
- right away.
- * If a process was running for <10 seconds, we wait until 10 seconds
- after it was started.
-
-To avoid programs getting into lockstep, we use a normal distribution
-to avoid being restarted at exactly 10 seconds."""
-
- def __init__(self, restart_frequency=10.0):
- self.restart_frequency = restart_frequency
- self.run_start_time = None
- self.run_stop_time = None
- self.restart_time = None
-
- def set_run_start_time(self, when=None):
- if when is None:
- when = time.time()
- self.run_start_time = when
- sigma = self.restart_frequency * 0.05
- self.restart_time = when + random.normalvariate(self.restart_frequency,
- sigma)
-
- def set_run_stop_time(self, when=None):
- """We don't actually do anything with stop time now, but it
- might be useful for future algorithms."""
- if when is None:
- when = time.time()
- self.run_stop_time = when
-
- def get_restart_time(self, when=None):
- if when is None:
- when = time.time()
- return max(when, self.restart_time)
-
class ProcessInfoError(Exception): pass
class ProcessInfo:
@@ -151,7 +106,6 @@ class ProcessInfo:
self.env = env
self.dev_null_stdout = dev_null_stdout
self.dev_null_stderr = dev_null_stderr
- self.restart_schedule = RestartSchedule()
self.uid = uid
self.username = username
self.process = None
@@ -200,7 +154,6 @@ class ProcessInfo:
env=spawn_env,
preexec_fn=self._preexec_work)
self.pid = self.process.pid
- self.restart_schedule.set_run_start_time()
# spawn() and respawn() are the same for now, but in the future they
# may have different functionality
@@ -240,8 +193,6 @@ class BoB:
self.cc_session = None
self.ccs = None
self.curproc = None
- # XXX: Not used now, waits for reintroduction of restarts.
- self.dead_processes = {}
self.msgq_socket_file = msgq_socket_file
self.nocache = nocache
self.component_config = {}
@@ -250,6 +201,9 @@ class BoB:
# inapropriate. But as the code isn't probably completely ready
# for it, we leave it at components for now.
self.components = {}
+ # Simply list of components that died and need to wait for a
+ # restart. Components manage their own restart schedule now
+ self.components_to_restart = []
self.runnable = False
self.uid = setuid
self.username = username
@@ -821,7 +775,11 @@ class BoB:
# Tell it it failed. But only if it matters (we are
# not shutting down and the component considers itself
# to be running.
- component.failed(exit_status);
+ component_restarted = component.failed(exit_status);
+ # if the process wants to be restarted, but not just yet,
+ # it returns False
+ if not component_restarted:
+ self.components_to_restart.append(component)
else:
logger.info(BIND10_UNKNOWN_CHILD_PROCESS_ENDED, pid)
@@ -837,39 +795,22 @@ class BoB:
timeout value.
"""
- # TODO: This is an artefact of previous way of handling processes. The
- # restart queue is currently empty at all times, so this returns None
- # every time it is called (thought is a relict that is obviously wrong,
- # it is called and it doesn't hurt).
- #
- # It is preserved for archeological reasons for the time when we return
- # the delayed restarts, most of it might be useful then (or, if it is
- # found useless, removed).
- next_restart = None
- # if we're shutting down, then don't restart
if not self.runnable:
return 0
- # otherwise look through each dead process and try to restart
- still_dead = {}
+ still_dead = []
+ # keep track of the first time we need to check this queue again,
+ # if at all
+ next_restart_time = None
now = time.time()
- for proc_info in self.dead_processes.values():
- restart_time = proc_info.restart_schedule.get_restart_time(now)
- if restart_time > now:
- if (next_restart is None) or (next_restart > restart_time):
- next_restart = restart_time
- still_dead[proc_info.pid] = proc_info
- else:
- logger.info(BIND10_RESURRECTING_PROCESS, proc_info.name)
- try:
- proc_info.respawn()
- self.components[proc_info.pid] = proc_info
- logger.info(BIND10_RESURRECTED_PROCESS, proc_info.name, proc_info.pid)
- except:
- still_dead[proc_info.pid] = proc_info
- # remember any processes that refuse to be resurrected
- self.dead_processes = still_dead
- # return the time when the next process is ready to be restarted
- return next_restart
+ for component in self.components_to_restart:
+ if not component.restart(now):
+ still_dead.append(component)
+ if next_restart_time is None or\
+ next_restart_time > component.get_restart_time():
+ next_restart_time = component.get_restart_time()
+ self.components_to_restart = still_dead
+
+ return next_restart_time
# global variables, needed for signal handlers
options = None
@@ -1052,10 +993,6 @@ def main():
while boss_of_bind.runnable:
# clean up any processes that exited
boss_of_bind.reap_children()
- # XXX: As we don't put anything into the processes to be restarted,
- # this is really a complicated NOP. But we will try to reintroduce
- # delayed restarts, so it stays here for now, until we find out if
- # it's useful.
next_restart = boss_of_bind.restart_processes()
if next_restart is None:
wait_time = None
diff --git a/src/bin/bind10/tests/bind10_test.py.in b/src/bin/bind10/tests/bind10_test.py.in
index c320849..db68b35 100644
--- a/src/bin/bind10/tests/bind10_test.py.in
+++ b/src/bin/bind10/tests/bind10_test.py.in
@@ -105,7 +105,6 @@ class TestBoB(unittest.TestCase):
self.assertEqual(bob.cc_session, None)
self.assertEqual(bob.ccs, None)
self.assertEqual(bob.components, {})
- self.assertEqual(bob.dead_processes, {})
self.assertEqual(bob.runnable, False)
self.assertEqual(bob.uid, None)
self.assertEqual(bob.username, None)
@@ -118,7 +117,6 @@ class TestBoB(unittest.TestCase):
self.assertEqual(bob.cc_session, None)
self.assertEqual(bob.ccs, None)
self.assertEqual(bob.components, {})
- self.assertEqual(bob.dead_processes, {})
self.assertEqual(bob.runnable, False)
self.assertEqual(bob.uid, None)
self.assertEqual(bob.username, None)
diff --git a/src/lib/python/isc/bind10/component.py b/src/lib/python/isc/bind10/component.py
index 248bd3b..e4fb3ec 100644
--- a/src/lib/python/isc/bind10/component.py
+++ b/src/lib/python/isc/bind10/component.py
@@ -39,6 +39,7 @@ START_CMD = 'start'
STOP_CMD = 'stop'
STARTED_OK_TIME = 10
+COMPONENT_RESTART_DELAY = 10
STATE_DEAD = 'dead'
STATE_STOPPED = 'stopped'
@@ -99,11 +100,18 @@ class BaseComponent:
but it is vital part of the service (like auth server). If
it fails to start or crashes in less than 10s after the first
startup, the system is brought down. If it crashes later on,
- it is restarted.
+ it is restarted (see below).
* 'dispensable' means the component should be running, but if it
doesn't start or crashes for some reason, the system simply tries
to restart it and keeps running.
+ For components that are restarted, the restarts are not always
+ immediate; if the component has run for more than
+ COMPONENT_RESTART_DELAY (10) seconds, they are restarted right
+ away. If the component has not run that long, the system waits
+ until that time has passed (since the last start) until the
+ component is restarted.
+
Note that the __init__ method of child class should have these
parameters:
@@ -134,6 +142,7 @@ class BaseComponent:
self.__state = STATE_STOPPED
self._kind = kind
self._boss = boss
+ self._original_start_time = None
def start(self):
"""
@@ -149,6 +158,9 @@ class BaseComponent:
logger.info(BIND10_COMPONENT_START, self.name())
self.__state = STATE_RUNNING
self.__start_time = time.time()
+ if self._original_start_time is None:
+ self._original_start_time = self.__start_time
+ self._restart_time = None
try:
self._start_internal()
except Exception as e:
@@ -188,6 +200,11 @@ class BaseComponent:
The exit code is used for logging. It might be None.
It calls _failed_internal internally.
+
+ Returns True if the process was immediately restarted, returns
+ False is the process was not restarted, either because
+ it is considered a core or needed component, or because
+ the component is to be restarted later.
"""
logger.error(BIND10_COMPONENT_FAILED, self.name(), self.pid(),
exit_code if exit_code is not None else "unknown")
@@ -199,14 +216,47 @@ class BaseComponent:
# (including it stopped really soon)
if self._kind == 'core' or \
(self._kind == 'needed' and time.time() - STARTED_OK_TIME <
- self.__start_time):
+ self._original_start_time):
self.__state = STATE_DEAD
logger.fatal(BIND10_COMPONENT_UNSATISFIED, self.name())
self._boss.component_shutdown(1)
+ return False
# This means we want to restart
else:
- logger.warn(BIND10_COMPONENT_RESTART, self.name())
+ # if the component was only running for a short time, don't
+ # restart right away, but set a time it wants to restarted,
+ # and return that it wants to be restarted later
+ self.set_restart_time()
+ return self.restart()
+
+ def set_restart_time(self):
+ """Calculates and sets the time this component should be restarted.
+ Currently, it uses a very basic algorithm; start time +
+ RESTART_DELAY (10 seconds). This algorithm may be improved upon
+ in the future.
+ """
+ self._restart_at = self.__start_time + COMPONENT_RESTART_DELAY
+
+ def get_restart_time(self):
+ """Returns the time at which this component should be restarted."""
+ return self._restart_at
+
+ def restart(self, now = None):
+ """Restarts the component if it has a restart_time and if the value
+ of the restart_time is smaller than 'now'.
+
+ If the parameter 'now' is given, its value will be used instead
+ of calling time.time().
+
+ Returns True if the component is restarted, False if not."""
+ if now is None:
+ now = time.time()
+ if self.get_restart_time() is not None and\
+ self.get_restart_time() < now:
self.start()
+ return True
+ else:
+ return False
def running(self):
"""
diff --git a/src/lib/python/isc/bind10/tests/component_test.py b/src/lib/python/isc/bind10/tests/component_test.py
index 15fa470..6bf9e58 100644
--- a/src/lib/python/isc/bind10/tests/component_test.py
+++ b/src/lib/python/isc/bind10/tests/component_test.py
@@ -221,11 +221,6 @@ class ComponentTests(BossUtils, unittest.TestCase):
"""
Check the component restarted successfully.
- Currently, it is implemented as starting it again right away. This will
- change, it will register itself into the restart schedule in boss. But
- as the integration with boss is not clear yet, we don't know how
- exactly that will happen.
-
Reset the self.__start_called to False before calling the function when
the component should fail.
"""
@@ -237,6 +232,16 @@ class ComponentTests(BossUtils, unittest.TestCase):
# Check it can't be started again
self.assertRaises(ValueError, component.start)
+ def __check_not_restarted(self, component):
+ """
+ Check the component has not (yet) restarted successfully.
+ """
+ self.assertFalse(self._shutdown)
+ self.assertTrue(self.__start_called)
+ self.assertFalse(self.__stop_called)
+ self.assertTrue(self.__failed_called)
+ self.assertFalse(component.running())
+
def __do_start_stop(self, kind):
"""
This is a body of a test. It creates a component of given kind,
@@ -296,7 +301,9 @@ class ComponentTests(BossUtils, unittest.TestCase):
component.start()
self.__check_started(component)
# Pretend the component died
- component.failed(1)
+ restarted = component.failed(1)
+ # Since it is a core component, it should not be restarted
+ self.assertFalse(restarted)
# It should bring down the whole server
self.__check_dead(component)
@@ -312,7 +319,9 @@ class ComponentTests(BossUtils, unittest.TestCase):
self.__check_started(component)
self._timeskip()
# Pretend the component died some time later
- component.failed(1)
+ restarted = component.failed(1)
+ # Should not be restarted
+ self.assertFalse(restarted)
# Check the component is still dead
self.__check_dead(component)
@@ -328,7 +337,9 @@ class ComponentTests(BossUtils, unittest.TestCase):
component.start()
self.__check_started(component)
# Make it fail right away.
- component.failed(1)
+ restarted = component.failed(1)
+ # Should not have restarted
+ self.assertFalse(restarted)
self.__check_dead(component)
def test_start_fail_needed_later(self):
@@ -344,37 +355,65 @@ class ComponentTests(BossUtils, unittest.TestCase):
# Make it fail later on
self.__start_called = False
self._timeskip()
- component.failed(1)
+ restarted = component.failed(1)
+ # Should have restarted
+ self.assertTrue(restarted)
self.__check_restarted(component)
def test_start_fail_dispensable(self):
"""
- Start and then fail a dispensable component. Should just get restarted.
+ Start and then fail a dispensable component. Should not get restarted.
"""
# Just ordinary startup
- component = self.__create_component('needed')
+ component = self.__create_component('dispensable')
self.__check_startup(component)
component.start()
self.__check_started(component)
# Make it fail right away
- self.__start_called = False
- component.failed(1)
- self.__check_restarted(component)
+ restarted = component.failed(1)
+ # Should signal that it did not restart
+ self.assertFalse(restarted)
+ self.__check_not_restarted(component)
- def test_start_fail_dispensable(self):
+ def test_start_fail_dispensable_later(self):
"""
Start and then later on fail a dispensable component. Should just get
restarted.
"""
# Just ordinary startup
- component = self.__create_component('needed')
+ component = self.__create_component('dispensable')
self.__check_startup(component)
component.start()
self.__check_started(component)
# Make it fail later on
- self.__start_called = False
self._timeskip()
- component.failed(1)
+ restarted = component.failed(1)
+ # should signal that it restarted
+ self.assertTrue(restarted)
+ # and check if it really did
+ self.__check_restarted(component)
+
+ def test_start_fail_dispensable_restart_later(self):
+ """
+ Start and then fail a dispensable component, wait a bit and try to
+ restart. Should get restarted after the wait.
+ """
+ # Just ordinary startup
+ component = self.__create_component('dispensable')
+ self.__check_startup(component)
+ component.start()
+ self.__check_started(component)
+ # Make it fail immediately
+ restarted = component.failed(1)
+ # should signal that it did not restart
+ self.assertFalse(restarted)
+ self.__check_not_restarted(component)
+ self._timeskip()
+ # try to restart again
+ restarted = component.restart()
+ # should signal that it restarted
+ self.assertTrue(restarted)
+ # and check if it really did
self.__check_restarted(component)
def test_fail_core(self):
@@ -402,14 +441,56 @@ class ComponentTests(BossUtils, unittest.TestCase):
def test_fail_dispensable(self):
"""
Failure to start a dispensable component. The exception should get
- through, but it should be restarted.
+ through, but it should be restarted after a time skip.
"""
component = self.__create_component('dispensable')
self.__check_startup(component)
component._start_internal = self.__fail_to_start
self.assertRaises(TestError, component.start)
+ # tell it to see if it must restart
+ restarted = component.restart()
+ # should not have restarted yet
+ self.assertFalse(restarted)
+ self.__check_not_restarted(component)
+ self._timeskip()
+ # tell it to see if it must restart and do so, with our vision of time
+ restarted = component.restart()
+ # should have restarted now
+ self.assertTrue(restarted)
+ self.__check_restarted(component)
+
+ def test_component_start_time(self):
+ """
+ Check that original start time is set initially, and remains the same
+ after a restart, while the internal __start_time does change
+ """
+ # Just ordinary startup
+ component = self.__create_component('dispensable')
+ self.__check_startup(component)
+ self.assertIsNone(component._original_start_time)
+ component.start()
+ self.__check_started(component)
+
+ self.assertIsNotNone(component._original_start_time)
+ self.assertIsNotNone(component._BaseComponent__start_time)
+ original_start_time = component._original_start_time
+ start_time = component._BaseComponent__start_time
+ # Not restarted yet, so they should be the same
+ self.assertEqual(original_start_time, start_time)
+
+ self._timeskip()
+ # Make it fail
+ restarted = component.failed(1)
+ # should signal that it restarted
+ self.assertTrue(restarted)
+ # and check if it really did
self.__check_restarted(component)
+ # original start time should not have changed
+ self.assertEqual(original_start_time, component._original_start_time)
+ # but actual start time should
+ self.assertNotEqual(start_time, component._BaseComponent__start_time)
+
def test_bad_kind(self):
"""
Test the component rejects nonsensical kinds. This includes bad
More information about the bind10-changes
mailing list