BIND 10 trac213-incremental-restarts, updated. e41f8459ca5dbc886e838e6e32585ba5c7eb96e6 [213] Make tests work again

Tue Nov 1 17:42:43 UTC 2011

The branch, trac213-incremental-restarts has been updated
       via  e41f8459ca5dbc886e838e6e32585ba5c7eb96e6 (commit)
       via  e856c49ae33b2b79d8eab0b313e4ba25db261c4a (commit)
       via  3a6d50835b621e4825ec0d8434ce066bd31020d0 (commit)
      from  3a25578a01620918cd722e430b61c0fe91177e0a (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit e41f8459ca5dbc886e838e6e32585ba5c7eb96e6
Author: Michal 'vorner' Vaner <michal.vaner at nic.cz>
Date:   Tue Nov 1 18:32:22 2011 +0100

    [213] Make tests work again
    
    The ones for brittle mode are turned off for now; the rest needed only
    trivial changes.

commit e856c49ae33b2b79d8eab0b313e4ba25db261c4a
Author: Michal 'vorner' Vaner <michal.vaner at nic.cz>
Date:   Tue Nov 1 18:17:26 2011 +0100

    [213] Make get_processes work

commit 3a6d50835b621e4825ec0d8434ce066bd31020d0
Author: Michal 'vorner' Vaner <michal.vaner at nic.cz>
Date:   Tue Nov 1 17:59:37 2011 +0100

    [213] Do the restarts with components
    
    The restart schedule was left in there, as it might turn out to be
    needed soon. We don't do the restarts after a timeout, but right away
    now (should change soon).
    
    The brittle mode is gone for now.
    
    Unit tests not updated yet, but system tests pass.

-----------------------------------------------------------------------

Summary of changes:
 src/bin/bind10/bind10_src.py.in        |   89 +++++++++-----------------------
 src/bin/bind10/tests/bind10_test.py.in |   33 +++++-------
 2 files changed, 39 insertions(+), 83 deletions(-)

-----------------------------------------------------------------------

diff --git a/src/bin/bind10/bind10_src.py.in b/src/bin/bind10/bind10_src.py.in
index 6fe3693..71fd0be 100755
--- a/src/bin/bind10/bind10_src.py.in
+++ b/src/bin/bind10/bind10_src.py.in
@@ -249,12 +249,12 @@ class BoB:
         self.started_auth_family = False
         self.started_resolver_family = False
         self.curproc = None
+        # XXX: Not used now, waits for reintroduction of restarts.
         self.dead_processes = {}
         self.msgq_socket_file = msgq_socket_file
         self.nocache = nocache
         self.component_config = {}
         self.processes = {}
-        self.expected_shutdowns = {}
         self.runnable = False
         self.uid = setuid
         self.username = username
@@ -373,7 +373,7 @@ class BoB:
         pids.sort()
         process_list = [ ]
         for pid in pids:
-            process_list.append([pid, self.processes[pid].name])
+            process_list.append([pid, self.processes[pid].name()])
         return process_list
 
     def _get_stats_data(self):
@@ -433,8 +433,8 @@ class BoB:
         self.stop_creator(True)
 
         for pid in self.processes:
-            logger.info(BIND10_KILL_PROCESS, self.processes[pid].name)
-            self.processes[pid].process.kill()
+            logger.info(BIND10_KILL_PROCESS, self.processes[pid].name())
+            self.processes[pid].kill(True)
         self.processes = {}
 
     def read_bind10_config(self):
@@ -608,8 +608,6 @@ class BoB:
         self.log_starting(name, port, address)
         newproc = ProcessInfo(name, args, c_channel_env)
         newproc.spawn()
-        # This is now done in register_process()
-        #self.processes[newproc.pid] = newproc
         self.log_started(newproc.pid)
         return newproc
 
@@ -618,10 +616,7 @@ class BoB:
         Put another process into boss to watch over it.  When the process
         dies, the info.failed() is called with the exit code.
         """
-        self.processes[pid] = info._procinfo
-        if info._procinfo is None:
-            # XXX: a short term hack.  This is the sockcreator.
-            self.sockcreator = info._SockCreator__creator
+        self.processes[pid] = info
 
     def start_simple(self, name):
         """
@@ -830,10 +825,6 @@ class BoB:
         (in logs, etc), the recipient is the address on msgq.
         """
         logger.info(BIND10_STOP_PROCESS, process)
-        # TODO: Some timeout to solve processes that don't want to die would
-        # help. We can even store it in the dict, it is used only as a set
-        self.expected_shutdowns[process] = 1
-        # Ask the process to die willingly
         self.cc_session.group_sendmsg({'command': ['shutdown']}, recipient,
             recipient)
 
@@ -885,12 +876,11 @@ class BoB:
         time.sleep(1)
         self.reap_children()
         # next try sending a SIGTERM
-        processes_to_stop = list(self.processes.values())
-        for proc_info in processes_to_stop:
-            logger.info(BIND10_SEND_SIGTERM, proc_info.name,
-                        proc_info.pid)
+        components_to_stop = list(self.processes.values())
+        for component in components_to_stop:
+            logger.info(BIND10_SEND_SIGTERM, component.name(), component.pid())
             try:
-                proc_info.process.terminate()
+                component.kill()
             except OSError:
                 # ignore these (usually ESRCH because the child
                 # finally exited)
@@ -900,12 +890,12 @@ class BoB:
             # XXX: some delay probably useful... how much is uncertain
             time.sleep(0.1)  
             self.reap_children()
-            processes_to_stop = list(self.processes.values())
-            for proc_info in processes_to_stop:
-                logger.info(BIND10_SEND_SIGKILL, proc_info.name,
-                            proc_info.pid)
+            components_to_stop = list(self.processes.values())
+            for component in components_to_stop:
+                logger.info(BIND10_SEND_SIGKILL, component.name(),
+                            component.pid())
                 try:
-                    proc_info.process.kill()
+                    component.kill(True)
                 except OSError:
                     # ignore these (usually ESRCH because the child
                     # finally exited)
@@ -927,43 +917,14 @@ class BoB:
                 # XXX: should be impossible to get any other error here
                 raise
             if pid == 0: break
-            if self.sockcreator is not None and self.sockcreator.pid() == pid:
-                # This is the socket creator, started and terminated
-                # differently. This can't be restarted.
-                if self.runnable:
-                    logger.fatal(BIND10_SOCKCREATOR_CRASHED)
-                    self.sockcreator = None
-                    self.runnable = False
-                # This was inserted in self.processes by register_process.
-                # Now need to remove it.
-                del self.processes[pid]
-            elif pid in self.processes:
+            if pid in self.processes:
                 # One of the processes we know about.  Get information on it.
-                proc_info = self.processes.pop(pid)
-                proc_info.restart_schedule.set_run_stop_time()
-                self.dead_processes[proc_info.pid] = proc_info
-
-                # Write out message, but only if in the running state:
-                # During startup and shutdown, these messages are handled
-                # elsewhere.
-                if self.runnable:
-                    if exit_status is None:
-                        logger.warn(BIND10_PROCESS_ENDED_NO_EXIT_STATUS,
-                                    proc_info.name, proc_info.pid)
-                    else:
-                        logger.warn(BIND10_PROCESS_ENDED_WITH_EXIT_STATUS,
-                                    proc_info.name, proc_info.pid,
-                                    exit_status)
-
-                    # Was it a special process?
-                    if proc_info.name == "b10-msgq":
-                        logger.fatal(BIND10_MSGQ_DAEMON_ENDED)
-                        self.runnable = False
-
-                # If we're in 'brittle' mode, we want to shutdown after
-                # any process dies.
-                if self.brittle:
-                    self.runnable = False
+                component = self.processes.pop(pid)
+                if component.running() and self.runnable:
+                    # Tell it it failed. But only if it matters (we are
+                    # not shutting down and the component considers itself
+                    # to be running).
+                    component.failed(exit_status);
             else:
                 logger.info(BIND10_UNKNOWN_CHILD_PROCESS_ENDED, pid)
 
@@ -986,10 +947,6 @@ class BoB:
         still_dead = {}
         now = time.time()
         for proc_info in self.dead_processes.values():
-            if proc_info.name in self.expected_shutdowns:
-                # We don't restart, we wanted it to die
-                del self.expected_shutdowns[proc_info.name]
-                continue
             restart_time = proc_info.restart_schedule.get_restart_time(now)
             if restart_time > now:
                 if (next_restart is None) or (next_restart > restart_time):
@@ -1191,6 +1148,10 @@ def main():
     while boss_of_bind.runnable:
         # clean up any processes that exited
         boss_of_bind.reap_children()
+        # XXX: As we don't put anything into the processes to be restarted,
+        # this is really a complicated NOP. But we will try to reintroduce
+        # delayed restarts, so it stays here for now, until we find out if
+        # it's useful.
         next_restart = boss_of_bind.restart_processes()
         if next_restart is None:
             wait_time = None
diff --git a/src/bin/bind10/tests/bind10_test.py.in b/src/bin/bind10/tests/bind10_test.py.in
index 85a949a..37b4ab4 100644
--- a/src/bin/bind10/tests/bind10_test.py.in
+++ b/src/bin/bind10/tests/bind10_test.py.in
@@ -467,14 +467,8 @@ class TestStartStopProcessesBob(unittest.TestCase):
         """
         Check if proper combinations of DHCPv4 and DHCpv6 can be started
         """
-        v4found = 0
-        v6found = 0
-
-        for pid in bob.processes:
-            if (bob.processes[pid].name == "b10-dhcp4"):
-                v4found += 1
-            if (bob.processes[pid].name == "b10-dhcp6"):
-                v6found += 1
+        v4found = 'b10-dhcp4' in bob.component_config
+        v6found = 'b10-dhcp6' in bob.component_config
 
         # there should be exactly one DHCPv4 daemon (if v4==True)
         # there should be exactly one DHCPv6 daemon (if v6==True)
@@ -690,6 +684,12 @@ class TestStartStopProcessesBob(unittest.TestCase):
         #bob.cfg_start_dhcp4 = True
         #self.check_started_dhcp(bob, True, True)
 
+class MockComponent:
+    def __init__(self, name, pid):
+        self.name = lambda: name
+        self.pid = lambda: pid
+
+
 class TestBossCmd(unittest.TestCase):
     def test_ping(self):
         """
@@ -712,18 +712,11 @@ class TestBossCmd(unittest.TestCase):
         Confirm getting a list of processes works.
         """
         bob = MockBob()
-        bob.start_all_processes()
+        bob.register_process(1, MockComponent('first', 1))
+        bob.register_process(2, MockComponent('second', 2))
         answer = bob.command_handler("show_processes", None)
-        processes = [[1, 'b10-sockcreator'],
-                     [2, 'b10-msgq'],
-                     [3, 'b10-cfgmgr'], 
-                     [5, 'b10-auth'],
-                     [7, 'b10-xfrout'],
-                     [8, 'b10-xfrin'], 
-                     [9, 'b10-zonemgr'],
-                     [10, 'b10-stats'], 
-                     [11, 'b10-stats-httpd'], 
-                     [12, 'b10-cmdctl']]
+        processes = [[1, 'first'],
+                     [2, 'second']]
         self.assertEqual(answer, {'result': [0, processes]})
 
 class TestParseArgs(unittest.TestCase):
@@ -833,6 +826,8 @@ class TestPIDFile(unittest.TestCase):
         self.assertRaises(IOError, dump_pid,
                           'nonexistent_dir' + os.sep + 'bind10.pid')
 
+# TODO: Do we want brittle mode? Probably yes. So we need to re-enable it after that.
+@unittest.skip("Brittle mode temporarily broken")
 class TestBrittle(unittest.TestCase):
     def test_brittle_disabled(self):
         bob = MockBob()