BIND 10 exp/res-research, updated. a29ce7e1d04d4ff9c6cd9694b52cb16ed03d27e3 [res-research] supported 'override' mode in replay; supported dumping TTL stat
BIND 10 source code commits
bind10-changes at lists.isc.org
Mon Jul 16 08:17:56 UTC 2012
The branch, exp/res-research has been updated
via a29ce7e1d04d4ff9c6cd9694b52cb16ed03d27e3 (commit)
via 187274901df02b6f428ef0802f1bb50372285ac0 (commit)
via a4e94f12217c0fdf4a38e29cb78837a5b1cfcc47 (commit)
via 27b5f4470bd80be6ccf31ee3059fccd1ca05a1a1 (commit)
via 8fdc766b49114c1d3f8f7712a4c920a0cd16f3b8 (commit)
via ed7cc948995b06b510264075d8e2c092f2603436 (commit)
via 344539f99e3e7e71bf647a598714b86809b2f0a0 (commit)
via a3be78760086cc8dc21d226a1d47dc4ed887933d (commit)
via ff19614b834768d0cb1fa6d3898a87e49091addc (commit)
via 01d6791150ed51a015f22dd61b6bcd9b08179d6d (commit)
via 63c3d103f8adaa21812093ed2a286bbdb2cea729 (commit)
from 8815570e3dad3076b298c4226bdebd49d7031f34 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
commit a29ce7e1d04d4ff9c6cd9694b52cb16ed03d27e3
Author: JINMEI Tatuya <jinmei at isc.org>
Date: Mon Jul 16 01:16:57 2012 -0700
[res-research] supported 'override' mode in replay; supported dumping TTL stat
commit 187274901df02b6f428ef0802f1bb50372285ac0
Author: JINMEI Tatuya <jinmei at isc.org>
Date: Mon Jul 16 01:15:05 2012 -0700
[res-research] try to collect even more glue info
commit a4e94f12217c0fdf4a38e29cb78837a5b1cfcc47
Author: JINMEI Tatuya <jinmei at isc.org>
Date: Mon Jul 16 01:14:15 2012 -0700
[res-research] supported dumping TTL stat; allow re-expiring cache entry.
commit 27b5f4470bd80be6ccf31ee3059fccd1ca05a1a1
Author: JINMEI Tatuya <jinmei at isc.org>
Date: Sat Jul 14 21:54:13 2012 -0700
[res-research] dump format fix
commit 8fdc766b49114c1d3f8f7712a4c920a0cd16f3b8
Author: JINMEI Tatuya <jinmei at isc.org>
Date: Sat Jul 14 17:27:31 2012 -0700
[res-research] make parent zone detection more accurate
commit ed7cc948995b06b510264075d8e2c092f2603436
Author: JINMEI Tatuya <jinmei at isc.org>
Date: Sat Jul 14 14:20:46 2012 -0700
[res-research] covered more corner cases, supported stat on ext queries.
commit 344539f99e3e7e71bf647a598714b86809b2f0a0
Author: JINMEI Tatuya <jinmei at isc.org>
Date: Sat Jul 14 14:17:11 2012 -0700
[res-research] fetch more NS addresses for completeness
commit a3be78760086cc8dc21d226a1d47dc4ed887933d
Author: JINMEI Tatuya <jinmei at isc.org>
Date: Sat Jul 14 14:16:11 2012 -0700
[res-research] added interactive view mode to dns_cache
commit ff19614b834768d0cb1fa6d3898a87e49091addc
Author: JINMEI Tatuya <jinmei at isc.org>
Date: Fri Jul 13 22:49:15 2012 -0700
[res-research] fixed some regression
commit 01d6791150ed51a015f22dd61b6bcd9b08179d6d
Author: JINMEI Tatuya <jinmei at isc.org>
Date: Fri Jul 13 16:54:03 2012 -0700
[res-research] cache (dump/(de)serialize) response type information
commit 63c3d103f8adaa21812093ed2a286bbdb2cea729
Author: JINMEI Tatuya <jinmei at isc.org>
Date: Fri Jul 13 16:31:36 2012 -0700
[res-research] collected/dump detailed statistics in the resolver
-----------------------------------------------------------------------
Summary of changes:
exp/res-research/analysis/dns_cache.py | 166 +++++++++++--
exp/res-research/analysis/mini_resolver.py | 279 ++++++++++++++++++---
exp/res-research/analysis/query_replay.py | 371 +++++++++++++++++++++++-----
3 files changed, 690 insertions(+), 126 deletions(-)
-----------------------------------------------------------------------
diff --git a/exp/res-research/analysis/dns_cache.py b/exp/res-research/analysis/dns_cache.py
index 93278d4..02cefe8 100755
--- a/exp/res-research/analysis/dns_cache.py
+++ b/exp/res-research/analysis/dns_cache.py
@@ -16,8 +16,15 @@
# WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
from isc.dns import *
+import cmd
from optparse import OptionParser
import struct
+import sys
+try:
+ # optional module
+ import readline
+except ImportError:
+ pass
# "root hint"
ROOT_SERVERS = [pfx + '.root-servers.net' for pfx in 'abcdefghijklm']
@@ -63,16 +70,19 @@ class CacheEntry:
trust (SimpleDNSCache.TRUST_xxx) The trust level of the cache entry.
msglen (int) The size of the DNS response message from which the cache
entry comes; it's 0 if it's not a result of a DNS message.
+ resp_type (RESP_xxx below): The type of the DNS response message from
+ which the cache entry comes.
rcode (int) Numeric form of corresponding RCODE (converted to int as it's
more memory efficient).
'''
- def __init__(self, ttl, rdata_list, trust, msglen, rcode, id):
+ def __init__(self, ttl, rdata_list, trust, respinfo, rcode, id):
self.ttl = ttl
self.rdata_list = rdata_list
self.trust = trust
- self.msglen = msglen
+ self.msglen = respinfo[0]
+ self.resp_type = respinfo[1]
self.rcode = rcode.get_code()
self.id = id
self.time_updated = None # timestamp of 'creation' or 'update'
@@ -82,6 +92,7 @@ class CacheEntry:
self.rdata_list = other.rdata_list
self.trust = other.trust
self.msglen = other.msglen
+ self.resp_type = other.resp_type
self.rcode = other.rcode
self.id = other.id
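For illustration, the revised CacheEntry constructor takes the message length
and response type as a single 'respinfo' tuple. A minimal sketch, assuming the
isc.dns bindings are importable (values hypothetical):

    from isc.dns import Rcode
    # respinfo = (message length, RESP_xxx response type)
    entry = CacheEntry(300, [], SimpleDNSCache.TRUST_ANSWER,
                       (512, SimpleDNSCache.RESP_FINAL_ANSWER_COMPRESSED),
                       Rcode.NOERROR(), 1)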
@@ -115,13 +126,30 @@ class SimpleDNSCache:
TRUST_GLUE = 3 # referral or glue
TRUST_AUTHADDITIONAL = 4 # additional section records in auth answer
-
# Search options, can be logically OR'ed.
FIND_DEFAULT = 0
FIND_ALLOW_NEGATIVE = 1
FIND_ALLOW_NOANSWER = 2
FIND_ALLOW_CNAME = 4
+ # Response types (when applicable)
+ RESP_FINAL_ANSWER_NONE = 0 # dummy type, for entry not assoc w/ response.
+ RESP_FINAL_ANSWER_COMPRESSED = 1
+ RESP_FINAL_ANSWER_UNCOMPRESSED = 2
+ RESP_CNAME_ANSWER_COMPRESSED = 3
+ RESP_CNAME_ANSWER_UNCOMPRESSED = 4
+ RESP_ANSWER_UNEXPECTED = 5
+ RESP_NXDOMAIN_SOA = 6
+ RESP_NXDOMAIN_NOAUTH = 7
+ RESP_NXDOMAIN_UNEXPECTED = 8
+ RESP_NXRRSET_SOA = 9
+ RESP_NXRRSET_NOAUTH = 10
+ RESP_NXRRSET_UNEXPECTED = 11
+ RESP_REFERRAL_WITHGLUE = 12
+ RESP_REFERRAL_WITHOUTGLUE = 13
+ RESP_REFERRAL_UNEXPECTED = 14
+ RESP_UNEXPECTED = 15
+
def __init__(self):
# top level cache table
self.__table = {}
@@ -229,26 +257,40 @@ class SimpleDNSCache:
If not, update its update/creation time to "now".
Return True if the timestamp is updated; False otherwise.
+ If now is None, the entry is marked expired regardless of its
+ current status.
'''
entry = self.__entries[entry_id]
+
+ if now is None:
+ entry.time_updated = None
+ return True # return value doesn't matter
+
if entry.is_expired(now):
entry.time_updated = now
return True
return False
- def add(self, rrset, trust=TRUST_LOCAL, msglen=0, rcode=Rcode.NOERROR()):
+ def add(self, rrset, trust=TRUST_LOCAL, respinfo=None,
+ rcode=Rcode.NOERROR()):
'''Add a new cache item.
+ respinfo (int, RESP_xxx): if not None, the tuple gives additional
+ information about the response that produced this entry.
+ respinfo[0] is the size of the response.
+
Note: this cache always handles cached data per RR type; even if
NXDOMAIN is type independent, it's still specific to the associated
type within this cache.
'''
+ if respinfo is None:
+ respinfo = (0, self.RESP_FINAL_ANSWER_NONE)
key = (rrset.get_name(), rrset.get_class())
new_entry = self.__create_cache_entry(rrset.get_ttl().get_value(),
- rrset.get_rdata(), trust, msglen,
- rcode)
+ rrset.get_rdata(), trust,
+ respinfo, rcode)
if not key in self.__table:
self.__table[key] = {rrset.get_type(): [new_entry]}
new_entry._table_entry = self.__table[key]
@@ -260,8 +302,8 @@ class SimpleDNSCache:
else:
self.__insert_cache_entry(cur_entries, new_entry)
- def __create_cache_entry(self, ttl, rdata_list, trust, msglen, rcode):
- new_entry = CacheEntry(ttl, rdata_list, trust, msglen, rcode,
+ def __create_cache_entry(self, ttl, rdata_list, trust, respinfo, rcode):
+ new_entry = CacheEntry(ttl, rdata_list, trust, respinfo, rcode,
self.__counter)
self.__entries[self.__counter] = new_entry
self.__counter += 1
@@ -293,25 +335,51 @@ class SimpleDNSCache:
with open(db_file, 'br') as f:
self.__deserialize(f)
+ def dump_name_entry(self, f, name, rrclass):
+ '''Dump cache entries for the given name and class in text.'''
+ entry = self.__table.get((name, rrclass))
+ if entry is not None:
+ self.__dump_table_entry(f, name, rrclass, entry)
+
def __dump_text(self, f):
for key, entry in self.__table.items():
name = key[0]
rrclass = key[1]
- rdata_map = entry
- for rrtype, entries in rdata_map.items():
+ self.__dump_table_entry(f, name, rrclass, entry)
+
+ def __dump_table_entry(self, f, name, rrclass, entry):
+ rdata_map = entry
+ for rrtype, entries in rdata_map.items():
+ for entry in entries:
+ if len(entry.rdata_list) == 0:
+ f.write((';; [%s, TTL=%d, msglen=%d, resptype=%d] ' +
+ '%s/%s/%s\n') %
+ (Rcode(entry.rcode), entry.ttl,
+ entry.msglen, entry.resp_type, name, rrclass,
+ rrtype))
+ else:
+ f.write(';; [msglen=%d, resptype=%d, trust=%d]\n' %
+ (entry.msglen, entry.resp_type, entry.trust))
+ rrset = RRset(name, rrclass, rrtype, RRTTL(entry.ttl))
+ for rdata in entry.rdata_list:
+ rrset.add_rdata(rdata)
+ f.write(rrset.to_text())
+
+ def dump_ttl_stat(self, f, used_only=True):
+ total_stat = {} # TTL => counter
+ for rdata_map in self.__table.values():
+ for entries in rdata_map.values():
for entry in entries:
- if len(entry.rdata_list) == 0:
- f.write(';; [%s, TTL=%d, msglen=%d] %s/%s/%s\n' %
- (str(Rcode(entry.rcode)), entry.ttl,
- entry.msglen, str(name), str(rrclass),
- str(rrtype)))
- else:
- f.write(';; [msglen=%d, trust=%d]\n' %
- (entry.msglen, entry.trust))
- rrset = RRset(name, rrclass, rrtype, RRTTL(entry.ttl))
- for rdata in entry.rdata_list:
- rrset.add_rdata(rdata)
- f.write(rrset.to_text())
+ if used_only and entry.time_updated is None:
+ continue
+
+ if not entry.ttl in total_stat:
+ total_stat[entry.ttl] = 0
+ total_stat[entry.ttl] += 1
+ ttl_list = list(total_stat.keys())
+ ttl_list.sort()
+ for ttl in ttl_list:
+ f.write('%d,%d\n' % (ttl, total_stat[ttl]))
def __serialize(self, f):
'''Dump cache database content to a file in serialized binary format.
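The dump_ttl_stat method above emits one "ttl,count" CSV row per distinct TTL,
sorted by TTL. For example, a cache holding three used entries with TTL 300
and one with TTL 86400 would produce:

    300,3
    86400,1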
@@ -329,6 +397,7 @@ class SimpleDNSCache:
<RCODE value, 1 byte>
<TTL, 4 bytes>
<msglen, 2 bytes>
+ <response type, 1 byte>
<trust, 1 byte>
<# of RDATAs, 2 bytes>
sequence of RDATA, each of which is:
@@ -353,6 +422,7 @@ class SimpleDNSCache:
data += struct.pack('B', entry.rcode)
data += struct.pack('I', entry.ttl)
data += struct.pack('H', entry.msglen)
+ data += struct.pack('B', entry.resp_type)
data += struct.pack('B', entry.trust)
data += struct.pack('H', len(entry.rdata_list))
for rdata in entry.rdata_list:
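As a cross-check of the layout documented above, the fixed per-entry header
packs to 11 bytes with the same struct calls (native sizes, as in the code);
a standalone sketch with hypothetical values:

    import struct

    # RCODE (1) + TTL (4) + msglen (2) + resp type (1) + trust (1) + # RDATAs (2)
    data = struct.pack('B', 0)        # RCODE: NOERROR
    data += struct.pack('I', 3600)    # TTL
    data += struct.pack('H', 512)     # msglen
    data += struct.pack('B', 1)       # response type (RESP_xxx)
    data += struct.pack('B', 2)       # trust
    data += struct.pack('H', 1)       # number of RDATAs
    assert len(data) == 11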
@@ -389,6 +459,7 @@ class SimpleDNSCache:
rcode = struct.unpack('B', f.read(1))[0]
ttl = struct.unpack('I', f.read(4))[0]
msglen = struct.unpack('H', f.read(2))[0]
+ resp_type = struct.unpack('B', f.read(1))[0]
trust = struct.unpack('B', f.read(1))[0]
n_rdata = struct.unpack('H', f.read(2))[0]
rdata_list = []
@@ -398,22 +469,69 @@ class SimpleDNSCache:
rdata_list.append(Rdata(rrtype, rrclass,
f.read(rdata_len)))
entry = self.__create_cache_entry(ttl, rdata_list, trust,
- msglen, Rcode(rcode))
+ (msglen, resp_type),
+ Rcode(rcode))
entries.append(entry)
entries.sort(key=lambda x: x.trust)
self.__table[key][rrtype] = entries
+SimpleDNSCache.RESP_DESCRIPTION = {
+ SimpleDNSCache.RESP_FINAL_ANSWER_COMPRESSED: 'answer compressed',
+ SimpleDNSCache.RESP_FINAL_ANSWER_UNCOMPRESSED: 'answer uncompressed',
+ SimpleDNSCache.RESP_CNAME_ANSWER_COMPRESSED: 'CNAME compressed',
+ SimpleDNSCache.RESP_CNAME_ANSWER_UNCOMPRESSED: 'CNAME uncompressed',
+ SimpleDNSCache.RESP_ANSWER_UNEXPECTED: 'answer, uncommon type',
+ SimpleDNSCache.RESP_NXDOMAIN_SOA: 'NXDOMAIN with SOA',
+ SimpleDNSCache.RESP_NXDOMAIN_NOAUTH:
+ 'NXDOMAIN with empty auth section',
+ SimpleDNSCache.RESP_NXDOMAIN_UNEXPECTED: 'NXDOMAIN, uncommon type',
+ SimpleDNSCache.RESP_NXRRSET_SOA: 'NXRRSET with SOA',
+ SimpleDNSCache.RESP_NXRRSET_NOAUTH: 'NXRRSET with empty auth section',
+ SimpleDNSCache.RESP_NXRRSET_UNEXPECTED: 'NXRRSET, uncommon type',
+ SimpleDNSCache.RESP_REFERRAL_WITHGLUE:
+ 'referral with "in-bailiwick" glue',
+ SimpleDNSCache.RESP_REFERRAL_WITHOUTGLUE:
+ 'referral without "in-bailiwick" glue',
+ SimpleDNSCache.RESP_REFERRAL_UNEXPECTED: 'referral, uncommon type',
+ SimpleDNSCache.RESP_UNEXPECTED: 'uncommon response'
+ }
+
def get_option_parser():
parser = OptionParser(usage='usage: %prog [options] cache_db_file')
parser.add_option("-f", "--dump-file", dest="dump_file", action="store",
- default=None,
help="if specified, file name to dump the cache " + \
"content in text format")
+ parser.add_option("-i", "--interactive", dest="interactive",
+ action="store_true", default=False,
+ help="interactive mode to peek into the cache content")
return parser
+class CacheShell(cmd.Cmd):
+ prompt = '> '
+
+ def __init__(self, cache):
+ cmd.Cmd.__init__(self)
+ self.__cache = cache
+ self.__rrclass = RRClass.IN()
+
+ def do_find(self, arg):
+ name = Name(arg)
+ self.__cache.dump_name_entry(sys.stdout, name, self.__rrclass)
+
+ def do_setclass(self, arg):
+ self.__rrclass = RRClass(arg)
+
+ def do_exit(self, arg):
+ return True
+
def run(db_file, options):
cache = SimpleDNSCache()
+ sys.stdout.write('Loading cache...')
+ sys.stdout.flush()
cache.load(db_file)
+ sys.stdout.write('done\n')
+ if options.interactive:
+ CacheShell(cache).cmdloop()
if options.dump_file is not None:
cache.dump(options.dump_file)
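A hypothetical session with the new interactive mode (cache file name and
content are illustrative; 'find' and 'exit' are the commands defined in
CacheShell above):

    $ ./dns_cache.py -i cache.db
    Loading cache...done
    > find ns.example.com
    ;; [msglen=512, resptype=1, trust=1]
    ns.example.com. 3600 IN A 192.0.2.1
    > exit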
diff --git a/exp/res-research/analysis/mini_resolver.py b/exp/res-research/analysis/mini_resolver.py
index 6212f36..685ac37 100755
--- a/exp/res-research/analysis/mini_resolver.py
+++ b/exp/res-research/analysis/mini_resolver.py
@@ -16,6 +16,7 @@
# WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
from isc.dns import *
+import dns_cache
from dns_cache import SimpleDNSCache, install_root_hint
import datetime
import errno
@@ -64,6 +65,30 @@ class ResQuery:
self.expire = time.time() + self.QUERY_TIMEOUT
self.timer = None # will be set when timer is associated
+class ResolverStatistics:
+ def __init__(self):
+ self.query_timeout = 0
+ self.response_broken = 0
+ self.response_truncated = 0
+ self.__response_stat = {} # SimpleDNSCache.RESP_xxx => counter
+ for type in SimpleDNSCache.RESP_DESCRIPTION.keys():
+ self.__response_stat[type] = 0
+
+ def update_response_stat(self, type):
+ self.__response_stat[type] += 1
+
+ def dump(self, f):
+ f.write('Response statistics:\n')
+ f.write(' query timeout: %d\n' % self.query_timeout)
+ f.write(' broken responses: %d\n' % self.response_broken)
+ f.write(' truncated responses: %d\n' % self.response_truncated)
+
+ descriptions = list(SimpleDNSCache.RESP_DESCRIPTION.keys())
+ descriptions.sort()
+ for type in descriptions:
+ f.write(' %s: %d\n' % (SimpleDNSCache.RESP_DESCRIPTION[type],
+ self.__response_stat[type]))
+
class ResolverContext:
CNAME_LOOP_MAX = 15
FETCH_DEPTH_MAX = 8
@@ -71,7 +96,7 @@ class ResolverContext:
SERVFAIL_TTL = 1800 # cache TTL for 'SERVFAIL' results. BIND9's lame-ttl.
def __init__(self, sock4, sock6, renderer, qname, qclass, qtype, cache,
- query_table, nest=0):
+ query_table, stat, nest=0):
self.__sock4 = sock4
self.__sock6 = sock6
self.__msg = Message(Message.RENDER)
@@ -84,10 +109,12 @@ class ResolverContext:
self.__nest = nest # CNAME loop prevention
self.__debug_level = LOGLVL_INFO
self.__fetch_queries = set()
+ self.__aux_fetch_queries = set()
self.__parent = None # set for internal fetch contexts
self.__cur_zone = None
self.__cur_ns_addr = None
self.__qtable = query_table
+ self.__stat = stat
def set_debug_level(self, level):
self.__debug_level = level
@@ -111,7 +138,7 @@ class ResolverContext:
[postfix]))
def get_aux_queries(self):
- return list(self.__fetch_queries)
+ return list(self.__fetch_queries) + list(self.__aux_fetch_queries)
def __create_query(self):
'''Create a template query. QID will be filled on send.'''
@@ -147,41 +174,53 @@ class ResolverContext:
self.dprint(LOGLVL_DEBUG1, 'no reachable server')
fail_rrset = RRset(self.__qname, self.__qclass, self.__qtype,
RRTTL(self.SERVFAIL_TTL))
- self.__cache.add(fail_rrset, SimpleDNSCache.TRUST_ANSWER, 0,
+ self.__cache.add(fail_rrset, SimpleDNSCache.TRUST_ANSWER,
+ (0, SimpleDNSCache.RESP_FINAL_ANSWER_NONE),
Rcode.SERVFAIL())
return self.__resume_parents()
- def handle_response(self, resp_msg, msglen):
+ def handle_response(self, resp_msg, resp_data):
next_qry = None
+ msglen = len(resp_data)
try:
if not resp_msg.get_header_flag(Message.HEADERFLAG_QR):
self.dprint(LOGLVL_INFO,
'received query when expecting a response')
+ self.__stat.response_broken += 1
raise InternalLame('lame server')
if resp_msg.get_rr_count(Message.SECTION_QUESTION) != 1:
self.dprint(LOGLVL_INFO,
'unexpected # of question in response: %s',
[resp_msg.get_rr_count(Message.SECTION_QUESTION)])
+ self.__stat.response_broken += 1
raise InternalLame('lame server')
question = resp_msg.get_question()[0]
- if question.get_name() != self.__qname or \
- question.get_class() != self.__qclass or \
- question.get_type() != self.__qtype:
+ if (question.get_name() != self.__qname or
+ question.get_class() != self.__qclass or
+ question.get_type() != self.__qtype):
self.dprint(LOGLVL_INFO, 'unexpected response: ' +
'query mismatch actual=%s/%s/%s',
[question.get_name(), question.get_class(),
question.get_type()])
+ self.__stat.response_broken += 1
raise InternalLame('lame server')
if resp_msg.get_qid() != self.__qid:
self.dprint(LOGLVL_INFO, 'unexpected response: '
'QID mismatch; expected=%s, actual=%s',
[self.__qid, resp_msg.get_qid()])
+ self.__stat.response_broken += 1
raise InternalLame('lame server')
+ resp_type = self.__get_response_type(resp_msg, resp_data)
+ self.__stat.update_response_stat(resp_type)
+ if resp_msg.get_header_flag(Message.HEADERFLAG_TC):
+ self.__stat.response_truncated += 1
+ respinfo = (msglen, resp_type)
+
# Look into the response
if (resp_msg.get_header_flag(Message.HEADERFLAG_AA) or
self.__is_cname_response(resp_msg)):
- next_qry = self.__handle_auth_answer(resp_msg, msglen)
+ next_qry = self.__handle_auth_answer(resp_msg, respinfo)
self.__handle_auth_othersections(resp_msg)
elif (resp_msg.get_rr_count(Message.SECTION_ANSWER) == 0 and
resp_msg.get_rr_count(Message.SECTION_AUTHORITY) == 0 and
@@ -189,7 +228,7 @@ class ResolverContext:
resp_msg.get_rcode() == Rcode.NXDOMAIN())):
# Some servers return a negative response without setting AA.
# (Leave next_qry None)
- self.__handle_negative_answer(resp_msg, msglen)
+ self.__handle_negative_answer(resp_msg, respinfo)
elif (resp_msg.get_rcode() == Rcode.NOERROR() and
not resp_msg.get_header_flag(Message.HEADERFLAG_AA)):
authorities = resp_msg.get_section(Message.SECTION_AUTHORITY)
@@ -203,7 +242,7 @@ class ResolverContext:
raise InternalLame('lame server: ' +
'delegation not for subdomain')
ns_addr = self.__handle_referral(resp_msg, ns_rrset,
- msglen)
+ respinfo)
if ns_addr is not None:
next_qry = ResQuery(self, self.__qid, ns_addr)
elif len(self.__fetch_queries) == 0:
@@ -223,12 +262,95 @@ class ResolverContext:
self.dprint(LOGLVL_DEBUG1, 'no usable server')
fail_rrset = RRset(self.__qname, self.__qclass, self.__qtype,
RRTTL(self.SERVFAIL_TTL))
- self.__cache.add(fail_rrset, SimpleDNSCache.TRUST_ANSWER, 0,
+ self.__cache.add(fail_rrset, SimpleDNSCache.TRUST_ANSWER,
+ (0, SimpleDNSCache.RESP_FINAL_ANSWER_NONE),
Rcode.SERVFAIL())
if next_qry is None:
next_qry = self.__resume_parents()
return next_qry
+ def __get_response_type(self, resp_msg, resp_data):
+ if (resp_msg.get_header_flag(Message.HEADERFLAG_AA) and
+ resp_msg.get_rcode() == Rcode.NOERROR() and
+ resp_msg.get_rr_count(Message.SECTION_ANSWER) > 0):
+ answer = resp_msg.get_section(Message.SECTION_ANSWER)[0]
+ # typical positive answer
+ is_cname = (self.__qtype != RRType.CNAME() and
+ self.__qtype != RRType.ANY() and
+ answer.get_type() == RRType.CNAME())
+ is_final = (answer.get_name() == self.__qname)
+ offset_to_answer = 12 + self.__qname.get_length() + 4
+ is_final_compressed = (is_final and
+ resp_data[offset_to_answer] == 0xc0 and
+ resp_data[offset_to_answer + 1] == 12)
+ if is_final_compressed and not is_cname:
+ return SimpleDNSCache.RESP_FINAL_ANSWER_COMPRESSED
+ elif is_final_compressed and is_cname:
+ return SimpleDNSCache.RESP_CNAME_ANSWER_COMPRESSED
+ elif is_final and not is_cname:
+ return SimpleDNSCache.RESP_FINAL_ANSWER_UNCOMPRESSED
+ elif is_final and is_cname:
+ return SimpleDNSCache.RESP_CNAME_ANSWER_UNCOMPRESSED
+ else:
+ return SimpleDNSCache.RESP_ANSWER_UNEXPECTED
+ elif (resp_msg.get_header_flag(Message.HEADERFLAG_AA) and
+ resp_msg.get_rcode() == Rcode.NXDOMAIN() and
+ resp_msg.get_rr_count(Message.SECTION_ANSWER) == 0):
+ # typical NXDOMAIN answer
+ if resp_msg.get_rr_count(Message.SECTION_AUTHORITY) > 0:
+ auth = resp_msg.get_section(Message.SECTION_AUTHORITY)[0]
+ if auth.get_type() == RRType.SOA():
+ return SimpleDNSCache.RESP_NXDOMAIN_SOA
+ elif resp_msg.get_rr_count(Message.SECTION_AUTHORITY) == 0:
+ return SimpleDNSCache.RESP_NXDOMAIN_NOAUTH
+ return SimpleDNSCache.RESP_NXDOMAIN_UNEXPECTED
+ elif (resp_msg.get_header_flag(Message.HEADERFLAG_AA) and
+ resp_msg.get_rcode() == Rcode.NOERROR() and
+ resp_msg.get_rr_count(Message.SECTION_ANSWER) == 0):
+ # typical NXRRSET answer
+ if resp_msg.get_rr_count(Message.SECTION_AUTHORITY) > 0:
+ auth = resp_msg.get_section(Message.SECTION_AUTHORITY)[0]
+ if auth.get_type() == RRType.SOA():
+ return SimpleDNSCache.RESP_NXRRSET_SOA
+ elif resp_msg.get_rr_count(Message.SECTION_AUTHORITY) == 0:
+ return SimpleDNSCache.RESP_NXRRSET_NOAUTH
+ return SimpleDNSCache.RESP_NXRRSET_UNEXPECTED
+ elif (not resp_msg.get_header_flag(Message.HEADERFLAG_AA) and
+ resp_msg.get_rcode() == Rcode.NOERROR() and
+ resp_msg.get_rr_count(Message.SECTION_ANSWER) == 0 and
+ resp_msg.get_rr_count(Message.SECTION_AUTHORITY) > 0):
+ # typical delegation
+ auth = resp_msg.get_section(Message.SECTION_AUTHORITY)[0]
+ if auth.get_type() == RRType.NS():
+ ns_name = auth.get_name()
+ cmp_reln = ns_name.compare(self.__cur_zone).get_relation()
+ if cmp_reln == NameComparisonResult.SUBDOMAIN:
+ # valid delegation
+ if self.__check_bailiwick_glue(resp_msg, auth):
+ return SimpleDNSCache.RESP_REFERRAL_WITHGLUE
+ else:
+ return SimpleDNSCache.RESP_REFERRAL_WITHOUTGLUE
+ return SimpleDNSCache.RESP_REFERRAL_UNEXPECTED
+ else:
+ # last resort: unexpected or uncommon
+ return SimpleDNSCache.RESP_UNEXPECTED
+
+ def __check_bailiwick_glue(self, resp_msg, ns_rrset):
+ '''Check if the message contains "in-bailiwick" glue in additional.'''
+ ns_names = [Name(ns.to_text()) for ns in ns_rrset.get_rdata()]
+ for ad_rrset in resp_msg.get_section(Message.SECTION_ADDITIONAL):
+ if (ad_rrset.get_type() != RRType.A() and
+ ad_rrset.get_type() != RRType.AAAA()):
+ continue
+ ad_name = ad_rrset.get_name()
+ for ns_name in ns_names:
+ if ns_name == ad_name:
+ cmp_reln = self.__cur_zone.compare(ad_name).get_relation()
+ if (cmp_reln == NameComparisonResult.EQUAL or
+ cmp_reln == NameComparisonResult.SUPERDOMAIN):
+ return True
+ return False
+
def __is_cname_response(self, resp_msg):
# From BIND 9: A BIND8 server could return a non-authoritative
# answer when a CNAME is followed. We should treat it as a valid
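The compression check in __get_response_type above relies on DNS wire-format
arithmetic: the header is 12 bytes, the question is the QNAME followed by
2-byte QTYPE and QCLASS fields, and a compression pointer is two bytes
beginning with 0xc0 that here must point back to offset 12 (the QNAME). A
self-contained sketch of the same check on a hand-built message:

    # dummy 12-byte header + question for "a."/A/IN + answer owner name
    # that is a compression pointer back to offset 12
    qname_wire = b'\x01a\x00'
    resp_data = (b'\x00' * 12 + qname_wire + b'\x00\x01' + b'\x00\x01' +
                 b'\xc0\x0c')
    offset_to_answer = 12 + len(qname_wire) + 4
    assert (resp_data[offset_to_answer] == 0xc0 and
            resp_data[offset_to_answer + 1] == 12)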
@@ -242,7 +364,7 @@ class ResolverContext:
return True
return False
- def __handle_auth_answer(self, resp_msg, msglen):
+ def __handle_auth_answer(self, resp_msg, respinfo):
'''Subroutine of handle_response, handling an authoritative answer.'''
if (resp_msg.get_rcode() == Rcode.NOERROR() or
resp_msg.get_rcode() == Rcode.NXDOMAIN()) and \
@@ -254,8 +376,11 @@ class ResolverContext:
answer_rrset.get_class() == self.__qclass):
if any_query or answer_rrset.get_type() == self.__qtype:
found = True
+ if answer_rrset.get_type() == RRType.NS():
+ self.__check_ns_consistency(answer_rrset)
self.__cache.add(answer_rrset,
- SimpleDNSCache.TRUST_ANSWER, msglen)
+ SimpleDNSCache.TRUST_ANSWER,
+ respinfo)
self.dprint(LOGLVL_DEBUG10, 'got a response: %s',
[answer_rrset])
if not any_query:
@@ -263,30 +388,27 @@ class ResolverContext:
# simply ignore the rest.
return None
elif answer_rrset.get_type() == RRType.CNAME():
- return self.__handle_cname(answer_rrset, msglen)
+ return self.__handle_cname(answer_rrset, respinfo)
if found:
return None
raise InternalLame('no answer found in answer section')
elif resp_msg.get_rcode() == Rcode.NXDOMAIN() or \
(resp_msg.get_rcode() == Rcode.NOERROR() and
resp_msg.get_rr_count(Message.SECTION_ANSWER) == 0):
- self.__handle_negative_answer(resp_msg, msglen)
+ self.__handle_negative_answer(resp_msg, respinfo)
return None
raise InternalLame('unexpected answer rcode=' +
str(resp_msg.get_rcode()))
- def __handle_cname(self, cname_rrset, msglen):
+ def __handle_cname(self, cname_rrset, respinfo):
self.dprint(LOGLVL_DEBUG10, 'got an alias: %s', [cname_rrset])
# Chase CNAME with a separate resolver context with
# loop prevention
if self.__nest > self.CNAME_LOOP_MAX:
self.dprint(LOGLVL_INFO, 'possible CNAME loop')
return None
- if self.__parent is not None:
- # Don't chase CNAME in an internal fetch context
- self.dprint(LOGLVL_INFO, 'CNAME in internal fetch')
- return None
+
cname = Name(cname_rrset.get_rdata()[0].to_text())
# Examine the current cache: sometimes it's possible the CNAME has
@@ -302,12 +424,19 @@ class ResolverContext:
' different from cached %s', [cname, cached_cname])
cname = cached_cname
else:
- self.__cache.add(cname_rrset, SimpleDNSCache.TRUST_ANSWER, msglen)
+ self.__cache.add(cname_rrset, SimpleDNSCache.TRUST_ANSWER,
+ respinfo)
+
+ if self.__parent is not None:
+ # Don't chase CNAME in an internal fetch context. Note that we
+ # should still cache the CNAME; otherwise replay would be confused.
+ self.dprint(LOGLVL_INFO, 'CNAME in internal fetch')
+ return None
cname_ctx = ResolverContext(self.__sock4, self.__sock6,
self.__renderer, cname, self.__qclass,
self.__qtype, self.__cache, self.__qtable,
- self.__nest + 1)
+ self.__stat, self.__nest + 1)
cname_ctx.set_debug_level(self.__debug_level)
(qid, ns_addr) = cname_ctx.start()
if ns_addr is not None:
@@ -322,8 +451,11 @@ class ResolverContext:
cmp_reln = ns_owner.compare(self.__cur_zone).get_relation()
if (cmp_reln == NameComparisonResult.SUBDOMAIN or
cmp_reln == NameComparisonResult.EQUAL):
+ if auth_rrset.get_type() == RRType.NS():
+ self.__check_ns_consistency(auth_rrset)
self.__cache.add(auth_rrset,
- SimpleDNSCache.TRUST_AUTHAUTHORITY, 0)
+ SimpleDNSCache.TRUST_AUTHAUTHORITY,
+ (0, SimpleDNSCache.RESP_FINAL_ANSWER_NONE))
for ns_rdata in auth_rrset.get_rdata():
ns_name = Name(ns_rdata.to_text())
cmp_reln = \
@@ -338,10 +470,62 @@ class ResolverContext:
if ad_rrset.get_name() == ns_name:
self.__cache.add(ad_rrset,
SimpleDNSCache.TRUST_AUTHADDITIONAL,
- 0)
+ (0, SimpleDNSCache.RESP_FINAL_ANSWER_NONE))
break
- def __handle_negative_answer(self, resp_msg, msglen):
+ def __check_ns_consistency(self, ns_rrset):
+ # This check could be performed unnecessarily multiple times,
+ # especially if the NS name is part of a CNAME loop. This check
+ # helps prevent redundant fetches.
+ if (self.__cache.find(ns_rrset.get_name(), self.__qclass,
+ RRType.NS())[1] is not None or
+ self.__cache.find(
+ ns_rrset.get_name(), self.__qclass, RRType.NS(),
+ SimpleDNSCache.FIND_ALLOW_NOANSWER,
+ SimpleDNSCache.TRUST_AUTHAUTHORITY)[1] is not None):
+ return
+
+ _, glue_ns_rrset, _ = \
+ self.__cache.find(ns_rrset.get_name(), self.__qclass, RRType.NS(),
+ SimpleDNSCache.FIND_ALLOW_NOANSWER,
+ SimpleDNSCache.TRUST_GLUE)
+ glue_ns_names = []
+ if glue_ns_rrset is not None:
+ glue_ns_names = \
+ [Name(ns.to_text()) for ns in glue_ns_rrset.get_rdata()]
+
+ # Now check the new NS RRset at the higher trust level. We need to
+ # worry about two cases for the post-replay process: the new set
+ # contains a new NS name about which this server knows nothing; or
+ # all new NS names are out-of-zone names and this server knows
+ # nothing about any of them. In these cases we invoke an additional
+ # fetch. We only check the availability of a type A RR for
+ # simplicity. In theory the server might happen to know something
+ # about AAAA, but that's less likely in practice. Besides, the
+ # additional fetch does no harm beyond its overhead.
+ n_unknown_glue = 0
+ new_ns_names = [Name(ns.to_text()) for ns in ns_rrset.get_rdata()]
+ for ns_name in new_ns_names:
+ if self.__cache.find(ns_name,
+ self.__qclass, RRType.A())[0] is not None:
+ continue
+ if not ns_name in glue_ns_names:
+ self.dprint(LOGLVL_INFO, 'no addr info in auth NS name: %s',
+ [ns_name])
+ self.__fetch_ns_addrs(ns_name, self.__aux_fetch_queries, False)
+ else:
+ cmp_reln = self.__cur_zone.compare(ns_name).get_relation()
+ if (cmp_reln != NameComparisonResult.EQUAL and
+ cmp_reln != NameComparisonResult.SUPERDOMAIN):
+ n_unknown_glue += 1
+ if n_unknown_glue == len(new_ns_names): # 2nd case above
+ self.dprint(LOGLVL_INFO,
+ "none of new NS names has addr info at %s",
+ [ns_rrset.get_name()])
+ for ns_name in new_ns_names:
+ self.__fetch_ns_addrs(ns_name, self.__aux_fetch_queries, False)
+
+ def __handle_negative_answer(self, resp_msg, respinfo):
rcode = resp_msg.get_rcode()
if rcode == Rcode.NOERROR():
rcode = Rcode.NXRRSET()
@@ -351,12 +535,13 @@ class ResolverContext:
auth_rrset.get_type() == RRType.SOA():
cmp_result = auth_rrset.get_name().compare(self.__qname)
cmp_reln = cmp_result.get_relation()
- if cmp_reln != NameComparisonResult.EQUAL and \
- cmp_reln != NameComparisonResult.SUPERDOMAIN:
+ if (cmp_reln != NameComparisonResult.EQUAL and
+ cmp_reln != NameComparisonResult.SUPERDOMAIN):
self.dprint(LOGLVL_INFO, 'bogus SOA name for negative: %s',
[auth_rrset.get_name()])
continue
- self.__cache.add(auth_rrset, SimpleDNSCache.TRUST_ANSWER, 0)
+ self.__cache.add(auth_rrset, SimpleDNSCache.TRUST_ANSWER,
+ (0, SimpleDNSCache.RESP_FINAL_ANSWER_NONE))
neg_ttl = get_soa_ttl(auth_rrset.get_rdata()[0])
self.dprint(LOGLVL_DEBUG10,
'got a negative response, code=%s, negTTL=%s',
@@ -369,11 +554,12 @@ class ResolverContext:
neg_ttl = self.DEFAULT_NEGATIVE_TTL
neg_rrset = RRset(self.__qname, self.__qclass, self.__qtype,
RRTTL(neg_ttl))
- self.__cache.add(neg_rrset, SimpleDNSCache.TRUST_ANSWER, msglen, rcode)
+ self.__cache.add(neg_rrset, SimpleDNSCache.TRUST_ANSWER, respinfo,
+ rcode)
- def __handle_referral(self, resp_msg, ns_rrset, msglen):
+ def __handle_referral(self, resp_msg, ns_rrset, respinfo):
self.dprint(LOGLVL_DEBUG10, 'got a referral: %s', [ns_rrset])
- self.__cache.add(ns_rrset, SimpleDNSCache.TRUST_GLUE, msglen)
+ self.__cache.add(ns_rrset, SimpleDNSCache.TRUST_GLUE, respinfo)
additionals = resp_msg.get_section(Message.SECTION_ADDITIONAL)
for ad_rrset in additionals:
cmp_reln = \
@@ -493,20 +679,21 @@ class ResolverContext:
else:
self.__cur_nameservers = nameservers
for ns in ns_names:
- self.__fetch_ns_addrs(ns)
+ self.__fetch_ns_addrs(ns, self.__fetch_queries)
return (v4_addrs, v6_addrs)
- def __fetch_ns_addrs(self, ns_name):
+ def __fetch_ns_addrs(self, ns_name, fetch_queries, set_parent=True):
for type in [RRType.A(), RRType.AAAA()]:
res_ctx = ResolverContext(self.__sock4, self.__sock6,
self.__renderer, ns_name, self.__qclass,
type, self.__cache, self.__qtable,
- self.__nest + 1)
+ self.__stat, self.__nest + 1)
res_ctx.set_debug_level(self.__debug_level)
- res_ctx.__parent = self
+ if set_parent:
+ res_ctx.__parent = self
(qid, ns_addr) = res_ctx.start()
query = ResQuery(res_ctx, qid, ns_addr)
- self.__fetch_queries.add(query)
+ fetch_queries.add(query)
def __resume_parents(self):
ctx = self
@@ -524,7 +711,8 @@ class ResolverContext:
ctx.__parent.dprint(LOGLVL_DEBUG1, 'resumed context failed')
fail_rrset = RRset(ctx.__parent.__qname, ctx.__parent.__qclass,
ctx.__parent.__qtype, RRTTL(self.SERVFAIL_TTL))
- self.__cache.add(fail_rrset, SimpleDNSCache.TRUST_ANSWER, 0,
+ self.__cache.add(fail_rrset, SimpleDNSCache.TRUST_ANSWER,
+ (0, SimpleDNSCache.RESP_FINAL_ANSWER_NONE),
Rcode.SERVFAIL())
# Recursively check grand parents
ctx = ctx.__parent
@@ -652,6 +840,8 @@ class FileResolver:
self.__max_ctxts = int(options.max_query)
self.__dump_file = options.dump_file
self.__serialize_file = options.serialize_file
+ self.__stat = ResolverStatistics()
+ self.__stat_file = options.stat_file
ResQuery.QUERY_TIMEOUT = int(options.query_timeo)
@@ -698,7 +888,7 @@ class FileResolver:
qname = Name(m.group(3))
return ResolverContext(self.__sock4, self.__sock6, self.__renderer,
qname, qclass, qtype, self.__cache,
- self.__query_table)
+ self.__query_table, self.__stat)
def run(self):
while self.__check_status():
@@ -733,6 +923,8 @@ class FileResolver:
self.__cache.dump(self.__dump_file)
if self.__serialize_file is not None:
self.__cache.dump(self.__serialize_file, True)
+ if self.__stat_file is not None:
+ self.__stat_dump()
def __handle(self, s):
pkt, remote = s.recvfrom(4096)
@@ -742,6 +934,7 @@ class FileResolver:
except Exception as ex:
self.dprint(LOGLVL_INFO, 'broken packet from %s: %s',
[remote[0], ex])
+ self.__stat.response_broken += 1
return
self.dprint(LOGLVL_DEBUG10, 'received packet from %s, QID=%s',
[remote[0], self.__msg.get_qid()])
@@ -755,7 +948,7 @@ class FileResolver:
except KeyError as ex:
ctx.dprint(LOGLVL_INFO, 'bug: missing context')
raise ex
- res_qry = ctx.handle_response(self.__msg, len(pkt))
+ res_qry = ctx.handle_response(self.__msg, pkt)
next_queries = [] if res_qry is None else [res_qry]
next_queries.extend(ctx.get_aux_queries())
for res_qry in next_queries:
@@ -771,6 +964,7 @@ class FileResolver:
[remote[0], self.__msg.get_qid()])
def _qry_timeout(self, res_qry):
+ self.__stat.query_timeout += 1
del self.__query_table[(res_qry.qid, res_qry.ns_addr)]
next_res_qry = res_qry.res_ctx.query_timeout(res_qry.ns_addr)
if next_res_qry is None or next_res_qry.res_ctx != res_qry.res_ctx:
@@ -788,6 +982,10 @@ class FileResolver:
timer = QueryTimer(self, next_res_qry)
self.__timerq.add(next_res_qry.expire, timer)
+ def __stat_dump(self):
+ with open(self.__stat_file, 'w') as f:
+ self.__stat.dump(f)
+
def get_option_parser():
parser = OptionParser(usage='usage: %prog [options] query_file')
parser.add_option("-6", "--ipv6-only", dest="ipv6_only",
@@ -807,6 +1005,9 @@ def get_option_parser():
action="store", default=None,
help="if specified, file name to dump the resulting " + \
"cache in the serialized binary format")
+ parser.add_option("-S", "--dump-stat", dest="stat_file",
+ action="store", default=None,
+ help="if specified, file to dump statistics")
parser.add_option("-n", "--max-query", dest="max_query", action="store",
default="10",
help="specify the max # of queries in parallel")
diff --git a/exp/res-research/analysis/query_replay.py b/exp/res-research/analysis/query_replay.py
index 47de267..31c623f 100755
--- a/exp/res-research/analysis/query_replay.py
+++ b/exp/res-research/analysis/query_replay.py
@@ -17,7 +17,7 @@
from isc.dns import *
import parse_qrylog
-import dns_cache
+from dns_cache import SimpleDNSCache, CacheShell
import datetime
from optparse import OptionParser
@@ -64,6 +64,24 @@ class QueryTrace:
hits += log.hits
return hits
+ def get_cache_misses(self):
+ '''Return the total count of cache misses for the query'''
+ misses = 0
+ for log in self.__cache_log:
+ misses += log.misses
+ return misses
+
+ def get_external_query_count(self):
+ '''Return a list of external query counts for cache updates.
+
+ Each list entry is the number of external queries one update needed.
+
+ '''
+ counts = []
+ for log in self.__cache_log:
+ counts.append(len(log.resp_list))
+ return counts
+
def add_cache_log(self, cache_log):
self.__cache_log.append(cache_log)
@@ -72,21 +90,12 @@ class QueryTrace:
return None
return self.__cache_log[-1]
- def cache_expired(self, cache, now):
- '''Check if the cache for this query has expired or is still valid.
+ def update(self, cache, now):
+ '''Update all cached records associated with the query at once.
- For type ANY query, __cache_entries may contain multiple entry IDs.
- In that case we consider it valid as long as one of them is valid.
+ This is effectively only useful for type ANY query results.
'''
- expired = 0
- for cache_entry_id in self.__cache_entries:
- entry = cache.get(cache_entry_id)
- if entry.is_expired(now):
- expired += 1
- return len(self.__cache_entries) == expired
-
- def update(self, cache, now):
for cache_entry_id in self.__cache_entries:
cache.update(cache_entry_id, now)
@@ -96,18 +105,22 @@ class CacheLog:
__time_created (float): Timestamp when an answer for the query is cached.
hits (int): number of cache hits
misses (int): 1 if this is created on cache miss (normal); otherwise 0
- TBD:
- number of external queries involved along with the response sizes
+ resp_list [(int, int)]: additional info on the external queries needed
+ to create this entry, ordered from deepest (leaf zone) to top (the
+ deepest known zone cut toward the query name at resolution time).
+ Each list entry is a (response size, RESP_xxx response type) tuple.
'''
- def __init__(self, now, on_miss=True):
- self.__time_created = now
+ def __init__(self, now, resp_list, on_miss=True):
+ self.time_last_used = now
self.hits = 0
self.misses = 1 if on_miss else 0
+ self.resp_list = resp_list
class ResolverContext:
'''Emulated resolver context.'''
FETCH_DEPTH_MAX = 8 # prevent infinite NS fetch
+ override_mode = False # set by QueryReplay
def __init__(self, qname, qclass, qtype, cache, now, dbg_level, nest=0):
self.__qname = qname
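For illustration, a resp_list recorded on a cache miss that needed three
external queries might look like this (sizes and types hypothetical),
ordered from the final answer up toward the known zone cut:

    resp_list = [(512, SimpleDNSCache.RESP_FINAL_ANSWER_COMPRESSED),
                 (480, SimpleDNSCache.RESP_REFERRAL_WITHGLUE),
                 (230, SimpleDNSCache.RESP_REFERRAL_WITHOUTGLUE)]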
@@ -146,11 +159,12 @@ class ResolverContext:
self.__cur_zone = nameservers.get_name()
self.dprint(LOGLVL_DEBUG10, 'reach a zone cut')
- have_addr, fetch_list = self.__find_ns_addrs(nameservers)
- if not have_addr:
- # If fetching NS addresses fail, we should be at the end of
- # chain
- found, fetch_resps = self.__fetch_ns_addrs(fetch_list)
+ found_addr, n_addr, fetch_list = self.__find_ns_addrs(nameservers)
+ if n_addr == 0:
+ # If fetching NS addresses fails, we should be at the end of
+ # chain.
+ found, fetch_resps = self.__fetch_ns_addrs(fetch_list,
+ found_addr)
resp_list.extend(fetch_resps)
if not found:
chain = chain[:-1]
@@ -167,7 +181,7 @@ class ResolverContext:
break
# Otherwise, go down to the zone one level lower.
- new_id, nameservers = chain[-1]
+ new_id, nameservers, _ = chain[-1]
self.dprint(LOGLVL_DEBUG10, 'update NS at zone %s, trust %s',
[nameservers.get_name(),
self.__cache.get(new_id).trust])
@@ -178,11 +192,13 @@ class ResolverContext:
# query name)
self.dprint(LOGLVL_DEBUG5, 'resolution completed')
self.__cache.update(chain[0][0], self.__now)
+ self.__update_authns(chain[0][1], nameservers)
+ self.__purge_glue(chain[0][1])
return chain[0][1], True, resp_list
- def __fetch_ns_addrs(self, fetch_list):
+ def __fetch_ns_addrs(self, fetch_list, allskip_ok):
if self.__nest > self.FETCH_DEPTH_MAX:
- self.dprint(LOGLVL_INFO, 'reached fetch depth limit, aborted')
+ self.dprint(LOGLVL_DEBUG1, 'reached fetch depth limit, aborted')
return False, []
self.dprint(LOGLVL_DEBUG10, 'no NS addresses are known, fetch them.')
@@ -192,6 +208,14 @@ class ResolverContext:
ns_name, addr_type = fetch[1], fetch[0]
# First, check if we know this RR at all in the first place.
+ # If it were subject to CNAME substitution, this name should be
+ # excluded from the fetch.
+ if self.__cache.find(ns_name, self.__qclass,
+ RRType.CNAME())[0] is not None:
+ self.dprint(LOGLVL_DEBUG1, 'NS name is an alias %s/%s/%s',
+ [ns_name, self.__qclass, addr_type])
+ continue
+
# It could happen that in the original resolution the resolver
# already knew some of the missing addresses as an answer (as a
# result or side effect of prior resolution) and didn't bother to
@@ -210,7 +234,12 @@ class ResolverContext:
rrset, updated, resp_list = res_ctx.resolve()
ret_resp_list.extend(resp_list)
if not updated:
- raise QueryReplaceError('assumption failure')
+ # Rare case, but this one could have been updated in the
+ # middle of these fetch attempts.
+ self.dprint(LOGLVL_DEBUG10,
+ 'NS fetch target has been self updated: %s/%s',
+ [ns_name, addr_type])
+
if rrset.get_rdata_count() > 0: # positive result
self.dprint(LOGLVL_DEBUG10,
'fetching an NS address succeeded for %s/%s/%s',
@@ -220,11 +249,12 @@ class ResolverContext:
'fetching an NS address failed for %s/%s/%s',
[ns_name, self.__qclass, addr_type])
- # We should be able to try fetching at least one of the requested
- # addrs. If not, it means internnal inconsistency.
- if skipped == len(fetch_list):
- raise QueryReplaceError('assumption failure in NS fetch for ' +
- '%s/%s' % (ns_name, addr_type))
+ # Normally, we should be able to try fetching at least one of the
+ # requested addrs; if not, it means internal inconsistency. The
+ # exception is when the caller knows something *negative* about other
+ # addresses and is just checking whether there's hope in missing glue.
+ if not allskip_ok and skipped > 0 and skipped == len(fetch_list):
+ raise QueryReplaceError('assumption failure in NS fetch')
# All attempts fail
self.dprint(LOGLVL_DEBUG10, 'fetching an NS address failed')
@@ -249,38 +279,130 @@ class ResolverContext:
self.dprint(LOGLVL_DEBUG10, 'Update IPv6 glue: %s', [ns_name])
self.__cache.update(id, self.__now)
+ def __update_authns(self, answer_rrset, ns_rrset):
+ '''Update cached auth NS record that comes with an answer.
+
+ We don't do this if the query is for the NS, in which case the
+ authority NS is normally omitted. Likewise, we don't do this for
+ negative answers. It's still not guaranteed that the authority
+ section always has the NS with the answer, but the fact that the
+ cache has the information makes it quite likely (at least one server
+ returned it with an answer in the original resolution).
+
+ '''
+ if self.__qtype == RRType.NS():
+ return
+ if answer_rrset.get_rdata_count() == 0:
+ return
+
+ id = self.__cache.find(ns_rrset.get_name(), self.__qclass, RRType.NS(),
+ SimpleDNSCache.FIND_ALLOW_NOANSWER,
+ SimpleDNSCache.TRUST_AUTHAUTHORITY)[2]
+ if id is not None:
+ self.dprint(LOGLVL_DEBUG10, 'update auth NS')
+ self.__cache.update(id, self.__now)
+
+ if self.override_mode:
+ id = self.__cache.find(ns_rrset.get_name(), self.__qclass,
+ RRType.NS(),
+ SimpleDNSCache.FIND_ALLOW_NOANSWER,
+ SimpleDNSCache.TRUST_GLUE)[2]
+ if id is not None:
+ self.dprint(LOGLVL_DEBUG10, 'purge glue NS')
+ self.__cache.update(id, None)
+
+ def __purge_glue(self, answer_rrset):
+ if (not self.override_mode or
+ (answer_rrset.get_type() != RRType.A() and
+ answer_rrset.get_type() != RRType.AAAA())):
+ return
+
+ id = self.__cache.find(answer_rrset.get_name(), self.__qclass,
+ answer_rrset.get_type(),
+ SimpleDNSCache.FIND_ALLOW_NOANSWER,
+ SimpleDNSCache.TRUST_GLUE)[2]
+ if id is not None:
+ self.dprint(LOGLVL_DEBUG10, 'purge glue record due to answer')
+ self.__cache.update(id, None)
+
def __get_resolve_chain(self):
chain = []
resp_list = []
rcode, answer_rrset, id = \
self.__cache.find(self.__qname, self.__qclass, self.__qtype,
- dns_cache.SimpleDNSCache.FIND_ALLOW_CNAME |
- dns_cache.SimpleDNSCache.FIND_ALLOW_NEGATIVE)
+ SimpleDNSCache.FIND_ALLOW_CNAME |
+ SimpleDNSCache.FIND_ALLOW_NEGATIVE)
entry = self.__cache.get(id)
- chain.append((id, answer_rrset))
- resp_list.append(entry.msglen)
+ chain.append((id, answer_rrset, entry))
+ resp_list.append((entry.msglen, entry.resp_type))
if not entry.is_expired(self.__now):
return chain, []
+ # Build full chain to the root. parent_zones[i] will be set to the
+ # parent zone for chain[i-1].
+ parent_zones = []
for l in range(0, self.__qname.get_labelcount()):
zname = self.__qname.split(l)
- _, ns_rrset, id = self.__find_delegate_info(zname, RRType.NS())
- if ns_rrset is None:
+ rcode, ns_rrset, id = self.__find_delegate_info(zname, RRType.NS())
+ if ns_rrset is None or ns_rrset.get_rdata_count() == 0:
+ # This could return a negative result; we should simply
+ # ignore this case.
continue
+ parent_zones.append(zname)
entry = self.__cache.get(id)
self.dprint(LOGLVL_DEBUG10, 'build resolve chain at %s, trust %s',
[zname, entry.trust])
- chain.append((id, ns_rrset))
- if not entry.is_expired(self.__now):
- return chain, resp_list
- resp_list.append(entry.msglen)
+ chain.append((id, ns_rrset, entry))
+ resp_list.append((entry.msglen, entry.resp_type))
+
+ # The last entry of parent_zones should be root. Its parent should
+ # be itself.
+ parent_zones.append(parent_zones[-1])
+
+ # Then find the deepest level where complete delegation information
+ # is available (NS and at least one NS address are active).
+ for i in range(1, len(chain)):
+ entry = chain[i][2]
+ zname = chain[i][1].get_name()
+ if (not entry.is_expired(self.__now) and
+ self.__is_glue_active(zname, parent_zones[i], entry)):
+ self.dprint(LOGLVL_DEBUG10,
+ 'located the deepest active delegation to %s at %s',
+ [zname, parent_zones[i]])
+ return chain[:i + 1], resp_list[:i]
# In our setup root server should be always available.
- raise QueryReplaceError('no name server found for ' + str(qname))
+ raise QueryReplaceError('no name server found for ' +
+ str(self.__qname))
+
+ def __is_glue_active(self, zname, parent_zname, cache_entry):
+ has_inzone_glue = False
+ for ns_name in [Name(ns.to_text()) for ns in cache_entry.rdata_list]:
+ cmp_reln = parent_zname.compare(ns_name).get_relation()
+ if (cmp_reln == NameComparisonResult.SUPERDOMAIN or
+ cmp_reln == NameComparisonResult.EQUAL):
+ has_inzone_glue = True
+
+ # If an address record is active and cached, we can start
+ # the delegation from this point.
+ for rrtype in [RRType.A(), RRType.AAAA()]:
+ if self.__find_delegate_info(ns_name, rrtype,
+ True)[1] is not None:
+ return True
+
+ # We don't have any usable address record at this point. If there's
+ # at least one in-zone glue, the replay logic would assume it should
+ # be usable by the replay time. So we cannot start from this level.
+ if has_inzone_glue:
+ self.dprint(LOGLVL_DEBUG10,
+ 'active NS is found but no usable address at %s',
+ [zname])
+ return False
+ return True
def __find_ns_addrs(self, nameservers):
- # We only need to know whether we have at least one usable address:
- have_address = False
+ found = False # whether we know any active info about an address
+ n_addrs = 0 # num of actually usable address
# Record any missing address to be fetched.
fetch_list = []
@@ -290,8 +412,14 @@ class ResolverContext:
if rrset4 is not None and rrset4.get_rdata_count() > 0:
self.dprint(LOGLVL_DEBUG10, 'found %s IPv4 address for NS %s',
[rrset4.get_rdata_count(), ns_name])
- have_address = True
- elif rcode is None:
+ found = True
+ n_addrs += 1
+ elif rcode is not None:
+ self.dprint(LOGLVL_DEBUG10,
+ 'IPv4 address for NS %s is known to be unusable',
+ [ns_name])
+ found = True
+ else:
fetch_list.append((RRType.A(), ns_name))
rcode, rrset6, id = \
@@ -299,11 +427,17 @@ class ResolverContext:
if rrset6 is not None and rrset6.get_rdata_count() > 0:
self.dprint(LOGLVL_DEBUG10, 'found %s IPv6 address for NS %s',
[rrset6.get_rdata_count(), ns_name])
- have_address = True
- elif rcode is None:
+ found = True
+ n_addrs += 1
+ elif rcode is not None:
+ self.dprint(LOGLVL_DEBUG10,
+ 'IPv6 address for NS %s is known to be unusable',
+ [ns_name])
+ found = True
+ else:
fetch_list.append((RRType.AAAA(), ns_name))
- return have_address, fetch_list
+ return found, n_addrs, fetch_list
def __find_delegate_info(self, name, rrtype, active_only=False):
'''Find an RRset from the cache that can be used for delegation.
@@ -319,7 +453,7 @@ class ResolverContext:
options = 0
ans_rcode, ans_rrset, ans_id = \
self.__cache.find(name, self.__qclass, rrtype,
- dns_cache.SimpleDNSCache.FIND_ALLOW_NEGATIVE)
+ SimpleDNSCache.FIND_ALLOW_NEGATIVE)
if (ans_rcode is not None and
not self.__cache.get(ans_id).is_expired(self.__now)):
return ans_rcode, ans_rrset, ans_id
@@ -331,8 +465,8 @@ class ResolverContext:
if rrtype == RRType.NS():
rcode, rrset, id = \
self.__cache.find(name, self.__qclass, rrtype,
- dns_cache.SimpleDNSCache.FIND_ALLOW_NOANSWER,
- dns_cache.SimpleDNSCache.TRUST_AUTHAUTHORITY)
+ SimpleDNSCache.FIND_ALLOW_NOANSWER,
+ SimpleDNSCache.TRUST_AUTHAUTHORITY)
if (rrset is not None and
not self.__cache.get(id).is_expired(self.__now)):
return rcode, rrset, id
@@ -343,8 +477,8 @@ class ResolverContext:
# explicitly requested to exclude expired ones.
rcode, rrset, id = \
self.__cache.find(name, self.__qclass, rrtype,
- dns_cache.SimpleDNSCache.FIND_ALLOW_NOANSWER,
- dns_cache.SimpleDNSCache.TRUST_GLUE)
+ SimpleDNSCache.FIND_ALLOW_NOANSWER,
+ SimpleDNSCache.TRUST_GLUE)
if (rrset is not None and
(not active_only or
not self.__cache.get(id).is_expired(self.__now))):
@@ -353,8 +487,8 @@ class ResolverContext:
return None, None, None
class QueryReplay:
- CACHE_OPTIONS = dns_cache.SimpleDNSCache.FIND_ALLOW_CNAME | \
- dns_cache.SimpleDNSCache.FIND_ALLOW_NEGATIVE
+ CACHE_OPTIONS = SimpleDNSCache.FIND_ALLOW_CNAME | \
+ SimpleDNSCache.FIND_ALLOW_NEGATIVE
def __init__(self, log_file, cache, dbg_level):
self.__log_file = log_file
@@ -364,12 +498,25 @@ class QueryReplay:
self.__queries = {}
self.__total_queries = 0
self.__query_params = None
+ self.__override = options.override
self.__cur_query = None # use for debug out
self.__dbg_level = int(dbg_level)
self.__resp_msg = Message(Message.RENDER) # for resp size estimation
self.__renderer = MessageRenderer()
self.__rcode_stat = {} # RCODE value (int) => query counter
self.__qtype_stat = {} # RR type => query counter
+ self.__extqry1_stat = {} # #-of-extqry for creation => counter
+ self.__extqry_update_stat = {} # #-of-extqry for update => counter
+ self.__extqry_total_stat = {} # total count of the above two (shortcut)
+ self.__extresp_stat = {} # RESP_xxx => counter
+ self.__extresp_stat[0] = 0
+ for resptype in SimpleDNSCache.RESP_DESCRIPTION.keys():
+ self.__extresp_stat[resptype] = 0
+ self.__interactive = options.interactive
+ # Session wide constant for all resolver context instances
+ ResolverContext.override_mode = options.override
+ self.cache_samettl_hits = 0 # #-of cache hits that happen within 1s
+ self.cache_total_hits = 0 # #-of total cache hits (shortcut)
def dprint(self, level, msg, params=[]):
'''Dump a debug/log message.'''
@@ -393,6 +540,8 @@ class QueryReplay:
except Exception as ex:
self.dprint(LOGLVL_INFO,
'error (%s) at line: %s', [ex, log_line])
+ if self.__interactive:
+ CacheShell(self.__cache).cmdloop()
raise ex
return self.__total_queries, len(self.__queries)
@@ -431,15 +580,22 @@ class QueryReplay:
self.dprint(LOGLVL_DEBUG3,
'cache miss, updated with %s messages (%s)',
[len(resp_list), resp_list])
- cache_log = CacheLog(qry_time)
+ cache_log = CacheLog(qry_time, resp_list)
qinfo.add_cache_log(cache_log)
+ self.__update_extqry_stat(qinfo, resp_list)
+ self.__update_extresp_stat(resp_list)
else:
self.dprint(LOGLVL_DEBUG3, 'cache hit')
cache_log = qinfo.get_last_cache()
if cache_log is None:
- cache_log = CacheLog(qry_time, False)
+ cache_log = CacheLog(qry_time, [], False)
qinfo.add_cache_log(cache_log)
+ else:
+ if int(cache_log.time_last_used) == int(qry_time):
+ self.cache_samettl_hits += 1
+ cache_log.time_last_used = qry_time
cache_log.hits += 1
+ self.cache_total_hits += 1
def __check_expired(self, qinfo, qname, qclass, qtype, now):
if qtype == RRType.ANY():
@@ -562,12 +718,31 @@ class QueryReplay:
key=lambda x: -self.__queries[x].get_query_count())
return self.__query_params
+ def __update_extqry_stat(self, qinfo, resp_list):
+ n_extqry = len(resp_list)
+ stats = [self.__extqry_total_stat]
+ if qinfo.get_cache_misses() == 1:
+ # This is the first creation of this cache entry
+ stats.append(self.__extqry1_stat)
+ else:
+ stats.append(self.__extqry_update_stat)
+
+ for stat in stats:
+ if n_extqry not in stat:
+ stat[n_extqry] = 0
+ stat[n_extqry] += 1
+
+ def __update_extresp_stat(self, resp_list):
+ for resp in resp_list:
+ self.__extresp_stat[resp[1]] += 1
+
def dump_popularity_stat(self, dump_file):
cumulative_n_qry = 0
cumulative_cache_hits = 0
position = 1
with open(dump_file, 'w') as f:
- f.write('position,% in total,hit rate,#CNAME,resp-size\n')
+ f.write(('position,% in total,hit rate,#CNAME,av ext qry,' +
+ 'resp-size\n'))
for qry_param in self.__get_query_params():
qinfo = self.__queries[qry_param]
n_queries = qinfo.get_query_count()
@@ -579,9 +754,16 @@ class QueryReplay:
cumulative_hit_rate = \
(float(cumulative_cache_hits) / cumulative_n_qry) * 100
- f.write('%d,%.2f,%.2f,%d,%d\n' %
+ n_ext_queries_list = qinfo.get_external_query_count()
+ n_ext_queries = 0
+ for n in n_ext_queries_list:
+ n_ext_queries += n
+
+ f.write('%d,%.2f,%.2f,%d,%.2f,%d\n' %
(position, cumulative_percentage, cumulative_hit_rate,
- len(qinfo.cname_trace), qinfo.resp_size))
+ len(qinfo.cname_trace),
+ float(n_ext_queries) / len(n_ext_queries_list),
+ qinfo.resp_size))
position += 1
def dump_queries(self, dump_file):
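With the new "av ext qry" column, each row of the popularity dump now carries
the average number of external queries per cache update for that query.
Hypothetical output:

    position,% in total,hit rate,#CNAME,av ext qry,resp-size
    1,12.50,99.20,0,2.00,512
    2,20.10,98.75,1,3.50,230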
@@ -606,12 +788,52 @@ class QueryReplay:
for qtype in qtypes:
print('%s: %d' % (qtype, self.__qtype_stat[qtype]))
+ def dump_extqry_stat(self, dump_file):
+ stats = [('All', self.__extqry_total_stat),
+ ('Initial Creation', self.__extqry1_stat),
+ ('Update', self.__extqry_update_stat)]
+ with open(dump_file, 'w') as f:
+ for stat in stats:
+ total_qry_count = 0
+ total_res_count = 0
+ for res_count, qry_count in stat[1].items():
+ total_res_count += res_count * qry_count
+ total_qry_count += qry_count
+ av_count = -1
+ if total_qry_count > 0: # workaround to avoid div by 0
+ av_count = float(total_res_count) / total_qry_count
+ f.write('%s,average=%.2f,count=%d\n' %
+ (stat[0], av_count, total_res_count))
+ # dump histogram
+ count_list = list(stat[1].keys())
+ count_list.sort()
+ for count in count_list:
+ f.write('%d,%d\n' % (count, stat[1][count]))
+
+ def dump_extresp_stat(self):
+ print('Response statistics:\n')
+ descriptions = list(SimpleDNSCache.RESP_DESCRIPTION.keys())
+ descriptions.sort()
+ print(' %s: %d' % ('Unknown', self.__extresp_stat[0]))
+ for type in descriptions:
+ print(' %s: %d' % (SimpleDNSCache.RESP_DESCRIPTION[type],
+ self.__extresp_stat[type]))
+
+ def dump_ttl_stat(self, dump_file):
+ print('TTL statistics:\n')
+ with open(dump_file, 'w') as f:
+ self.__cache.dump_ttl_stat(f)
+
def main(log_file, options):
- cache = dns_cache.SimpleDNSCache()
+ cache = SimpleDNSCache()
cache.load(options.cache_dbfile)
replay = QueryReplay(log_file, cache, options.dbg_level)
total_queries, uniq_queries = replay.replay()
print('Replayed %d queries (%d unique)' % (total_queries, uniq_queries))
+ print('%d cache hits (%.2f%%), %d at same TTL' %
+ (replay.cache_total_hits,
+ (float(replay.cache_total_hits) / total_queries) * 100,
+ replay.cache_samettl_hits))
if options.popularity_file is not None:
replay.dump_popularity_stat(options.popularity_file)
if options.query_dump_file is not None:
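For reference, dump_extqry_stat above writes a summary line per category
followed by "query-count,frequency" histogram rows; a hypothetical excerpt
(the "All" averages and count follow from the rows shown):

    All,average=1.47,count=4930
    1,2100
    2,950
    3,310
    Initial Creation,average=1.52,count=3200
    ...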
@@ -620,6 +842,12 @@ def main(log_file, options):
replay.dump_rcode_stat()
if options.dump_qtype_stat:
replay.dump_qtype_stat()
+ if options.dump_extqry_file is not None:
+ replay.dump_extqry_stat(options.dump_extqry_file)
+ if options.dump_resp_stat:
+ replay.dump_extresp_stat()
+ if options.ttl_stat_file is not None:
+ replay.dump_ttl_stat(options.ttl_stat_file)
def get_option_parser():
parser = OptionParser(usage='usage: %prog [options] log_file')
@@ -629,18 +857,35 @@ def get_option_parser():
parser.add_option("-d", "--dbg-level", dest="dbg_level", action="store",
default=0,
help="specify the verbosity level of debug output")
+ parser.add_option("-i", "--interactive", dest="interactive",
+ action="store_true", default=False,
+ help="enter interactive cache session on finding a bug")
+ parser.add_option("-o", "--overrride",
+ dest="override", action="store_true",
+ default=False,
+ help="run 'override' mode, purging lower rank caches")
parser.add_option("-p", "--dump-popularity",
dest="popularity_file", action="store",
- help="dump statistics per query popularity")
+ help="file to dump statistics per query popularity")
parser.add_option("-q", "--dump-queries",
dest="query_dump_file", action="store",
help="dump unique queries")
+ parser.add_option("-Q", "--extqry-stat-file",
+ dest="dump_extqry_file", action="store",
+ help="file to dump statistics on external queries")
parser.add_option("-r", "--dump-rcode-stat",
dest="dump_rcode_stat", action="store_true",
default=False, help="dump per RCODE statistics")
+ parser.add_option("-R", "--dump-response-stat",
+ dest="dump_resp_stat", action="store_true",
+ default=False,
+ help="dump statistics about external responses")
parser.add_option("-t", "--dump-qtype-stat",
dest="dump_qtype_stat", action="store_true",
default=False, help="dump per type statistics")
+ parser.add_option("-T", "--ttl-stat-file",
+ dest="ttl_stat_file", action="store",
+ help="dump cache TTL statistics to the file")
return parser
if __name__ == "__main__":
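Putting the new options together, a hypothetical replay run (other required
options elided) might report:

    $ ./query_replay.py -o -R -T ttl.csv -Q extqry.csv query.log
    Replayed 100000 queries (3500 unique)
    87650 cache hits (87.65%), 1234 at same TTL
    Response statistics:
    ...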