changeset 24735:07200e3332a1

tags: extract .hgtags filenodes cache to a standalone file Resolution of .hgtags filenodes values has historically been a performance pain point for large repositories, where reading individual manifests can take over 100ms. Multiplied by hundreds or even thousands of heads and resolving .hgtags filenodes becomes a performance issue. This patch establishes a standalone cache file holding the .hgtags filenodes for each changeset. After this patch, the .hgtags filenode for any particular changeset should only have to be computed once during the lifetime of the repository. The introduced hgtagsfnodes1 cache file is modeled after the rev branch cache: the cache is effectively an array of entries consisting of a changeset fragment and the filenode for a revision. The file grows in proportion to the length of the repository (24 bytes per changeset) and is truncated when the repository is stripped. The file is not written unless tag info is requested and tags have changed since last time. This patch partially addresses issue4550. Future patches will split the "tags" cache file into per-filter files and will refactor the cache format to not capture the .hgtags fnodes, as these are now stored in the hgtagsfnodes1 cache. This patch is capable of standing alone. We should not have to wait on the tags cache filter split and format refactor for this patch to land.
author Gregory Szorc <gregory.szorc@gmail.com>
date Wed, 15 Apr 2015 17:42:38 -0400
parents fb6cb1b82f4f
children f2fd087a75ef
files mercurial/tags.py tests/test-tags.t
diffstat 2 files changed, 363 insertions(+), 21 deletions(-) [+]
line wrap: on
line diff
--- a/mercurial/tags.py	Wed Apr 15 12:18:05 2015 -0400
+++ b/mercurial/tags.py	Wed Apr 15 17:42:38 2015 -0400
@@ -15,15 +15,36 @@
 import util
 import encoding
 import error
+from array import array
 import errno
 import time
 
-# The tags cache stores information about heads and the history of tags.
+# Tags computation can be expensive and caches exist to make it fast in
+# the common case.
+#
+# The "hgtagsfnodes1" cache file caches the .hgtags filenode values for
+# each revision in the repository. The file is effectively an array of
+# fixed length records. Read the docs for "hgtagsfnodescache" for technical
+# details.
+#
+# The .hgtags filenode cache grows in proportion to the length of the
+# changelog. The file is truncated when the # changelog is stripped.
+#
+# The purpose of the filenode cache is to avoid the most expensive part
+# of finding global tags, which is looking up the .hgtags filenode in the
+# manifest for each head. This can take dozens or over 100ms for
+# repositories with very large manifests. Multiplied by dozens or even
+# hundreds of heads and there is a significant performance concern.
+#
+# The "tags" cache stores information about heads and the history of tags.
 #
 # The cache file consists of two parts. The first part maps head nodes
 # to .hgtags filenodes. The second part is a history of tags. The two
 # parts are separated by an empty line.
 #
+# The filenodes part of "tags" has effectively been superseded by
+# "hgtagsfnodes1." It is being kept around for backwards compatbility.
+#
 # The first part consists of lines of the form:
 #
 #   <headrev> <headnode> [<hgtagsnode>]
@@ -40,11 +61,6 @@
 # written against the current tip is all that is needed to check whether
 # the cache is up to date).
 #
-# The purpose of the filenode cache is to avoid the most expensive part
-# of finding global tags, which is looking up the .hgtags filenode in the
-# manifest for each head. This can take over a minute on repositories
-# that have large manifests and many heads.
-#
 # The second part of the tags cache consists of lines of the form:
 #
 #   <node> <tag>
@@ -324,20 +340,19 @@
     # This is the most expensive part of finding tags, so performance
     # depends primarily on the size of newheads.  Worst case: no cache
     # file, so newheads == repoheads.
+    fnodescache = hgtagsfnodescache(repo.unfiltered())
     for head in reversed(newheads):
-        cctx = repo[head]
-        try:
-            fnode = cctx.filenode('.hgtags')
+        fnode = fnodescache.getfnode(head)
+        if fnode != nullid:
             cachefnode[head] = fnode
-        except error.LookupError:
-            # no .hgtags file on this head
-            pass
+
+    fnodescache.write()
 
     duration = time.time() - starttime
     ui.log('tagscache',
-           'resolved %d tags cache entries from %d manifests in %0.4f '
+           '%d/%d cache hits/lookups in %0.4f '
            'seconds\n',
-           len(cachefnode), len(newheads), duration)
+           fnodescache.hitcount, fnodescache.lookupcount, duration)
 
     # Caller has to iterate over all heads, but can use the filenodes in
     # cachefnode to get to each .hgtags revision quickly.
@@ -388,3 +403,153 @@
         cachefile.close()
     except (OSError, IOError):
         pass
+
+_fnodescachefile = 'cache/hgtagsfnodes1'
+_fnodesrecsize = 4 + 20 # changeset fragment + filenode
+_fnodesmissingrec = '\xff' * 24
+
+class hgtagsfnodescache(object):
+    """Persistent cache mapping revisions to .hgtags filenodes.
+
+    The cache is an array of records. Each item in the array corresponds to
+    a changelog revision. Values in the array contain the first 4 bytes of
+    the node hash and the 20 bytes .hgtags filenode for that revision.
+
+    The first 4 bytes are present as a form of verification. Repository
+    stripping and rewriting may change the node at a numeric revision in the
+    changelog. The changeset fragment serves as a verifier to detect
+    rewriting. This logic is shared with the rev branch cache (see
+    branchmap.py).
+
+    The instance holds in memory the full cache content but entries are
+    only parsed on read.
+
+    Instances behave like lists. ``c[i]`` works where i is a rev or
+    changeset node. Missing indexes are populated automatically on access.
+    """
+    def __init__(self, repo):
+        assert repo.filtername is None
+
+        self._repo = repo
+
+        # Only for reporting purposes.
+        self.lookupcount = 0
+        self.hitcount = 0
+
+        self._raw = array('c')
+
+        data = repo.vfs.tryread(_fnodescachefile)
+        self._raw.fromstring(data)
+
+        # The end state of self._raw is an array that is of the exact length
+        # required to hold a record for every revision in the repository.
+        # We truncate or extend the array as necessary. self._dirtyoffset is
+        # defined to be the start offset at which we need to write the output
+        # file. This offset is also adjusted when new entries are calculated
+        # for array members.
+        cllen = len(repo.changelog)
+        wantedlen = cllen * _fnodesrecsize
+        rawlen = len(self._raw)
+
+        self._dirtyoffset = None
+
+        if rawlen < wantedlen:
+            self._dirtyoffset = rawlen
+            self._raw.extend('\xff' * (wantedlen - rawlen))
+        elif rawlen > wantedlen:
+            # There's no easy way to truncate array instances. This seems
+            # slightly less evil than copying a potentially large array slice.
+            for i in range(rawlen - wantedlen):
+                self._raw.pop()
+            self._dirtyoffset = len(self._raw)
+
+    def getfnode(self, node):
+        """Obtain the filenode of the .hgtags file at a specified revision.
+
+        If the value is in the cache, the entry will be validated and returned.
+        Otherwise, the filenode will be computed and returned.
+
+        If an .hgtags does not exist at the specified revision, nullid is
+        returned.
+        """
+        ctx = self._repo[node]
+        rev = ctx.rev()
+
+        self.lookupcount += 1
+
+        offset = rev * _fnodesrecsize
+        record = self._raw[offset:offset + _fnodesrecsize].tostring()
+        properprefix = node[0:4]
+
+        # Validate and return existing entry.
+        if record != _fnodesmissingrec:
+            fileprefix = record[0:4]
+
+            if fileprefix == properprefix:
+                self.hitcount += 1
+                return record[4:]
+
+            # Fall through.
+
+        # If we get here, the entry is either missing or invalid. Populate it.
+        try:
+            fnode = ctx.filenode('.hgtags')
+        except error.LookupError:
+            # No .hgtags file on this revision.
+            fnode = nullid
+
+        # Slices on array instances only accept other array.
+        entry = array('c', properprefix + fnode)
+        self._raw[offset:offset + _fnodesrecsize] = entry
+        # self._dirtyoffset could be None.
+        self._dirtyoffset = min(self._dirtyoffset, offset) or 0
+
+        return fnode
+
+    def write(self):
+        """Perform all necessary writes to cache file.
+
+        This may no-op if no writes are needed or if a write lock could
+        not be obtained.
+        """
+        if self._dirtyoffset is None:
+            return
+
+        data = self._raw[self._dirtyoffset:]
+        if not data:
+            return
+
+        repo = self._repo
+
+        try:
+            lock = repo.wlock(wait=False)
+        except error.LockHeld:
+            repo.ui.log('tagscache',
+                        'not writing .hg/%s because lock held\n' %
+                        (_fnodescachefile))
+            return
+
+        try:
+            try:
+                f = repo.vfs.open(_fnodescachefile, 'ab')
+                try:
+                    # if the file has been truncated
+                    actualoffset = f.tell()
+                    if actualoffset < self._dirtyoffset:
+                        self._dirtyoffset = actualoffset
+                        data = self._raw[self._dirtyoffset:]
+                    f.seek(self._dirtyoffset)
+                    f.truncate()
+                    repo.ui.log('tagscache',
+                                'writing %d bytes to %s\n' % (
+                                len(data), _fnodescachefile))
+                    f.write(data)
+                    self._dirtyoffset = None
+                finally:
+                    f.close()
+            except (IOError, OSError), inst:
+                repo.ui.log('tagscache',
+                            "couldn't write %s: %s\n" % (
+                            _fnodescachefile, inst))
+        finally:
+            lock.release()
--- a/tests/test-tags.t	Wed Apr 15 12:18:05 2015 -0400
+++ b/tests/test-tags.t	Wed Apr 15 17:42:38 2015 -0400
@@ -12,6 +12,10 @@
   >   [ -f .hg/cache/tags ] && echo "tag cache exists" || echo "no tag cache"
   > }
 
+  $ fnodescacheexists() {
+  >   [ -f .hg/cache/hgtagsfnodes1 ] && echo "fnodes cache exists" || echo "no fnodes cache"
+  > }
+
   $ dumptags() {
   >     rev=$1
   >     echo "rev $rev: .hgtags:"
@@ -28,10 +32,14 @@
   $ cd t
   $ cacheexists
   no tag cache
+  $ fnodescacheexists
+  no fnodes cache
   $ hg id
   000000000000 tip
   $ cacheexists
   no tag cache
+  $ fnodescacheexists
+  no fnodes cache
   $ echo a > a
   $ hg add a
   $ hg commit -m "test"
@@ -41,6 +49,10 @@
   acb14030fe0a tip
   $ cacheexists
   tag cache exists
+No fnodes cache because .hgtags file doesn't exist
+(this is an implementation detail)
+  $ fnodescacheexists
+  no fnodes cache
 
 Try corrupting the cache
 
@@ -49,6 +61,8 @@
   acb14030fe0a tip
   $ cacheexists
   tag cache exists
+  $ fnodescacheexists
+  no fnodes cache
   $ hg identify
   acb14030fe0a tip
 
@@ -74,33 +88,75 @@
   $ hg identify
   b9154636be93 tip
 
+We should have a fnodes cache now that we have a real tag
+The cache should have an empty entry for rev 0 and a valid entry for rev 1.
+
+
+  $ fnodescacheexists
+  fnodes cache exists
+  $ f --size --hexdump .hg/cache/hgtagsfnodes1
+  .hg/cache/hgtagsfnodes1: size=48
+  0000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
+  0010: ff ff ff ff ff ff ff ff b9 15 46 36 26 b7 b4 a7 |..........F6&...|
+  0020: 73 e0 9e e3 c5 2f 51 0e 19 e0 5e 1f f9 66 d8 59 |s..../Q...^..f.Y|
+
 Repeat with cold tag cache:
 
-  $ rm -f .hg/cache/tags
+  $ rm -f .hg/cache/tags .hg/cache/hgtagsfnodes1
   $ hg identify
   b9154636be93 tip
 
+  $ fnodescacheexists
+  fnodes cache exists
+  $ f --size --hexdump .hg/cache/hgtagsfnodes1
+  .hg/cache/hgtagsfnodes1: size=48
+  0000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
+  0010: ff ff ff ff ff ff ff ff b9 15 46 36 26 b7 b4 a7 |..........F6&...|
+  0020: 73 e0 9e e3 c5 2f 51 0e 19 e0 5e 1f f9 66 d8 59 |s..../Q...^..f.Y|
+
 And again, but now unable to write tag cache:
 
 #if unix-permissions
-  $ rm -f .hg/cache/tags
-  $ chmod 555 .hg
+  $ rm -f .hg/cache/tags .hg/cache/hgtagsfnodes1
+  $ chmod 555 .hg/cache
   $ hg identify
   b9154636be93 tip
-  $ chmod 755 .hg
+  $ chmod 755 .hg/cache
 #endif
 
 Tag cache debug info written to blackbox log
 
-  $ rm -f .hg/cache/tags
+  $ rm -f .hg/cache/tags .hg/cache/hgtagsfnodes1
   $ hg identify
   b9154636be93 tip
-  $ hg blackbox -l 4
+  $ hg blackbox -l 5
   1970/01/01 00:00:00 bob> identify
-  1970/01/01 00:00:00 bob> resolved 1 tags cache entries from 1 manifests in ?.???? seconds (glob)
+  1970/01/01 00:00:00 bob> writing 48 bytes to cache/hgtagsfnodes1
+  1970/01/01 00:00:00 bob> 0/1 cache hits/lookups in * seconds (glob)
   1970/01/01 00:00:00 bob> writing tags cache file with 1 heads and 1 tags
   1970/01/01 00:00:00 bob> identify exited 0 after ?.?? seconds (glob)
 
+Failure to acquire lock results in no write
+
+  $ rm -f .hg/cache/tags .hg/cache/hgtagsfnodes1
+  $ echo 'foo:1' > .hg/wlock
+  $ hg identify
+  b9154636be93 tip
+  $ hg blackbox -l 5
+  1970/01/01 00:00:00 bob> identify
+  1970/01/01 00:00:00 bob> not writing .hg/cache/hgtagsfnodes1 because lock held
+  1970/01/01 00:00:00 bob> 0/1 cache hits/lookups in * seconds (glob)
+  1970/01/01 00:00:00 bob> writing tags cache file with 1 heads and 1 tags
+  1970/01/01 00:00:00 bob> identify exited 0 after * seconds (glob)
+
+  $ fnodescacheexists
+  no fnodes cache
+
+  $ rm .hg/wlock
+
+  $ rm -f .hg/cache/tags .hg/cache/hgtagsfnodes1
+  $ hg identify
+  b9154636be93 tip
 
 Create a branch:
 
@@ -121,9 +177,29 @@
   $ hg add b
   $ hg commit -m "branch"
   created new head
+
+Creating a new commit shouldn't append the .hgtags fnodes cache until
+tags info is accessed
+
+  $ f --size --hexdump .hg/cache/hgtagsfnodes1
+  .hg/cache/hgtagsfnodes1: size=48
+  0000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
+  0010: ff ff ff ff ff ff ff ff b9 15 46 36 26 b7 b4 a7 |..........F6&...|
+  0020: 73 e0 9e e3 c5 2f 51 0e 19 e0 5e 1f f9 66 d8 59 |s..../Q...^..f.Y|
+
   $ hg id
   c8edf04160c7 tip
 
+First 4 bytes of record 3 are changeset fragment
+
+  $ f --size --hexdump .hg/cache/hgtagsfnodes1
+  .hg/cache/hgtagsfnodes1: size=72
+  0000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
+  0010: ff ff ff ff ff ff ff ff b9 15 46 36 26 b7 b4 a7 |..........F6&...|
+  0020: 73 e0 9e e3 c5 2f 51 0e 19 e0 5e 1f f9 66 d8 59 |s..../Q...^..f.Y|
+  0030: c8 ed f0 41 00 00 00 00 00 00 00 00 00 00 00 00 |...A............|
+  0040: 00 00 00 00 00 00 00 00                         |........|
+
 Merge the two heads:
 
   $ hg merge 1
@@ -244,6 +320,107 @@
   bbd179dfa0a71671c253b3ae0aa1513b60d199fa bar
   78391a272241d70354aa14c874552cad6b51bb42 bar
 
+  $ f --size --hexdump .hg/cache/hgtagsfnodes1
+  .hg/cache/hgtagsfnodes1: size=120
+  0000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
+  0010: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
+  0020: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
+  0030: 7a 94 12 77 0c 04 f2 a8 af 31 de 17 fa b7 42 28 |z..w.....1....B(|
+  0040: 78 ee 5a 2d ad bc 94 3d 6f a4 50 21 7d 3b 71 8c |x.Z-...=o.P!};q.|
+  0050: 96 4e f3 7b 89 e5 50 eb da fd 57 89 e7 6c e1 b0 |.N.{..P...W..l..|
+  0060: 0c 19 2d 7d 0c 04 f2 a8 af 31 de 17 fa b7 42 28 |..-}.....1....B(|
+  0070: 78 ee 5a 2d ad bc 94 3d                         |x.Z-...=|
+
+Corrupt the .hgtags fnodes cache
+Extra junk data at the end should get overwritten on next cache update
+
+  $ echo extra >> .hg/cache/hgtagsfnodes1
+  $ echo dummy1 > foo
+  $ hg commit -m throwaway1
+
+  $ hg tags
+  tip                                5:8dbfe60eff30
+  bar                                1:78391a272241
+
+  $ hg blackbox -l 5
+  1970/01/01 00:00:00 bob> tags
+  1970/01/01 00:00:00 bob> writing 24 bytes to cache/hgtagsfnodes1
+  1970/01/01 00:00:00 bob> 0/1 cache hits/lookups in * seconds (glob)
+  1970/01/01 00:00:00 bob> writing tags cache file with 3 heads and 1 tags
+  1970/01/01 00:00:00 bob> tags exited 0 after * seconds (glob)
+
+#if unix-permissions no-root
+Errors writing to .hgtags fnodes cache are silently ignored
+
+  $ echo dummy2 > foo
+  $ hg commit -m throwaway2
+
+  $ chmod a-w .hg/cache/hgtagsfnodes1
+  $ rm -f .hg/cache/tags
+
+  $ hg tags
+  tip                                6:b968051b5cf3
+  bar                                1:78391a272241
+
+  $ hg blackbox -l 5
+  1970/01/01 00:00:00 bob> tags
+  1970/01/01 00:00:00 bob> couldn't write cache/hgtagsfnodes1: [Errno 13] Permission denied: '$TESTTMP/t2/.hg/cache/hgtagsfnodes1'
+  1970/01/01 00:00:00 bob> 2/3 cache hits/lookups in * seconds (glob)
+  1970/01/01 00:00:00 bob> writing tags cache file with 3 heads and 1 tags
+  1970/01/01 00:00:00 bob> tags exited 0 after * seconds (glob)
+
+  $ chmod a+w .hg/cache/hgtagsfnodes1
+#endif
+
+  $ rm -f .hg/cache/tags
+  $ hg tags
+  tip                                6:b968051b5cf3
+  bar                                1:78391a272241
+
+  $ hg blackbox -l 5
+  1970/01/01 00:00:00 bob> tags
+  1970/01/01 00:00:00 bob> writing 24 bytes to cache/hgtagsfnodes1
+  1970/01/01 00:00:00 bob> 2/3 cache hits/lookups in * seconds (glob)
+  1970/01/01 00:00:00 bob> writing tags cache file with 3 heads and 1 tags
+  1970/01/01 00:00:00 bob> tags exited 0 after * seconds (glob)
+
+  $ f --size .hg/cache/hgtagsfnodes1
+  .hg/cache/hgtagsfnodes1: size=168
+
+Stripping doesn't truncate the tags cache until new data is available
+
+  $ hg -q --config extensions.strip= strip -r 5 --no-backup
+  $ hg tags
+  tip                                4:0c192d7d5e6b
+  bar                                1:78391a272241
+
+  $ hg blackbox -l 4
+  1970/01/01 00:00:00 bob> tags
+  1970/01/01 00:00:00 bob> 1/1 cache hits/lookups in * seconds (glob)
+  1970/01/01 00:00:00 bob> writing tags cache file with 3 heads and 1 tags
+  1970/01/01 00:00:00 bob> tags exited 0 after * seconds (glob)
+
+  $ f --size .hg/cache/hgtagsfnodes1
+  .hg/cache/hgtagsfnodes1: size=168
+
+  $ echo dummy > foo
+  $ hg commit -m throwaway3
+
+  $ hg tags
+  tip                                5:035f65efb448
+  bar                                1:78391a272241
+
+  $ hg blackbox -l 5
+  1970/01/01 00:00:00 bob> tags
+  1970/01/01 00:00:00 bob> writing 24 bytes to cache/hgtagsfnodes1
+  1970/01/01 00:00:00 bob> 0/1 cache hits/lookups in * seconds (glob)
+  1970/01/01 00:00:00 bob> writing tags cache file with 3 heads and 1 tags
+  1970/01/01 00:00:00 bob> tags exited 0 after * seconds (glob)
+  $ f --size .hg/cache/hgtagsfnodes1
+  .hg/cache/hgtagsfnodes1: size=144
+
+  $ hg -q --config extensions.strip= strip -r 5 --no-backup
+
 Test tag removal:
 
   $ hg tag --remove bar     # rev 5