Implement revlogng.
authormason@suse.com
Tue, 04 Apr 2006 16:38:43 -0400
changeset 2072 74d3f5336b66
parent 2042 a514c7509fa9
child 2073 1e6745f78989
Implement revlogng. revlogng results in smaller indexes, can address larger data files, and supports flags and version numbers. By default the original revlog format is used. To use the new format, use the following .hgrc field: [revlog] # format choices are 0 (classic revlog format) and 1 revlogng format=1
mercurial/changelog.py
mercurial/commands.py
mercurial/filelog.py
mercurial/localrepo.py
mercurial/manifest.py
mercurial/revlog.py
mercurial/statichttprepo.py
mercurial/ui.py
--- a/mercurial/changelog.py	Mon Apr 03 10:02:09 2006 -0700
+++ b/mercurial/changelog.py	Tue Apr 04 16:38:43 2006 -0400
@@ -11,8 +11,9 @@
 demandload(globals(), "os time util")
 
 class changelog(revlog):
-    def __init__(self, opener):
-        revlog.__init__(self, opener, "00changelog.i", "00changelog.d")
+    def __init__(self, opener, defversion=0):
+        revlog.__init__(self, opener, "00changelog.i", "00changelog.d",
+                        defversion)
 
     def extract(self, text):
         if not text:
--- a/mercurial/commands.py	Mon Apr 03 10:02:09 2006 -0700
+++ b/mercurial/commands.py	Tue Apr 04 16:38:43 2006 -0400
@@ -1268,7 +1268,7 @@
 
 def debugancestor(ui, index, rev1, rev2):
     """find the ancestor revision of two revisions in a given index"""
-    r = revlog.revlog(util.opener(os.getcwd(), audit=False), index, "")
+    r = revlog.revlog(util.opener(os.getcwd(), audit=False), index, "", 0)
     a = r.ancestor(r.lookup(rev1), r.lookup(rev2))
     ui.write("%d:%s\n" % (r.rev(a), hex(a)))
 
@@ -1372,7 +1372,7 @@
 def debugdata(ui, file_, rev):
     """dump the contents of an data file revision"""
     r = revlog.revlog(util.opener(os.getcwd(), audit=False),
-                      file_[:-2] + ".i", file_)
+                      file_[:-2] + ".i", file_, 0)
     try:
         ui.write(r.revision(r.lookup(rev)))
     except KeyError:
@@ -1380,18 +1380,19 @@
 
 def debugindex(ui, file_):
     """dump the contents of an index file"""
-    r = revlog.revlog(util.opener(os.getcwd(), audit=False), file_, "")
+    r = revlog.revlog(util.opener(os.getcwd(), audit=False), file_, "", 0)
     ui.write("   rev    offset  length   base linkrev" +
              " nodeid       p1           p2\n")
     for i in range(r.count()):
-        e = r.index[i]
+        node = r.node(i)
+        pp = r.parents(node)
         ui.write("% 6d % 9d % 7d % 6d % 7d %s %s %s\n" % (
-                i, e[0], e[1], e[2], e[3],
-            short(e[6]), short(e[4]), short(e[5])))
+                i, r.start(i), r.length(i), r.base(i), r.linkrev(node),
+            short(node), short(pp[0]), short(pp[1])))
 
 def debugindexdot(ui, file_):
     """dump an index DAG as a .dot file"""
-    r = revlog.revlog(util.opener(os.getcwd(), audit=False), file_, "")
+    r = revlog.revlog(util.opener(os.getcwd(), audit=False), file_, "", 0)
     ui.write("digraph G {\n")
     for i in range(r.count()):
         e = r.index[i]
--- a/mercurial/filelog.py	Mon Apr 03 10:02:09 2006 -0700
+++ b/mercurial/filelog.py	Tue Apr 04 16:38:43 2006 -0400
@@ -11,10 +11,11 @@
 demandload(globals(), "bdiff")
 
 class filelog(revlog):
-    def __init__(self, opener, path):
+    def __init__(self, opener, path, defversion=0):
         revlog.__init__(self, opener,
                         os.path.join("data", self.encodedir(path + ".i")),
-                        os.path.join("data", self.encodedir(path + ".d")))
+                        os.path.join("data", self.encodedir(path + ".d")),
+                        defversion)
 
     # This avoids a collision between a file named foo and a dir named
     # foo.i or foo.d
--- a/mercurial/localrepo.py	Mon Apr 03 10:02:09 2006 -0700
+++ b/mercurial/localrepo.py	Tue Apr 04 16:38:43 2006 -0400
@@ -10,8 +10,8 @@
 from node import *
 from i18n import gettext as _
 from demandload import *
-demandload(globals(), "re lock transaction tempfile stat mdiff errno ui")
 demandload(globals(), "appendfile changegroup")
+demandload(globals(), "re lock transaction tempfile stat mdiff errno ui revlog")
 
 class localrepository(object):
     def __del__(self):
@@ -35,8 +35,20 @@
         self.ui = ui.ui(parentui=parentui)
         self.opener = util.opener(self.path)
         self.wopener = util.opener(self.root)
-        self.manifest = manifest.manifest(self.opener)
-        self.changelog = changelog.changelog(self.opener)
+
+        try:
+            self.ui.readconfig(self.join("hgrc"), self.root)
+        except IOError:
+            pass
+
+        v = self.ui.revlogopts
+        self.revlogversion = int(v.get('format', 0))
+        for x in v.get('flags', "").split():
+            self.revlogversion |= revlog.flagstr(x)
+
+        self.manifest = manifest.manifest(self.opener, self.revlogversion)
+        self.changelog = changelog.changelog(self.opener, self.revlogversion)
+        self.revlogversion = self.changelog.version
         self.tagscache = None
         self.nodetagscache = None
         self.encodepats = None
@@ -48,11 +60,6 @@
             os.mkdir(self.join("data"))
 
         self.dirstate = dirstate.dirstate(self.opener, self.ui, self.root)
-        try:
-            self.ui.readconfig(self.join("hgrc"), self.root)
-        except IOError:
-            pass
-
     def hook(self, name, throw=False, **args):
         def runhook(name, cmd):
             self.ui.note(_("running hook %s: %s\n") % (name, cmd))
@@ -167,7 +174,7 @@
     def file(self, f):
         if f[0] == '/':
             f = f[1:]
-        return filelog.filelog(self.opener, f)
+        return filelog.filelog(self.opener, f, self.revlogversion)
 
     def getcwd(self):
         return self.dirstate.getcwd()
--- a/mercurial/manifest.py	Mon Apr 03 10:02:09 2006 -0700
+++ b/mercurial/manifest.py	Tue Apr 04 16:38:43 2006 -0400
@@ -12,10 +12,11 @@
 demandload(globals(), "bisect array")
 
 class manifest(revlog):
-    def __init__(self, opener):
+    def __init__(self, opener, defversion=0):
         self.mapcache = None
         self.listcache = None
-        revlog.__init__(self, opener, "00manifest.i", "00manifest.d")
+        revlog.__init__(self, opener, "00manifest.i", "00manifest.d",
+                        defversion)
 
     def read(self, node):
         if node == nullid: return {} # don't upset local cache
--- a/mercurial/revlog.py	Mon Apr 03 10:02:09 2006 -0700
+++ b/mercurial/revlog.py	Tue Apr 04 16:38:43 2006 -0400
@@ -16,6 +16,10 @@
 demandload(globals(), "binascii changegroup errno heapq mdiff os")
 demandload(globals(), "sha struct zlib")
 
+# revlog version strings
+REVLOGV0 = 0
+REVLOGNG = 1
+
 def hash(text, p1, p2):
     """generate a hash from the given text and its parent hashes
 
@@ -51,7 +55,19 @@
     if t == 'u': return bin[1:]
     raise RevlogError(_("unknown compression type %r") % t)
 
-indexformat = ">4l20s20s20s"
+indexformatv0 = ">4l20s20s20s"
+# index ng:
+# 6 bytes offset
+# 2 bytes flags
+# 4 bytes compressed length
+# 4 bytes uncompressed length
+# 4 bytes: base rev
+# 4 bytes link rev
+# 4 bytes parent 1 rev
+# 4 bytes parent 2 rev
+# 32 bytes: nodeid
+indexformatng = ">Qiiiiii20s12x"
+versionformat = ">i"
 
 class lazyparser(object):
     """
@@ -63,18 +79,16 @@
     the lazy objects in revlog with the underlying objects for
     efficiency in cases where we look at most of the nodes.
     """
-    def __init__(self, data, revlog):
+    def __init__(self, data, revlog, indexformat):
         self.data = data
         self.s = struct.calcsize(indexformat)
+        self.indexformat = indexformat
         self.l = len(data)/self.s
         self.index = [None] * self.l
         self.map = {nullid: -1}
         self.all = 0
         self.revlog = revlog
 
-    def trunc(self, pos):
-        self.l = pos/self.s
-
     def load(self, pos=None):
         if self.all: return
         if pos is not None:
@@ -89,10 +103,11 @@
             self.revlog.nodemap = self.map
 
         while i < end:
-            d = self.data[i * self.s: (i + 1) * self.s]
-            e = struct.unpack(indexformat, d)
-            self.index[i] = e
-            self.map[e[6]] = i
+            if not self.index[i]:
+                d = self.data[i * self.s: (i + 1) * self.s]
+                e = struct.unpack(self.indexformat, d)
+                self.index[i] = e
+                self.map[e[-1]] = i
             i += 1
 
 class lazyindex(object):
@@ -108,12 +123,12 @@
         return self.p.index[pos]
     def __getitem__(self, pos):
         return self.p.index[pos] or self.load(pos)
+    def __setitem__(self, pos, item):
+        self.p.index[pos] = item
     def __delitem__(self, pos):
         del self.p.index[pos]
     def append(self, e):
         self.p.index.append(e)
-    def trunc(self, pos):
-        self.p.trunc(pos)
 
 class lazymap(object):
     """a lazy version of the node map"""
@@ -133,10 +148,10 @@
         yield nullid
         for i in xrange(self.p.l):
             try:
-                yield self.p.index[i][6]
+                yield self.p.index[i][-1]
             except:
                 self.p.load(i)
-                yield self.p.index[i][6]
+                yield self.p.index[i][-1]
     def __getitem__(self, key):
         try:
             return self.p.map[key]
@@ -178,7 +193,7 @@
     remove data, and can use some simple techniques to avoid the need
     for locking while reading.
     """
-    def __init__(self, opener, indexfile, datafile):
+    def __init__(self, opener, indexfile, datafile, defversion=0):
         """
         create a revlog object
 
@@ -192,11 +207,14 @@
         self.indexstat = None
         self.cache = None
         self.chunkcache = None
+        self.defversion = defversion
         self.load()
 
     def load(self):
+        v = self.defversion
         try:
             f = self.opener(self.indexfile)
+            i = f.read()
         except IOError, inst:
             if inst.errno != errno.ENOENT:
                 raise
@@ -213,56 +231,103 @@
                     and st.st_mtime == oldst.st_mtime
                     and st.st_ctime == oldst.st_ctime):
                     return
-            self.indexstat = st
-            i = f.read()
+                self.indexstat = st
+                if len(i) > 0:
+                    v = struct.unpack(versionformat, i[:4])[0]
+                    if v != 0:
+                        flags = v & ~0xFFFF
+                        fmt = v & 0xFFFF
+                        if fmt != REVLOGNG or (flags & ~(REVLOGNGINLINEDATA)):
+                            raise RevlogError(
+                             _("unknown version format %d or flags %x on %s") %
+                             (v, flags, self.indexfile))
+        self.version = v
+        if v == 0:
+            self.indexformat = indexformatv0
+        else:
+            self.indexformat = indexformatng
 
-        if i and i[:4] != "\0\0\0\0":
-            raise RevlogError(_("incompatible revlog signature on %s") %
-                              self.indexfile)
-
-        if len(i) > 10000:
-            # big index, let's parse it on demand
-            parser = lazyparser(i, self)
-            self.index = lazyindex(parser)
-            self.nodemap = lazymap(parser)
+        if i:
+            if st and st.st_size > 10000:
+                # big index, let's parse it on demand
+                parser = lazyparser(i, self, self.indexformat)
+                self.index = lazyindex(parser)
+                self.nodemap = lazymap(parser)
+            else:
+                self.parseindex(i)
+            if self.version != 0:
+                e = list(self.index[0])
+                type = self.ngtype(e[0])
+                e[0] = self.offset_type(0, type)
+                self.index[0] = e
         else:
-            s = struct.calcsize(indexformat)
-            l = len(i) / s
-            self.index = [None] * l
-            m = [None] * l
+            self.nodemap = { nullid: -1}
+            self.index = []
+
+
+    def parseindex(self, data):
+        s = struct.calcsize(self.indexformat)
+        l = len(data)
+        self.index = []
+        self.nodemap =  {nullid: -1}
+        off = 0
+        n = 0
+        while off < l:
+            e = struct.unpack(self.indexformat, data[off:off + s])
+            self.index.append(e)
+            self.nodemap[e[-1]] = n
+            n += 1
+            off += s
 
-            n = 0
-            for f in xrange(0, l * s, s):
-                # offset, size, base, linkrev, p1, p2, nodeid
-                e = struct.unpack(indexformat, i[f:f + s])
-                m[n] = (e[6], n)
-                self.index[n] = e
-                n += 1
+    def ngoffset(self, q):
+        if q & 0xFFFF:
+            raise RevlogError(_('%s: incompatible revision flag %x') %
+                               (self.indexfile, type))
+        return long(q >> 16)
+
+    def ngtype(self, q):
+        return int(q & 0xFFFF)
 
-            self.nodemap = dict(m)
-            self.nodemap[nullid] = -1
+    def offset_type(self, offset, type):
+        return long(long(offset) << 16 | type)
+
+    def loadindexmap(self):
+        """loads both the map and the index from the lazy parser"""
+        if isinstance(self.index, lazyindex):
+            p = self.index.p
+            p.load()
 
     def tip(self): return self.node(len(self.index) - 1)
     def count(self): return len(self.index)
-    def node(self, rev): return (rev < 0) and nullid or self.index[rev][6]
+    def node(self, rev):
+        return (rev < 0) and nullid or self.index[rev][-1]
     def rev(self, node):
         try:
             return self.nodemap[node]
         except KeyError:
             raise RevlogError(_('%s: no node %s') % (self.indexfile, hex(node)))
-    def linkrev(self, node): return self.index[self.rev(node)][3]
+    def linkrev(self, node): return self.index[self.rev(node)][-4]
     def parents(self, node):
         if node == nullid: return (nullid, nullid)
-        return self.index[self.rev(node)][4:6]
+        r = self.rev(node)
+        d = self.index[r][-3:-1]
+        if self.version == 0:
+            return d
+        return [ self.node(x) for x in d ]
+    def start(self, rev):
+        if rev < 0:
+            return -1
+        if self.version != 0:
+            return self.ngoffset(self.index[rev][0])
+        return self.index[rev][0]
+    def end(self, rev): return self.start(rev) + self.length(rev)
 
-    def start(self, rev): return (rev < 0) and -1 or self.index[rev][0]
     def length(self, rev):
         if rev < 0:
             return 0
         else:
             return self.index[rev][1]
-    def end(self, rev): return self.start(rev) + self.length(rev)
-    def base(self, rev): return (rev < 0) and rev or self.index[rev][2]
+    def base(self, rev): return (rev < 0) and rev or self.index[rev][-5]
 
     def reachable(self, rev, stop=None):
         reachable = {}
@@ -501,18 +566,18 @@
         """apply a list of patches to a string"""
         return mdiff.patches(t, pl)
 
-    def chunk(self, rev):
+    def chunk(self, rev, df=None, cachelen=4096):
         start, length = self.start(rev), self.length(rev)
         end = start + length
-
-        def loadcache():
-            cache_length = max(4096 * 1024, length) # 4Mo
-            df = self.opener(self.datafile)
+        def loadcache(df):
+            cache_length = max(cachelen, length) # 4k
+            if not df:
+                df = self.opener(self.datafile)
             df.seek(start)
             self.chunkcache = (start, df.read(cache_length))
 
         if not self.chunkcache:
-            loadcache()
+            loadcache(df)
 
         cache_start = self.chunkcache[0]
         cache_end = cache_start + len(self.chunkcache[1])
@@ -520,7 +585,7 @@
             # it is cached
             offset = start - cache_start
         else:
-            loadcache()
+            loadcache(df)
             offset = 0
 
         #def checkchunk():
@@ -555,16 +620,18 @@
         rev = self.rev(node)
         base = self.base(rev)
 
+        df = self.opener(self.datafile)
+
         # do we have useful data cached?
         if self.cache and self.cache[1] >= base and self.cache[1] < rev:
             base = self.cache[1]
             text = self.cache[2]
         else:
-            text = self.chunk(base)
+            text = self.chunk(base, df=df)
 
         bins = []
         for r in xrange(base + 1, rev + 1):
-            bins.append(self.chunk(r))
+            bins.append(self.chunk(r, df=df))
 
         text = self.patches(text, bins)
 
@@ -621,19 +688,30 @@
         if t >= 0:
             offset = self.end(t)
 
-        e = (offset, l, base, link, p1, p2, node)
+        if self.version == 0:
+            e = (offset, l, base, link, p1, p2, node)
+        else:
+            e = (self.offset_type(offset, 0), l, len(text),
+                 base, link, self.rev(p1), self.rev(p2), node)
 
         self.index.append(e)
         self.nodemap[node] = n
-        entry = struct.pack(indexformat, *e)
+        entry = struct.pack(self.indexformat, *e)
 
-        transaction.add(self.datafile, e[0])
+        transaction.add(self.datafile, offset)
+        transaction.add(self.indexfile, n * len(entry))
         f = self.opener(self.datafile, "a")
         if data[0]:
             f.write(data[0])
         f.write(data[1])
-        transaction.add(self.indexfile, n * len(entry))
-        self.opener(self.indexfile, "a").write(entry)
+        f = self.opener(self.indexfile, "a")
+
+        if len(self.index) == 1 and self.version != 0:
+            l = struct.pack(versionformat, self.version)
+            f.write(l)
+            entry = entry[4:]
+
+        f.write(entry)
 
         self.cache = (node, n, text)
         return node
@@ -748,16 +826,12 @@
         base = prev = -1
         start = end = measure = 0
         if r:
-            base = self.base(t)
-            start = self.start(base)
             end = self.end(t)
-            measure = self.length(base)
-            prev = self.tip()
 
+        ifh = self.opener(self.indexfile, "a+")
+        transaction.add(self.indexfile, ifh.tell())
         transaction.add(self.datafile, end)
-        transaction.add(self.indexfile, r * struct.calcsize(indexformat))
         dfh = self.opener(self.datafile, "a")
-        ifh = self.opener(self.indexfile, "a")
 
         # loop through our set of deltas
         chain = None
@@ -794,7 +868,8 @@
 
             if chain != prev or (end - start + len(cdelta)) > measure * 2:
                 # flush our writes here so we can read it in revision
-                dfh.flush()
+                if dfh:
+                    dfh.flush()
                 ifh.flush()
                 text = self.revision(chain)
                 text = self.patches(text, [delta])
@@ -803,19 +878,21 @@
                     raise RevlogError(_("consistency error adding group"))
                 measure = len(text)
             else:
-                e = (end, len(cdelta), base, link, p1, p2, node)
+                if self.version == 0:
+                    e = (end, len(cdelta), base, link, p1, p2, node)
+                else:
+                    e = (self.offset_type(end, 0), len(cdelta), -1, base,
+                         link, self.rev(p1), self.rev(p2), node)
                 self.index.append(e)
                 self.nodemap[node] = r
                 dfh.write(cdelta)
-                ifh.write(struct.pack(indexformat, *e))
+                ifh.write(struct.pack(self.indexformat, *e))
 
             t, r, chain, prev = r, r + 1, node, node
             base = self.base(t)
             start = self.start(base)
             end = self.end(t)
 
-        dfh.close()
-        ifh.close()
         if node is None:
             raise RevlogError(_("group to be added is empty"))
         return node
@@ -824,32 +901,34 @@
         if self.count() == 0 or rev >= self.count():
             return
 
+        if isinstance(self.index, lazyindex):
+            self.loadindexmap()
+
         # When stripping away a revision, we need to make sure it
         # does not actually belong to an older changeset.
         # The minlink parameter defines the oldest revision
         # we're allowed to strip away.
-        while minlink > self.index[rev][3]:
+        while minlink > self.index[rev][-4]:
             rev += 1
             if rev >= self.count():
                 return
 
         # first truncate the files on disk
         end = self.start(rev)
-        self.opener(self.datafile, "a").truncate(end)
-        end = rev * struct.calcsize(indexformat)
-        self.opener(self.indexfile, "a").truncate(end)
+        df = self.opener(self.datafile, "a")
+        df.truncate(end)
+        end = rev * struct.calcsize(self.indexformat)
+
+        indexf = self.opener(self.indexfile, "a")
+        indexf.truncate(end)
 
         # then reset internal state in memory to forget those revisions
         self.cache = None
         self.chunkcache = None
-        for p in self.index[rev:]:
-            del self.nodemap[p[6]]
-        del self.index[rev:]
+        for x in xrange(rev, self.count()):
+            del self.nodemap[self.node(x)]
 
-        # truncating the lazyindex also truncates the lazymap.
-        if isinstance(self.index, lazyindex):
-            self.index.trunc(end)
-
+        del self.index[rev:]
 
     def checksize(self):
         expected = 0
@@ -870,7 +949,7 @@
             f = self.opener(self.indexfile)
             f.seek(0, 2)
             actual = f.tell()
-            s = struct.calcsize(indexformat)
+            s = struct.calcsize(self.indexformat)
             i = actual / s
             di = actual - (i * s)
         except IOError, inst:
--- a/mercurial/statichttprepo.py	Mon Apr 03 10:02:09 2006 -0700
+++ b/mercurial/statichttprepo.py	Tue Apr 04 16:38:43 2006 -0400
@@ -32,6 +32,7 @@
     def __init__(self, ui, path):
         self.path = (path + "/.hg")
         self.ui = ui
+        self.revlogversion = 0
         self.opener = opener(self.path)
         self.manifest = manifest.manifest(self.opener)
         self.changelog = changelog.changelog(self.opener)
--- a/mercurial/ui.py	Mon Apr 03 10:02:09 2006 -0700
+++ b/mercurial/ui.py	Tue Apr 04 16:38:43 2006 -0400
@@ -29,6 +29,7 @@
             self.diffcache = None
             self.header = []
             self.prev_header = []
+            self.revlogopts = self.configrevlog()
         else:
             # parentui may point to an ui object which is already a child
             self.parentui = parentui.parentui or parentui
@@ -134,6 +135,12 @@
                 result.append(path)
         return result
 
+    def configrevlog(self):
+        ret = {}
+        for x in self.configitems("revlog"):
+            k = x[0].lower()
+            ret[k] = x[1]
+        return ret
     def diffopts(self):
         if self.diffcache:
             return self.diffcache