diff tests/test-remotefilelog-datapack.py @ 40495:3a333a582d7b

remotefilelog: import pruned-down remotefilelog extension from hg-experimental This is remotefilelog as of my recent patches for compatibility with current tip of hg, minus support for old versions of Mercurial and some FB-specific features like their treemanifest extension and fetching linkrev data from a patched phabricator. The file extutil.py moved from hgext3rd to remotefilelog. This is not yet ready to be landed, consider it a preview for now. Planned changes include: * replace lz4 with zstd * rename some capabilities, requirements and wireproto commands to mark them as experimental * consolidate bits of shallowutil with related functions (eg readfile) I'm certainly open to other (small) changes, but my rough mission is to land this largely as-is so we can use it as a model of the functionality we need going forward for lazy-fetching of file contents from a server. # no-check-commit because of a few foo_bar functions Differential Revision: https://phab.mercurial-scm.org/D4782
author Augie Fackler <augie@google.com>
date Thu, 27 Sep 2018 13:03:19 -0400
parents
children 10c10da14c5d
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-remotefilelog-datapack.py	Thu Sep 27 13:03:19 2018 -0400
@@ -0,0 +1,388 @@
+#!/usr/bin/env python
+from __future__ import absolute_import, print_function
+
+import hashlib
+import os
+import random
+import shutil
+import stat
+import struct
+import sys
+import tempfile
+import time
+import unittest
+
+import silenttestrunner
+
+# Load the local remotefilelog, not the system one
+sys.path[0:0] = [os.path.join(os.path.dirname(__file__), '..')]
+from mercurial.node import nullid
+from mercurial import (
+    ui as uimod,
+)
+from hgext.remotefilelog import (
+    basepack,
+    constants,
+    datapack,
+)
+
+class datapacktestsbase(object):
+    def __init__(self, datapackreader, paramsavailable):
+        self.datapackreader = datapackreader
+        self.paramsavailable = paramsavailable
+
+    def setUp(self):
+        self.tempdirs = []
+
+    def tearDown(self):
+        for d in self.tempdirs:
+            shutil.rmtree(d)
+
+    def makeTempDir(self):
+        tempdir = tempfile.mkdtemp()
+        self.tempdirs.append(tempdir)
+        return tempdir
+
+    def getHash(self, content):
+        return hashlib.sha1(content).digest()
+
+    def getFakeHash(self):
+        return ''.join(chr(random.randint(0, 255)) for _ in range(20))
+
+    def createPack(self, revisions=None, packdir=None, version=0):
+        if revisions is None:
+            revisions = [("filename", self.getFakeHash(), nullid, "content")]
+
+        if packdir is None:
+            packdir = self.makeTempDir()
+
+        packer = datapack.mutabledatapack(
+            uimod.ui(), packdir, version=version)
+
+        for args in revisions:
+            filename, node, base, content = args[0:4]
+            # meta is optional
+            meta = None
+            if len(args) > 4:
+                meta = args[4]
+            packer.add(filename, node, base, content, metadata=meta)
+
+        path = packer.close()
+        return self.datapackreader(path)
+
+    def _testAddSingle(self, content):
+        """Test putting a simple blob into a pack and reading it out.
+        """
+        filename = "foo"
+        node = self.getHash(content)
+
+        revisions = [(filename, node, nullid, content)]
+        pack = self.createPack(revisions)
+        if self.paramsavailable:
+            self.assertEquals(pack.params.fanoutprefix,
+                              basepack.SMALLFANOUTPREFIX)
+
+        chain = pack.getdeltachain(filename, node)
+        self.assertEquals(content, chain[0][4])
+
+    def testAddSingle(self):
+        self._testAddSingle('')
+
+    def testAddSingleEmpty(self):
+        self._testAddSingle('abcdef')
+
+    def testAddMultiple(self):
+        """Test putting multiple unrelated blobs into a pack and reading them
+        out.
+        """
+        revisions = []
+        for i in range(10):
+            filename = "foo%s" % i
+            content = "abcdef%s" % i
+            node = self.getHash(content)
+            revisions.append((filename, node, self.getFakeHash(), content))
+
+        pack = self.createPack(revisions)
+
+        for filename, node, base, content in revisions:
+            entry = pack.getdelta(filename, node)
+            self.assertEquals((content, filename, base, {}), entry)
+
+            chain = pack.getdeltachain(filename, node)
+            self.assertEquals(content, chain[0][4])
+
+    def testAddDeltas(self):
+        """Test putting multiple delta blobs into a pack and read the chain.
+        """
+        revisions = []
+        filename = "foo"
+        lastnode = nullid
+        for i in range(10):
+            content = "abcdef%s" % i
+            node = self.getHash(content)
+            revisions.append((filename, node, lastnode, content))
+            lastnode = node
+
+        pack = self.createPack(revisions)
+
+        entry = pack.getdelta(filename, revisions[0][1])
+        realvalue = (revisions[0][3], filename, revisions[0][2], {})
+        self.assertEquals(entry, realvalue)
+
+        # Test that the chain for the final entry has all the others
+        chain = pack.getdeltachain(filename, node)
+        for i in range(10):
+            content = "abcdef%s" % i
+            self.assertEquals(content, chain[-i - 1][4])
+
+    def testPackMany(self):
+        """Pack many related and unrelated objects.
+        """
+        # Build a random pack file
+        revisions = []
+        blobs = {}
+        random.seed(0)
+        for i in range(100):
+            filename = "filename-%s" % i
+            filerevs = []
+            for j in range(random.randint(1, 100)):
+                content = "content-%s" % j
+                node = self.getHash(content)
+                lastnode = nullid
+                if len(filerevs) > 0:
+                    lastnode = filerevs[random.randint(0, len(filerevs) - 1)]
+                filerevs.append(node)
+                blobs[(filename, node, lastnode)] = content
+                revisions.append((filename, node, lastnode, content))
+
+        pack = self.createPack(revisions)
+
+        # Verify the pack contents
+        for (filename, node, lastnode), content in sorted(blobs.iteritems()):
+            chain = pack.getdeltachain(filename, node)
+            for entry in chain:
+                expectedcontent = blobs[(entry[0], entry[1], entry[3])]
+                self.assertEquals(entry[4], expectedcontent)
+
+    def testPackMetadata(self):
+        revisions = []
+        for i in range(100):
+            filename = '%s.txt' % i
+            content = 'put-something-here \n' * i
+            node = self.getHash(content)
+            meta = {constants.METAKEYFLAG: i ** 4,
+                    constants.METAKEYSIZE: len(content),
+                    'Z': 'random_string',
+                    '_': '\0' * i}
+            revisions.append((filename, node, nullid, content, meta))
+        pack = self.createPack(revisions, version=1)
+        for name, node, x, content, origmeta in revisions:
+            parsedmeta = pack.getmeta(name, node)
+            # flag == 0 should be optimized out
+            if origmeta[constants.METAKEYFLAG] == 0:
+                del origmeta[constants.METAKEYFLAG]
+            self.assertEquals(parsedmeta, origmeta)
+
+    def testPackMetadataThrows(self):
+        filename = '1'
+        content = '2'
+        node = self.getHash(content)
+        meta = {constants.METAKEYFLAG: 3}
+        revisions = [(filename, node, nullid, content, meta)]
+        try:
+            self.createPack(revisions, version=0)
+            self.assertTrue(False, "should throw if metadata is not supported")
+        except RuntimeError:
+            pass
+
+    def testGetMissing(self):
+        """Test the getmissing() api.
+        """
+        revisions = []
+        filename = "foo"
+        lastnode = nullid
+        for i in range(10):
+            content = "abcdef%s" % i
+            node = self.getHash(content)
+            revisions.append((filename, node, lastnode, content))
+            lastnode = node
+
+        pack = self.createPack(revisions)
+
+        missing = pack.getmissing([("foo", revisions[0][1])])
+        self.assertFalse(missing)
+
+        missing = pack.getmissing([("foo", revisions[0][1]),
+                                   ("foo", revisions[1][1])])
+        self.assertFalse(missing)
+
+        fakenode = self.getFakeHash()
+        missing = pack.getmissing([("foo", revisions[0][1]), ("foo", fakenode)])
+        self.assertEquals(missing, [("foo", fakenode)])
+
+    def testAddThrows(self):
+        pack = self.createPack()
+
+        try:
+            pack.add('filename', nullid, 'contents')
+            self.assertTrue(False, "datapack.add should throw")
+        except RuntimeError:
+            pass
+
+    def testBadVersionThrows(self):
+        pack = self.createPack()
+        path = pack.path + '.datapack'
+        with open(path) as f:
+            raw = f.read()
+        raw = struct.pack('!B', 255) + raw[1:]
+        os.chmod(path, os.stat(path).st_mode | stat.S_IWRITE)
+        with open(path, 'w+') as f:
+            f.write(raw)
+
+        try:
+            pack = self.datapackreader(pack.path)
+            self.assertTrue(False, "bad version number should have thrown")
+        except RuntimeError:
+            pass
+
+    def testMissingDeltabase(self):
+        fakenode = self.getFakeHash()
+        revisions = [("filename", fakenode, self.getFakeHash(), "content")]
+        pack = self.createPack(revisions)
+        chain = pack.getdeltachain("filename", fakenode)
+        self.assertEquals(len(chain), 1)
+
+    def testLargePack(self):
+        """Test creating and reading from a large pack with over X entries.
+        This causes it to use a 2^16 fanout table instead."""
+        revisions = []
+        blobs = {}
+        total = basepack.SMALLFANOUTCUTOFF + 1
+        for i in xrange(total):
+            filename = "filename-%s" % i
+            content = filename
+            node = self.getHash(content)
+            blobs[(filename, node)] = content
+            revisions.append((filename, node, nullid, content))
+
+        pack = self.createPack(revisions)
+        if self.paramsavailable:
+            self.assertEquals(pack.params.fanoutprefix,
+                              basepack.LARGEFANOUTPREFIX)
+
+        for (filename, node), content in blobs.iteritems():
+            actualcontent = pack.getdeltachain(filename, node)[0][4]
+            self.assertEquals(actualcontent, content)
+
+    def testPacksCache(self):
+        """Test that we remember the most recent packs while fetching the delta
+        chain."""
+
+        packdir = self.makeTempDir()
+        deltachains = []
+
+        numpacks = 10
+        revisionsperpack = 100
+
+        for i in range(numpacks):
+            chain = []
+            revision = (str(i), self.getFakeHash(), nullid, "content")
+
+            for _ in range(revisionsperpack):
+                chain.append(revision)
+                revision = (
+                    str(i),
+                    self.getFakeHash(),
+                    revision[1],
+                    self.getFakeHash()
+                )
+
+            self.createPack(chain, packdir)
+            deltachains.append(chain)
+
+        class testdatapackstore(datapack.datapackstore):
+            # Ensures that we are not keeping everything in the cache.
+            DEFAULTCACHESIZE = numpacks / 2
+
+        store = testdatapackstore(uimod.ui(), packdir)
+
+        random.shuffle(deltachains)
+        for randomchain in deltachains:
+            revision = random.choice(randomchain)
+            chain = store.getdeltachain(revision[0], revision[1])
+
+            mostrecentpack = next(iter(store.packs), None)
+            self.assertEquals(
+                mostrecentpack.getdeltachain(revision[0], revision[1]),
+                chain
+            )
+
+            self.assertEquals(randomchain.index(revision) + 1, len(chain))
+
+    # perf test off by default since it's slow
+    def _testIndexPerf(self):
+        random.seed(0)
+        print("Multi-get perf test")
+        packsizes = [
+            100,
+            10000,
+            100000,
+            500000,
+            1000000,
+            3000000,
+        ]
+        lookupsizes = [
+            10,
+            100,
+            1000,
+            10000,
+            100000,
+            1000000,
+        ]
+        for packsize in packsizes:
+            revisions = []
+            for i in xrange(packsize):
+                filename = "filename-%s" % i
+                content = "content-%s" % i
+                node = self.getHash(content)
+                revisions.append((filename, node, nullid, content))
+
+            path = self.createPack(revisions).path
+
+            # Perf of large multi-get
+            import gc
+            gc.disable()
+            pack = self.datapackreader(path)
+            for lookupsize in lookupsizes:
+                if lookupsize > packsize:
+                    continue
+                random.shuffle(revisions)
+                findnodes = [(rev[0], rev[1]) for rev in revisions]
+
+                start = time.time()
+                pack.getmissing(findnodes[:lookupsize])
+                elapsed = time.time() - start
+                print ("%s pack %s lookups = %0.04f" %
+                       (('%s' % packsize).rjust(7),
+                        ('%s' % lookupsize).rjust(7),
+                        elapsed))
+
+            print("")
+            gc.enable()
+
+        # The perf test is meant to produce output, so we always fail the test
+        # so the user sees the output.
+        raise RuntimeError("perf test always fails")
+
+class datapacktests(datapacktestsbase, unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        datapacktestsbase.__init__(self, datapack.datapack, True)
+        unittest.TestCase.__init__(self, *args, **kwargs)
+
+# TODO:
+# datapack store:
+# - getmissing
+# - GC two packs into one
+
+if __name__ == '__main__':
+    silenttestrunner.main(__name__)