diff mercurial/revlogutils/rewrite.py @ 47815:b30a53ffbf9b stable

debugcommands: introduce a debug command to repair repos affected by issue6528. This command is quite basic and slow: it loops over all of the filelogs in the repository, checks each revision for corruption, then fixes the affected filelogs. It takes under 25 minutes for Mozilla-Central on my not-top-of-the-line laptop; using the `--to-report` and `--from-report` options should make this pretty tolerable to use, I think. This change also introduces a test for the fix. Differential Revision: https://phab.mercurial-scm.org/D11239
author Raphaël Gomès <rgomes@octobus.net>
date Tue, 27 Jul 2021 21:45:27 +0200
parents 5045ba2a3afd
children 32e21ac3adb1
line wrap: on
line diff
--- a/mercurial/revlogutils/rewrite.py	Mon Aug 09 19:49:57 2021 -0400
+++ b/mercurial/revlogutils/rewrite.py	Tue Jul 27 21:45:27 2021 +0200
@@ -7,6 +7,7 @@
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.
 
+import binascii
 import contextlib
 import os
 
@@ -472,3 +473,224 @@
     new_index_file.write(entry_bin)
     revlog._docket.index_end = new_index_file.tell()
     revlog._docket.data_end = new_data_file.tell()
+
+
+def _get_filename_from_filelog_index(path):
+    """Return the tracked file name encoded in the filelog index `path`.
+
+    Everything after the last `.` (the extension) and everything up to the
+    first `/` (the `data/` prefix) is stripped. Aborts when nothing is left
+    to split on `/` once the extension is gone.
+    """
+    without_ext = path.rsplit(b'.', 1)[0]
+    pieces = without_ext.split(b'/', 1)
+    if len(pieces) >= 2:
+        return pieces[1]
+
+    raise error.Abort(
+        _(b"cannot recognize filelog from filename: '%s'") % path
+    )
+
+
+def _filelog_from_filename(repo, path):
+    """Build the filelog object for `path` on top of `repo.svfs`.
+
+    Stolen from `engine.py`.
+    """
+    # Imported locally: a module-level import would create an import cycle
+    from .. import filelog
+
+    return filelog.filelog(repo.svfs, path)
+
+
+def _write_swapped_parents(repo, rl, rev, offset, fp):
+    """Overwrite the index entry of `rev` in `fp` with its parents swapped.
+
+    The entry is read from the index of revlog `rl`, its p1 and p2 fields
+    are exchanged, and the re-packed entry is written at `offset` in `fp`.
+
+    Raises ProgrammingError when `repo` is not currently locked.
+    """
+    from ..pure import parsers  # avoid cycle
+
+    # Let's be paranoid about it: never rewrite anything without the lock
+    if repo._currentlock(repo._lockref) is None:
+        raise error.ProgrammingError(
+            "repo needs to be locked to rewrite parents"
+        )
+
+    packer = parsers.IndexObject.index_format
+    swapped = list(rl.index[rev])
+    # Fields 5 and 6 are the ones exchanged to swap p1 and p2
+    swapped[5], swapped[6] = swapped[6], swapped[5]
+    raw_entry = packer.pack(*swapped[:8])
+    fp.seek(offset)
+    fp.write(raw_entry)
+
+
+def _reorder_filelog_parents(repo, fl, to_fix):
+    """
+    Swaps p1 and p2 for all `to_fix` revisions of filelog `fl` and writes the
+    new version to disk, overwriting the old one with a rename.
+
+    The index file is first copied to a temporary sibling, the entries are
+    patched in that copy, and the copy is then renamed over the original
+    index. The revlog caches and index are reloaded afterwards. If anything
+    fails before the rename, the temporary copy is removed.
+
+    `to_fix` must be a non-empty collection of revision numbers.
+
+    Raises ProgrammingError if `fl` is not backed by a version 1 revlog.
+    """
+    from ..pure import parsers  # avoid cycle
+
+    ui = repo.ui
+    assert len(to_fix) > 0
+    rl = fl._revlog
+    if rl._format_version != constants.REVLOGV1:
+        msg = "expected version 1 revlog, got version '%d'" % rl._format_version
+        raise error.ProgrammingError(msg)
+
+    opener = rl.opener
+    index_file = rl._indexfile
+    new_file_path = index_file + b'.tmp-parents-fix'
+    repaired_msg = _(b"repaired revision %d of 'filelog %s'\n")
+
+    with ui.uninterruptible():
+        try:
+            util.copyfile(
+                opener.join(index_file),
+                opener.join(new_file_path),
+                checkambig=rl._checkambig,
+            )
+
+            with opener(new_file_path, mode=b"r+") as fp:
+                if rl._inline:
+                    # For an inline revlog the entry offset is not a simple
+                    # multiple of the entry size, so ask the parsed index.
+                    index = parsers.InlinedIndexObject(fp.read())
+                    # Set membership keeps the scan linear in the number of
+                    # revisions even when `to_fix` is a long list.
+                    wanted = set(to_fix)
+                    for rev in fl.revs():
+                        if rev in wanted:
+                            offset = index._calculate_index(rev)
+                            _write_swapped_parents(repo, rl, rev, offset, fp)
+                            ui.write(repaired_msg % (rev, index_file))
+                else:
+                    entry_size = parsers.IndexObject.index_format.size
+                    for rev in to_fix:
+                        offset = rev * entry_size
+                        _write_swapped_parents(repo, rl, rev, offset, fp)
+                        ui.write(repaired_msg % (rev, index_file))
+
+            opener.rename(new_file_path, index_file)
+            rl.clearcaches()
+            rl._loadindex()
+        finally:
+            # `new_file_path` is relative to the opener, like every other
+            # path used above: it has to be joined before being handed to
+            # `util`, otherwise the leftover temporary file is never removed
+            # (and an unrelated file relative to the cwd could be instead).
+            util.tryunlink(opener.join(new_file_path))
+
+
+def _is_revision_affected(ui, fl, filerev, path):
+    """Tell whether revision `filerev` of filelog `fl` is hit by issue6528.
+
+    Mercurial currently (5.9rc0) gives `p1 == nullrev and p2 != nullrev` a
+    special meaning, compared to the reverse, in the context of
+    filelog-based copytracing. issue6528 exists because new code assumed
+    that parent ordering didn't matter. A revision is therefore reported as
+    affected when it carries metadata (which is only used for filelog-based
+    copytracing) and its parents are in the "wrong" order, i.e. only p1 is
+    set.
+
+    `ui` and `path` are not used by the check itself.
+    """
+    try:
+        text = fl.rawdata(filerev)
+    except error.CensoredNodeError:
+        # We don't care about censored nodes as they never carry metadata
+        return False
+
+    if not text.startswith(b'\x01\n'):
+        # No metadata header, parent order is irrelevant
+        return False
+
+    p1, p2 = fl.parentrevs(filerev)
+    return p1 != nullrev and p2 == nullrev
+
+
+def _from_report(ui, repo, context, from_report, dry_run):
+    """
+    Repair the revisions listed in the `from_report` file.
+
+    Each non-empty line of the report holds comma-separated hex filenodes,
+    a space, then the file name. Every listed revision is checked again
+    before being touched, so that a revision whose parents are already
+    well-ordered never gets swapped back into the broken state.
+
+    Nothing is written when `dry_run` is set. `context` is called to obtain
+    the context manager wrapping the whole operation.
+
+    See the doc for `debug_fix_issue6528` for the format documentation.
+    """
+    ui.write(_(b"loading report file '%s'\n") % from_report)
+
+    with context(), open(from_report, mode='rb') as f:
+        for line in f.read().split(b'\n'):
+            if not line:
+                continue
+            filenodes, filename = line.split(b' ', 1)
+            fl = _filelog_from_filename(repo, filename)
+            candidates = {
+                fl.rev(binascii.unhexlify(n)) for n in filenodes.split(b',')
+            }
+
+            # Keep only the revisions that are still in the broken state
+            confirmed = set()
+            for filerev in candidates:
+                if not _is_revision_affected(ui, fl, filerev, filename):
+                    msg = _(b"revision %s of file '%s' is not affected\n")
+                    msg %= (binascii.hexlify(fl.node(filerev)), filename)
+                    ui.warn(msg)
+                    continue
+                msg = b"found affected revision %d for filelog '%s'\n"
+                ui.warn(msg % (filerev, filename))
+                confirmed.add(filerev)
+
+            if not confirmed:
+                msg = _(b"no affected revisions were found for '%s'\n")
+                ui.write(msg % filename)
+                continue
+            if not dry_run:
+                _reorder_filelog_parents(repo, fl, sorted(confirmed))
+
+
+def repair_issue6528(ui, repo, dry_run=False, to_report=None, from_report=None):
+    """Find, and optionally repair, filelog revisions hit by issue6528.
+
+    Without a report, every filelog index (`.i`) of the store is scanned and
+    each of its revisions is checked.
+
+    - `dry_run`: only print the affected revisions; nothing is repaired and
+      no report is written.
+    - `to_report`: path of a file to write instead of repairing. One line is
+      written per affected filelog: comma-separated hex filenodes, a space,
+      then the file name.
+    - `from_report`: path of such a report. The full scan is skipped and
+      only the listed revisions are re-checked and repaired.
+
+    The repository locks are taken whenever repairs may be written.
+    """
+    from .. import store  # avoid cycle
+
+    @contextlib.contextmanager
+    def context():
+        # A dry run, or a scan that only produces a report, never writes to
+        # the store: no need for locking. Replaying a report does write
+        # (unless it is a dry run), whatever the value of `to_report`.
+        read_only = dry_run or (to_report and not from_report)
+        if read_only:
+            yield
+        else:
+            with repo.wlock(), repo.lock():
+                yield
+
+    if from_report:
+        return _from_report(ui, repo, context, from_report, dry_run)
+
+    report_entries = []
+
+    with context():
+        # Only the index files of filelogs are of interest
+        paths = [
+            path
+            for (file_type, path, _e, _s) in repo.store.datafiles()
+            if path.endswith(b'.i') and file_type & store.FILEFLAGS_FILELOG
+        ]
+
+        progress = ui.makeprogress(
+            _(b"looking for affected revisions"),
+            unit=_(b"filelogs"),
+            total=len(paths),
+        )
+        found_nothing = True
+
+        try:
+            for path in paths:
+                progress.increment()
+                filename = _get_filename_from_filelog_index(path)
+                fl = _filelog_from_filename(repo, filename)
+
+                # Set of filerevs (or hex filenodes if `to_report`) that
+                # need fixing
+                to_fix = set()
+                for filerev in fl.revs():
+                    # TODO speed up by looking at the start of the delta
+                    # If it hasn't changed, it's not worth looking at the
+                    # other revs in the same chain
+                    affected = _is_revision_affected(ui, fl, filerev, path)
+                    if not affected:
+                        continue
+                    msg = b"found affected revision %d for filelog '%s'\n"
+                    ui.warn(msg % (filerev, path))
+                    found_nothing = False
+                    if dry_run:
+                        continue
+                    if to_report:
+                        to_fix.add(binascii.hexlify(fl.node(filerev)))
+                    else:
+                        to_fix.add(filerev)
+
+                if to_fix:
+                    to_fix = sorted(to_fix)
+                    if to_report:
+                        report_entries.append((filename, to_fix))
+                    else:
+                        _reorder_filelog_parents(repo, fl, to_fix)
+
+            if found_nothing:
+                ui.write(_(b"no affected revisions were found\n"))
+
+            if to_report and report_entries:
+                with open(to_report, mode="wb") as f:
+                    for path, to_fix in report_entries:
+                        f.write(b"%s %s\n" % (b",".join(to_fix), path))
+        finally:
+            # Close the progress topic even when the scan is cut short by an
+            # error, so it does not linger in the output
+            progress.complete()