view mercurial/verify.py @ 27015:341cb90ffd18

util: disable floating point stat times (issue4836) Alternate fix for this issue which avoids putting extra function calls and exception handling in the fast path. For almost all purposes, integer timestamps are preferable to Mercurial. It stores integer timestamps in the dirstate and would thus like to avoid doing any float/int comparisons or conversions. We will continue to have to deal with 1-second granularity on filesystems for quite some time, so this won't significantly hinder our capabilities. This has some impact on our file cache validation code in that it lowers timestamp resolution. But as we still have to deal with low-resolution filesystems, we're not relying on this anyway. An alternate approach is to use stat[ST_MTIME], which is guaranteed to be an integer. But since this support isn't already in our extension, we can't depend on it being available without adding a hard Python->C API dependency that's painful for people like yours truly who have bisect regularly and people without compilers.
author Matt Mackall <mpm@selenic.com>
date Thu, 19 Nov 2015 13:21:24 -0600
parents d1c741644d25
children 937e73a6e4ff
line wrap: on
line source

# verify.py - repository integrity checking for Mercurial
#
# Copyright 2006, 2007 Matt Mackall <mpm@selenic.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

from __future__ import absolute_import

import os

from .i18n import _
from .node import (
    nullid,
    short,
)

from . import (
    error,
    revlog,
    util,
)

def verify(repo):
    lock = repo.lock()
    try:
        return _verify(repo)
    finally:
        lock.release()

def _normpath(f):
    # under hg < 2.4, convert didn't sanitize paths properly, so a
    # converted repo may contain repeated slashes
    while '//' in f:
        f = f.replace('//', '/')
    return f

def _validpath(repo, path):
    """Returns False if a path should NOT be treated as part of a repo.

    For all in-core cases, this returns True, as we have no way for a
    path to be mentioned in the history but not actually be
    relevant. For narrow clones, this is important because many
    filelogs will be missing, and changelog entries may mention
    modified files that are outside the narrow scope.
    """
    return True

def _verify(repo):
    repo = repo.unfiltered()
    mflinkrevs = {}
    filelinkrevs = {}
    filenodes = {}
    revisions = 0
    badrevs = set()
    errors = [0]
    warnings = [0]
    ui = repo.ui
    cl = repo.changelog
    mf = repo.manifest
    lrugetctx = util.lrucachefunc(repo.changectx)

    if not repo.url().startswith('file:'):
        raise error.Abort(_("cannot verify bundle or remote repos"))

    def err(linkrev, msg, filename=None):
        if linkrev is not None:
            badrevs.add(linkrev)
        else:
            linkrev = '?'
        msg = "%s: %s" % (linkrev, msg)
        if filename:
            msg = "%s@%s" % (filename, msg)
        ui.warn(" " + msg + "\n")
        errors[0] += 1

    def exc(linkrev, msg, inst, filename=None):
        if isinstance(inst, KeyboardInterrupt):
            ui.warn(_("interrupted"))
            raise
        if not str(inst):
            inst = repr(inst)
        err(linkrev, "%s: %s" % (msg, inst), filename)

    def warn(msg):
        ui.warn(msg + "\n")
        warnings[0] += 1

    def checklog(obj, name, linkrev):
        if not len(obj) and (havecl or havemf):
            err(linkrev, _("empty or missing %s") % name)
            return

        d = obj.checksize()
        if d[0]:
            err(None, _("data length off by %d bytes") % d[0], name)
        if d[1]:
            err(None, _("index contains %d extra bytes") % d[1], name)

        if obj.version != revlog.REVLOGV0:
            if not revlogv1:
                warn(_("warning: `%s' uses revlog format 1") % name)
        elif revlogv1:
            warn(_("warning: `%s' uses revlog format 0") % name)

    def checkentry(obj, i, node, seen, linkrevs, f):
        lr = obj.linkrev(obj.rev(node))
        if lr < 0 or (havecl and lr not in linkrevs):
            if lr < 0 or lr >= len(cl):
                msg = _("rev %d points to nonexistent changeset %d")
            else:
                msg = _("rev %d points to unexpected changeset %d")
            err(None, msg % (i, lr), f)
            if linkrevs:
                if f and len(linkrevs) > 1:
                    try:
                        # attempt to filter down to real linkrevs
                        linkrevs = [l for l in linkrevs
                                    if lrugetctx(l)[f].filenode() == node]
                    except Exception:
                        pass
                warn(_(" (expected %s)") % " ".join(map(str, linkrevs)))
            lr = None # can't be trusted

        try:
            p1, p2 = obj.parents(node)
            if p1 not in seen and p1 != nullid:
                err(lr, _("unknown parent 1 %s of %s") %
                    (short(p1), short(node)), f)
            if p2 not in seen and p2 != nullid:
                err(lr, _("unknown parent 2 %s of %s") %
                    (short(p2), short(node)), f)
        except Exception as inst:
            exc(lr, _("checking parents of %s") % short(node), inst, f)

        if node in seen:
            err(lr, _("duplicate revision %d (%d)") % (i, seen[node]), f)
        seen[node] = i
        return lr

    if os.path.exists(repo.sjoin("journal")):
        ui.warn(_("abandoned transaction found - run hg recover\n"))

    revlogv1 = cl.version != revlog.REVLOGV0
    if ui.verbose or not revlogv1:
        ui.status(_("repository uses revlog format %d\n") %
                       (revlogv1 and 1 or 0))

    havecl = len(cl) > 0
    havemf = len(mf) > 0

    ui.status(_("checking changesets\n"))
    refersmf = False
    seen = {}
    checklog(cl, "changelog", 0)
    total = len(repo)
    for i in repo:
        ui.progress(_('checking'), i, total=total, unit=_('changesets'))
        n = cl.node(i)
        checkentry(cl, i, n, seen, [i], "changelog")

        try:
            changes = cl.read(n)
            if changes[0] != nullid:
                mflinkrevs.setdefault(changes[0], []).append(i)
                refersmf = True
            for f in changes[3]:
                if _validpath(repo, f):
                    filelinkrevs.setdefault(_normpath(f), []).append(i)
        except Exception as inst:
            refersmf = True
            exc(i, _("unpacking changeset %s") % short(n), inst)
    ui.progress(_('checking'), None)

    ui.status(_("checking manifests\n"))
    seen = {}
    if refersmf:
        # Do not check manifest if there are only changelog entries with
        # null manifests.
        checklog(mf, "manifest", 0)
    total = len(mf)
    for i in mf:
        ui.progress(_('checking'), i, total=total, unit=_('manifests'))
        n = mf.node(i)
        lr = checkentry(mf, i, n, seen, mflinkrevs.get(n, []), "manifest")
        if n in mflinkrevs:
            del mflinkrevs[n]
        else:
            err(lr, _("%s not in changesets") % short(n), "manifest")

        try:
            for f, fn in mf.readdelta(n).iteritems():
                if not f:
                    err(lr, _("file without name in manifest"))
                elif f != "/dev/null": # ignore this in very old repos
                    if _validpath(repo, f):
                        filenodes.setdefault(
                            _normpath(f), {}).setdefault(fn, lr)
        except Exception as inst:
            exc(lr, _("reading manifest delta %s") % short(n), inst)
    ui.progress(_('checking'), None)

    ui.status(_("crosschecking files in changesets and manifests\n"))

    total = len(mflinkrevs) + len(filelinkrevs) + len(filenodes)
    count = 0
    if havemf:
        for c, m in sorted([(c, m) for m in mflinkrevs
                            for c in mflinkrevs[m]]):
            count += 1
            if m == nullid:
                continue
            ui.progress(_('crosschecking'), count, total=total)
            err(c, _("changeset refers to unknown manifest %s") % short(m))
        mflinkrevs = None # del is bad here due to scope issues

        for f in sorted(filelinkrevs):
            count += 1
            ui.progress(_('crosschecking'), count, total=total)
            if f not in filenodes:
                lr = filelinkrevs[f][0]
                err(lr, _("in changeset but not in manifest"), f)

    if havecl:
        for f in sorted(filenodes):
            count += 1
            ui.progress(_('crosschecking'), count, total=total)
            if f not in filelinkrevs:
                try:
                    fl = repo.file(f)
                    lr = min([fl.linkrev(fl.rev(n)) for n in filenodes[f]])
                except Exception:
                    lr = None
                err(lr, _("in manifest but not in changeset"), f)

    ui.progress(_('crosschecking'), None)

    ui.status(_("checking files\n"))

    storefiles = set()
    for f, f2, size in repo.store.datafiles():
        if not f:
            err(None, _("cannot decode filename '%s'") % f2)
        elif size > 0 or not revlogv1:
            storefiles.add(_normpath(f))

    fncachewarned = False
    files = sorted(set(filenodes) | set(filelinkrevs))
    total = len(files)
    for i, f in enumerate(files):
        ui.progress(_('checking'), i, item=f, total=total)
        try:
            linkrevs = filelinkrevs[f]
        except KeyError:
            # in manifest but not in changelog
            linkrevs = []

        if linkrevs:
            lr = linkrevs[0]
        else:
            lr = None

        try:
            fl = repo.file(f)
        except error.RevlogError as e:
            err(lr, _("broken revlog! (%s)") % e, f)
            continue

        for ff in fl.files():
            try:
                storefiles.remove(ff)
            except KeyError:
                warn(_(" warning: revlog '%s' not in fncache!") % ff)
                fncachewarned = True

        checklog(fl, f, lr)
        seen = {}
        rp = None
        for i in fl:
            revisions += 1
            n = fl.node(i)
            lr = checkentry(fl, i, n, seen, linkrevs, f)
            if f in filenodes:
                if havemf and n not in filenodes[f]:
                    err(lr, _("%s not in manifests") % (short(n)), f)
                else:
                    del filenodes[f][n]

            # verify contents
            try:
                l = len(fl.read(n))
                rp = fl.renamed(n)
                if l != fl.size(i):
                    if len(fl.revision(n)) != fl.size(i):
                        err(lr, _("unpacked size is %s, %s expected") %
                            (l, fl.size(i)), f)
            except error.CensoredNodeError:
                # experimental config: censor.policy
                if ui.config("censor", "policy", "abort") == "abort":
                    err(lr, _("censored file data"), f)
            except Exception as inst:
                exc(lr, _("unpacking %s") % short(n), inst, f)

            # check renames
            try:
                if rp:
                    if lr is not None and ui.verbose:
                        ctx = lrugetctx(lr)
                        found = False
                        for pctx in ctx.parents():
                            if rp[0] in pctx:
                                found = True
                                break
                        if not found:
                            warn(_("warning: copy source of '%s' not"
                                   " in parents of %s") % (f, ctx))
                    fl2 = repo.file(rp[0])
                    if not len(fl2):
                        err(lr, _("empty or missing copy source revlog %s:%s")
                            % (rp[0], short(rp[1])), f)
                    elif rp[1] == nullid:
                        ui.note(_("warning: %s@%s: copy source"
                                  " revision is nullid %s:%s\n")
                            % (f, lr, rp[0], short(rp[1])))
                    else:
                        fl2.rev(rp[1])
            except Exception as inst:
                exc(lr, _("checking rename of %s") % short(n), inst, f)

        # cross-check
        if f in filenodes:
            fns = [(lr, n) for n, lr in filenodes[f].iteritems()]
            for lr, node in sorted(fns):
                err(lr, _("%s in manifests not found") % short(node), f)
    ui.progress(_('checking'), None)

    for f in storefiles:
        warn(_("warning: orphan revlog '%s'") % f)

    ui.status(_("%d files, %d changesets, %d total revisions\n") %
                   (len(files), len(cl), revisions))
    if warnings[0]:
        ui.warn(_("%d warnings encountered!\n") % warnings[0])
    if fncachewarned:
        ui.warn(_('hint: run "hg debugrebuildfncache" to recover from '
                  'corrupt fncache\n'))
    if errors[0]:
        ui.warn(_("%d integrity errors encountered!\n") % errors[0])
        if badrevs:
            ui.warn(_("(first damaged changeset appears to be %d)\n")
                    % min(badrevs))
        return 1