view mercurial/revlogutils/debug.py @ 49677:05db41701ece

find-delta: pass the cache-delta usage policy alongside the cache-delta The idea is to give higher level code more control to what will happens with the cache delta passed. This should help with controling how we treat delta's from different sources. The final goal of this change is to allow for server modes where the client can blindly accept any server delta without regards to any local constraints. This will be implemented in later changesets.
author Pierre-Yves David <pierre-yves.david@octobus.net>
date Mon, 07 Nov 2022 22:12:59 -0500
parents 4302db0f54c8
children b1e4c74beb6f
line wrap: on
line source

# revlogutils/debug.py - utility used for revlog debuging
#
# Copyright 2005-2007 Olivia Mackall <olivia@selenic.com>
# Copyright 2022 Octobus <contact@octobus.net>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

import collections
import string

from .. import (
    mdiff,
    node as nodemod,
    revlogutils,
    util,
)

from . import (
    constants,
    deltas as deltautil,
)

INDEX_ENTRY_DEBUG_COLUMN = []

NODE_SIZE = object()


class _column_base:
    """constains the definition of a revlog column

    name:         the column header,
    value_func:   the function called to get a value,
    size:         the width of the column,
    verbose_only: only include the column in verbose mode.
    """

    def __init__(self, name, value_func, size=None, verbose=False):
        self.name = name
        self.value_func = value_func
        if size is not NODE_SIZE:
            if size is None:
                size = 8  # arbitrary default
            size = max(len(name), size)
        self._size = size
        self.verbose_only = verbose

    def get_size(self, node_size):
        if self._size is NODE_SIZE:
            return node_size
        else:
            return self._size


def debug_column(name, size=None, verbose=False):
    """decorated function is registered as a column

    name: the name of the column,
    size: the expected size of the column.
    """

    def register(func):
        entry = _column_base(
            name=name,
            value_func=func,
            size=size,
            verbose=verbose,
        )
        INDEX_ENTRY_DEBUG_COLUMN.append(entry)
        return entry

    return register


@debug_column(b"rev", size=6)
def _rev(index, rev, entry, hexfn):
    return b"%d" % rev


@debug_column(b"rank", size=6, verbose=True)
def rank(index, rev, entry, hexfn):
    return b"%d" % entry[constants.ENTRY_RANK]


@debug_column(b"linkrev", size=6)
def _linkrev(index, rev, entry, hexfn):
    return b"%d" % entry[constants.ENTRY_LINK_REV]


@debug_column(b"nodeid", size=NODE_SIZE)
def _nodeid(index, rev, entry, hexfn):
    return hexfn(entry[constants.ENTRY_NODE_ID])


@debug_column(b"p1-rev", size=6, verbose=True)
def _p1_rev(index, rev, entry, hexfn):
    return b"%d" % entry[constants.ENTRY_PARENT_1]


@debug_column(b"p1-nodeid", size=NODE_SIZE)
def _p1_node(index, rev, entry, hexfn):
    parent = entry[constants.ENTRY_PARENT_1]
    p_entry = index[parent]
    return hexfn(p_entry[constants.ENTRY_NODE_ID])


@debug_column(b"p2-rev", size=6, verbose=True)
def _p2_rev(index, rev, entry, hexfn):
    return b"%d" % entry[constants.ENTRY_PARENT_2]


@debug_column(b"p2-nodeid", size=NODE_SIZE)
def _p2_node(index, rev, entry, hexfn):
    parent = entry[constants.ENTRY_PARENT_2]
    p_entry = index[parent]
    return hexfn(p_entry[constants.ENTRY_NODE_ID])


@debug_column(b"full-size", size=20, verbose=True)
def full_size(index, rev, entry, hexfn):
    return b"%d" % entry[constants.ENTRY_DATA_UNCOMPRESSED_LENGTH]


@debug_column(b"delta-base", size=6, verbose=True)
def delta_base(index, rev, entry, hexfn):
    return b"%d" % entry[constants.ENTRY_DELTA_BASE]


@debug_column(b"flags", size=2, verbose=True)
def flags(index, rev, entry, hexfn):
    field = entry[constants.ENTRY_DATA_OFFSET]
    field &= 0xFFFF
    return b"%d" % field


@debug_column(b"comp-mode", size=4, verbose=True)
def compression_mode(index, rev, entry, hexfn):
    return b"%d" % entry[constants.ENTRY_DATA_COMPRESSION_MODE]


@debug_column(b"data-offset", size=20, verbose=True)
def data_offset(index, rev, entry, hexfn):
    field = entry[constants.ENTRY_DATA_OFFSET]
    field >>= 16
    return b"%d" % field


@debug_column(b"chunk-size", size=10, verbose=True)
def data_chunk_size(index, rev, entry, hexfn):
    return b"%d" % entry[constants.ENTRY_DATA_COMPRESSED_LENGTH]


@debug_column(b"sd-comp-mode", size=7, verbose=True)
def sidedata_compression_mode(index, rev, entry, hexfn):
    compression = entry[constants.ENTRY_SIDEDATA_COMPRESSION_MODE]
    if compression == constants.COMP_MODE_PLAIN:
        return b"plain"
    elif compression == constants.COMP_MODE_DEFAULT:
        return b"default"
    elif compression == constants.COMP_MODE_INLINE:
        return b"inline"
    else:
        return b"%d" % compression


@debug_column(b"sidedata-offset", size=20, verbose=True)
def sidedata_offset(index, rev, entry, hexfn):
    return b"%d" % entry[constants.ENTRY_SIDEDATA_OFFSET]


@debug_column(b"sd-chunk-size", size=10, verbose=True)
def sidedata_chunk_size(index, rev, entry, hexfn):
    return b"%d" % entry[constants.ENTRY_SIDEDATA_COMPRESSED_LENGTH]


def debug_index(
    ui,
    repo,
    formatter,
    revlog,
    full_node,
):
    """display index data for a revlog"""
    if full_node:
        hexfn = nodemod.hex
    else:
        hexfn = nodemod.short

    idlen = 12
    for i in revlog:
        idlen = len(hexfn(revlog.node(i)))
        break

    fm = formatter

    header_pieces = []
    for column in INDEX_ENTRY_DEBUG_COLUMN:
        if column.verbose_only and not ui.verbose:
            continue
        size = column.get_size(idlen)
        name = column.name
        header_pieces.append(name.rjust(size))

    fm.plain(b' '.join(header_pieces) + b'\n')

    index = revlog.index

    for rev in revlog:
        fm.startitem()
        entry = index[rev]
        first = True
        for column in INDEX_ENTRY_DEBUG_COLUMN:
            if column.verbose_only and not ui.verbose:
                continue
            if not first:
                fm.plain(b' ')
            first = False

            size = column.get_size(idlen)
            value = column.value_func(index, rev, entry, hexfn)
            display = b"%%%ds" % size
            fm.write(column.name, display, value)
        fm.plain(b'\n')

    fm.end()


def dump(ui, revlog):
    """perform the work for `hg debugrevlog --dump"""
    # XXX seems redundant with debug index ?
    r = revlog
    numrevs = len(r)
    ui.write(
        (
            b"# rev p1rev p2rev start   end deltastart base   p1   p2"
            b" rawsize totalsize compression heads chainlen\n"
        )
    )
    ts = 0
    heads = set()

    for rev in range(numrevs):
        dbase = r.deltaparent(rev)
        if dbase == -1:
            dbase = rev
        cbase = r.chainbase(rev)
        clen = r.chainlen(rev)
        p1, p2 = r.parentrevs(rev)
        rs = r.rawsize(rev)
        ts = ts + rs
        heads -= set(r.parentrevs(rev))
        heads.add(rev)
        try:
            compression = ts / r.end(rev)
        except ZeroDivisionError:
            compression = 0
        ui.write(
            b"%5d %5d %5d %5d %5d %10d %4d %4d %4d %7d %9d "
            b"%11d %5d %8d\n"
            % (
                rev,
                p1,
                p2,
                r.start(rev),
                r.end(rev),
                r.start(dbase),
                r.start(cbase),
                r.start(p1),
                r.start(p2),
                rs,
                ts,
                compression,
                len(heads),
                clen,
            )
        )


def debug_revlog(ui, revlog):
    """code for `hg debugrevlog`"""
    r = revlog
    format = r._format_version
    v = r._format_flags
    flags = []
    gdelta = False
    if v & constants.FLAG_INLINE_DATA:
        flags.append(b'inline')
    if v & constants.FLAG_GENERALDELTA:
        gdelta = True
        flags.append(b'generaldelta')
    if not flags:
        flags = [b'(none)']

    ### the total size of stored content if incompressed.
    full_text_total_size = 0
    ### tracks merge vs single parent
    nummerges = 0

    ### tracks ways the "delta" are build
    # nodelta
    numempty = 0
    numemptytext = 0
    numemptydelta = 0
    # full file content
    numfull = 0
    # intermediate snapshot against a prior snapshot
    numsemi = 0
    # snapshot count per depth
    numsnapdepth = collections.defaultdict(lambda: 0)
    # number of snapshots with a non-ancestor delta
    numsnapdepth_nad = collections.defaultdict(lambda: 0)
    # delta against previous revision
    numprev = 0
    # delta against prev, where prev is a non-ancestor
    numprev_nad = 0
    # delta against first or second parent (not prev)
    nump1 = 0
    nump2 = 0
    # delta against neither prev nor parents
    numother = 0
    # delta against other that is a non-ancestor
    numother_nad = 0
    # delta against prev that are also first or second parent
    # (details of `numprev`)
    nump1prev = 0
    nump2prev = 0

    # data about delta chain of each revs
    chainlengths = []
    chainbases = []
    chainspans = []

    # data about each revision
    datasize = [None, 0, 0]
    fullsize = [None, 0, 0]
    semisize = [None, 0, 0]
    # snapshot count per depth
    snapsizedepth = collections.defaultdict(lambda: [None, 0, 0])
    deltasize = [None, 0, 0]
    chunktypecounts = {}
    chunktypesizes = {}

    def addsize(size, l):
        if l[0] is None or size < l[0]:
            l[0] = size
        if size > l[1]:
            l[1] = size
        l[2] += size

    numrevs = len(r)
    for rev in range(numrevs):
        p1, p2 = r.parentrevs(rev)
        delta = r.deltaparent(rev)
        if format > 0:
            s = r.rawsize(rev)
            full_text_total_size += s
            addsize(s, datasize)
        if p2 != nodemod.nullrev:
            nummerges += 1
        size = r.length(rev)
        if delta == nodemod.nullrev:
            chainlengths.append(0)
            chainbases.append(r.start(rev))
            chainspans.append(size)
            if size == 0:
                numempty += 1
                numemptytext += 1
            else:
                numfull += 1
                numsnapdepth[0] += 1
                addsize(size, fullsize)
                addsize(size, snapsizedepth[0])
        else:
            nad = (
                delta != p1 and delta != p2 and not r.isancestorrev(delta, rev)
            )
            chainlengths.append(chainlengths[delta] + 1)
            baseaddr = chainbases[delta]
            revaddr = r.start(rev)
            chainbases.append(baseaddr)
            chainspans.append((revaddr - baseaddr) + size)
            if size == 0:
                numempty += 1
                numemptydelta += 1
            elif r.issnapshot(rev):
                addsize(size, semisize)
                numsemi += 1
                depth = r.snapshotdepth(rev)
                numsnapdepth[depth] += 1
                if nad:
                    numsnapdepth_nad[depth] += 1
                addsize(size, snapsizedepth[depth])
            else:
                addsize(size, deltasize)
                if delta == rev - 1:
                    numprev += 1
                    if delta == p1:
                        nump1prev += 1
                    elif delta == p2:
                        nump2prev += 1
                    elif nad:
                        numprev_nad += 1
                elif delta == p1:
                    nump1 += 1
                elif delta == p2:
                    nump2 += 1
                elif delta != nodemod.nullrev:
                    numother += 1
                    numother_nad += 1

        # Obtain data on the raw chunks in the revlog.
        if util.safehasattr(r, '_getsegmentforrevs'):
            segment = r._getsegmentforrevs(rev, rev)[1]
        else:
            segment = r._revlog._getsegmentforrevs(rev, rev)[1]
        if segment:
            chunktype = bytes(segment[0:1])
        else:
            chunktype = b'empty'

        if chunktype not in chunktypecounts:
            chunktypecounts[chunktype] = 0
            chunktypesizes[chunktype] = 0

        chunktypecounts[chunktype] += 1
        chunktypesizes[chunktype] += size

    # Adjust size min value for empty cases
    for size in (datasize, fullsize, semisize, deltasize):
        if size[0] is None:
            size[0] = 0

    numdeltas = numrevs - numfull - numempty - numsemi
    numoprev = numprev - nump1prev - nump2prev - numprev_nad
    num_other_ancestors = numother - numother_nad
    totalrawsize = datasize[2]
    datasize[2] /= numrevs
    fulltotal = fullsize[2]
    if numfull == 0:
        fullsize[2] = 0
    else:
        fullsize[2] /= numfull
    semitotal = semisize[2]
    snaptotal = {}
    if numsemi > 0:
        semisize[2] /= numsemi
    for depth in snapsizedepth:
        snaptotal[depth] = snapsizedepth[depth][2]
        snapsizedepth[depth][2] /= numsnapdepth[depth]

    deltatotal = deltasize[2]
    if numdeltas > 0:
        deltasize[2] /= numdeltas
    totalsize = fulltotal + semitotal + deltatotal
    avgchainlen = sum(chainlengths) / numrevs
    maxchainlen = max(chainlengths)
    maxchainspan = max(chainspans)
    compratio = 1
    if totalsize:
        compratio = totalrawsize / totalsize

    basedfmtstr = b'%%%dd\n'
    basepcfmtstr = b'%%%dd %s(%%5.2f%%%%)\n'

    def dfmtstr(max):
        return basedfmtstr % len(str(max))

    def pcfmtstr(max, padding=0):
        return basepcfmtstr % (len(str(max)), b' ' * padding)

    def pcfmt(value, total):
        if total:
            return (value, 100 * float(value) / total)
        else:
            return value, 100.0

    ui.writenoi18n(b'format : %d\n' % format)
    ui.writenoi18n(b'flags  : %s\n' % b', '.join(flags))

    ui.write(b'\n')
    fmt = pcfmtstr(totalsize)
    fmt2 = dfmtstr(totalsize)
    ui.writenoi18n(b'revisions     : ' + fmt2 % numrevs)
    ui.writenoi18n(b'    merges    : ' + fmt % pcfmt(nummerges, numrevs))
    ui.writenoi18n(
        b'    normal    : ' + fmt % pcfmt(numrevs - nummerges, numrevs)
    )
    ui.writenoi18n(b'revisions     : ' + fmt2 % numrevs)
    ui.writenoi18n(b'    empty     : ' + fmt % pcfmt(numempty, numrevs))
    ui.writenoi18n(
        b'                   text  : '
        + fmt % pcfmt(numemptytext, numemptytext + numemptydelta)
    )
    ui.writenoi18n(
        b'                   delta : '
        + fmt % pcfmt(numemptydelta, numemptytext + numemptydelta)
    )
    ui.writenoi18n(
        b'    snapshot  : ' + fmt % pcfmt(numfull + numsemi, numrevs)
    )
    for depth in sorted(numsnapdepth):
        base = b'      lvl-%-3d :       ' % depth
        count = fmt % pcfmt(numsnapdepth[depth], numrevs)
        pieces = [base, count]
        if numsnapdepth_nad[depth]:
            pieces[-1] = count = count[:-1]  # drop the final '\n'
            more = b'  non-ancestor-bases: '
            anc_count = fmt
            anc_count %= pcfmt(numsnapdepth_nad[depth], numsnapdepth[depth])
            pieces.append(more)
            pieces.append(anc_count)
        ui.write(b''.join(pieces))
    ui.writenoi18n(b'    deltas    : ' + fmt % pcfmt(numdeltas, numrevs))
    ui.writenoi18n(b'revision size : ' + fmt2 % totalsize)
    ui.writenoi18n(
        b'    snapshot  : ' + fmt % pcfmt(fulltotal + semitotal, totalsize)
    )
    for depth in sorted(numsnapdepth):
        ui.write(
            (b'      lvl-%-3d :       ' % depth)
            + fmt % pcfmt(snaptotal[depth], totalsize)
        )
    ui.writenoi18n(b'    deltas    : ' + fmt % pcfmt(deltatotal, totalsize))

    letters = string.ascii_letters.encode('ascii')

    def fmtchunktype(chunktype):
        if chunktype == b'empty':
            return b'    %s     : ' % chunktype
        elif chunktype in letters:
            return b'    0x%s (%s)  : ' % (nodemod.hex(chunktype), chunktype)
        else:
            return b'    0x%s      : ' % nodemod.hex(chunktype)

    ui.write(b'\n')
    ui.writenoi18n(b'chunks        : ' + fmt2 % numrevs)
    for chunktype in sorted(chunktypecounts):
        ui.write(fmtchunktype(chunktype))
        ui.write(fmt % pcfmt(chunktypecounts[chunktype], numrevs))
    ui.writenoi18n(b'chunks size   : ' + fmt2 % totalsize)
    for chunktype in sorted(chunktypecounts):
        ui.write(fmtchunktype(chunktype))
        ui.write(fmt % pcfmt(chunktypesizes[chunktype], totalsize))

    ui.write(b'\n')
    b_total = b"%d" % full_text_total_size
    p_total = []
    while len(b_total) > 3:
        p_total.append(b_total[-3:])
        b_total = b_total[:-3]
    p_total.append(b_total)
    p_total.reverse()
    b_total = b' '.join(p_total)

    ui.write(b'\n')
    ui.writenoi18n(b'total-stored-content: %s bytes\n' % b_total)
    ui.write(b'\n')
    fmt = dfmtstr(max(avgchainlen, maxchainlen, maxchainspan, compratio))
    ui.writenoi18n(b'avg chain length  : ' + fmt % avgchainlen)
    ui.writenoi18n(b'max chain length  : ' + fmt % maxchainlen)
    ui.writenoi18n(b'max chain reach   : ' + fmt % maxchainspan)
    ui.writenoi18n(b'compression ratio : ' + fmt % compratio)

    if format > 0:
        ui.write(b'\n')
        ui.writenoi18n(
            b'uncompressed data size (min/max/avg) : %d / %d / %d\n'
            % tuple(datasize)
        )
    ui.writenoi18n(
        b'full revision size (min/max/avg)     : %d / %d / %d\n'
        % tuple(fullsize)
    )
    ui.writenoi18n(
        b'inter-snapshot size (min/max/avg)    : %d / %d / %d\n'
        % tuple(semisize)
    )
    for depth in sorted(snapsizedepth):
        if depth == 0:
            continue
        ui.writenoi18n(
            b'    level-%-3d (min/max/avg)          : %d / %d / %d\n'
            % ((depth,) + tuple(snapsizedepth[depth]))
        )
    ui.writenoi18n(
        b'delta size (min/max/avg)             : %d / %d / %d\n'
        % tuple(deltasize)
    )

    if numdeltas > 0:
        ui.write(b'\n')
        fmt = pcfmtstr(numdeltas)
        fmt2 = pcfmtstr(numdeltas, 4)
        ui.writenoi18n(
            b'deltas against prev  : ' + fmt % pcfmt(numprev, numdeltas)
        )
        if numprev > 0:
            ui.writenoi18n(
                b'    where prev = p1  : ' + fmt2 % pcfmt(nump1prev, numprev)
            )
            ui.writenoi18n(
                b'    where prev = p2  : ' + fmt2 % pcfmt(nump2prev, numprev)
            )
            ui.writenoi18n(
                b'    other-ancestor   : ' + fmt2 % pcfmt(numoprev, numprev)
            )
            ui.writenoi18n(
                b'    unrelated        : ' + fmt2 % pcfmt(numoprev, numprev)
            )
        if gdelta:
            ui.writenoi18n(
                b'deltas against p1    : ' + fmt % pcfmt(nump1, numdeltas)
            )
            ui.writenoi18n(
                b'deltas against p2    : ' + fmt % pcfmt(nump2, numdeltas)
            )
            ui.writenoi18n(
                b'deltas against ancs  : '
                + fmt % pcfmt(num_other_ancestors, numdeltas)
            )
            ui.writenoi18n(
                b'deltas against other : '
                + fmt % pcfmt(numother_nad, numdeltas)
            )


def debug_delta_find(ui, revlog, rev, base_rev=nodemod.nullrev):
    """display the search process for a delta"""
    deltacomputer = deltautil.deltacomputer(
        revlog,
        write_debug=ui.write,
        debug_search=not ui.quiet,
    )

    node = revlog.node(rev)
    p1r, p2r = revlog.parentrevs(rev)
    p1 = revlog.node(p1r)
    p2 = revlog.node(p2r)
    full_text = revlog.revision(rev)
    btext = [full_text]
    textlen = len(btext[0])
    cachedelta = None
    flags = revlog.flags(rev)

    if base_rev != nodemod.nullrev:
        base_text = revlog.revision(base_rev)
        delta = mdiff.textdiff(base_text, full_text)

        cachedelta = (base_rev, delta, constants.DELTA_BASE_REUSE_TRY)
        btext = [None]

    revinfo = revlogutils.revisioninfo(
        node,
        p1,
        p2,
        btext,
        textlen,
        cachedelta,
        flags,
    )

    fh = revlog._datafp()
    deltacomputer.finddeltainfo(revinfo, fh, target_rev=rev)