view mercurial/changelog.py @ 51983:46afce95e5a5

tests: skip `test-wsgicgi.t` on MSYS. The test is attempting to set `PATH_INFO="/rev/\xe2\x80\x94"` into the environment, which it does. The problem is that when MSYS sees a leading '/' in an environment variable, it thinks it's a unix filesystem path, so it "helpfully" prepends the Windows path to the MSYS root directory before running a non-MSYS process. hgweb would then split this value on '/', so it would get 'C:' instead of 'rev', and return a 400 since that isn't a valid web command. I tried generating a *.bat file, but had trouble running that via `cmd.exe` inside the test. I also tried generating an equivalent *.py launcher that would set the environment variables itself. But there is no `os.environb` on Windows, and the value was getting mangled when put into the script. So, I give up. If it's encoding stuff on Windows, it's probably broken.
author Matt Harbison <matt_harbison@yahoo.com>
date Mon, 07 Oct 2024 13:19:16 -0400
parents f4733654f144
children 13815c9decd4
line wrap: on
line source

# changelog.py - changelog class for mercurial
#
# Copyright 2005-2007 Olivia Mackall <olivia@selenic.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

from __future__ import annotations

import typing

from .i18n import _
from .node import (
    bin,
    hex,
)
from .thirdparty import attr

# Force pytype to use the non-vendored package
if typing.TYPE_CHECKING:
    # noinspection PyPackageRequirements
    import attr

from . import (
    encoding,
    error,
    metadata,
    pycompat,
    revlog,
)
from .utils import (
    dateutil,
    stringutil,
)
from .revlogutils import (
    constants as revlog_constants,
    flagutil,
)

# Extra metadata assumed for every changeset. ``decodeextra`` starts from a
# copy of this mapping, while ``changelogrevision.extra`` hands out this very
# object when a revision stores no extra field, so any mutation of it is
# visible to every such reader.
_defaultextra = {b'branch': b'default'}


def _string_escape(text):
    """Escape backslash, newline, carriage return and NUL bytes in ``text``.

    >>> from .pycompat import bytechr as chr
    >>> d = {b'nl': chr(10), b'bs': chr(92), b'cr': chr(13), b'nul': chr(0)}
    >>> s = b"ab%(nl)scd%(bs)s%(bs)sn%(nul)s12ab%(cr)scd%(bs)s%(nl)s" % d
    >>> s
    'ab\\ncd\\\\\\\\n\\x0012ab\\rcd\\\\\\n'
    >>> res = _string_escape(s)
    >>> s == _string_unescape(res)
    True
    """
    # subset of the string_escape codec
    #
    # The backslash pair comes first: the later substitutions introduce
    # backslashes of their own, which must not be doubled again.
    substitutions = (
        (b'\\', b'\\\\'),
        (b'\n', b'\\n'),
        (b'\r', b'\\r'),
        (b'\0', b'\\0'),
    )
    for raw, escaped in substitutions:
        text = text.replace(raw, escaped)
    return text


def _string_unescape(text):
    """Undo ``_string_escape`` and return the original bytes.

    The two-character sequence backslash + ``0`` is turned back into a NUL
    byte here; all remaining escapes are left to ``stringutil.unescapestr``.
    """
    if b'\\0' not in text:
        return stringutil.unescapestr(text)

    # fix up \0 without getting into trouble with \\0
    #
    # A newline is temporarily inserted after every escaped backslash so that
    # a "0" following one can no longer be mistaken for the second half of a
    # NUL escape; the markers are removed once the NUL escapes are decoded.
    # (presumably the input never holds a raw newline, as ``_string_escape``
    # encodes them -- verify for other producers)
    marked = text.replace(b'\\\\', b'\\\\\n')
    marked = marked.replace(b'\\0', b'\0')
    return stringutil.unescapestr(marked.replace(b'\n', b''))


def decodeextra(text):
    """Decode the NUL-separated extra field into a dict.

    The result starts as a copy of ``_defaultextra``; each non-empty
    ``key:value`` item found in ``text`` is unescaped and overrides or
    extends it.

    >>> from .pycompat import bytechr as chr
    >>> sorted(decodeextra(encodeextra({b'foo': b'bar', b'baz': chr(0) + b'2'})
    ...                    ).items())
    [('baz', '\\x002'), ('branch', 'default'), ('foo', 'bar')]
    >>> sorted(decodeextra(encodeextra({b'foo': b'bar',
    ...                                 b'baz': chr(92) + chr(0) + b'2'})
    ...                    ).items())
    [('baz', '\\\\\\x002'), ('branch', 'default'), ('foo', 'bar')]
    """
    decoded = _defaultextra.copy()
    # empty chunks (e.g. from an empty field) carry no item and are skipped
    for chunk in filter(None, text.split(b'\0')):
        key, value = _string_unescape(chunk).split(b':', 1)
        decoded[key] = value
    return decoded


def encodeextra(d):
    """Serialize mapping ``d`` as escaped ``key:value`` items joined by NUL."""
    # keys must be sorted to produce a deterministic changelog entry
    return b"\0".join(
        _string_escape(b'%s:%s' % (key, d[key])) for key in sorted(d)
    )


def stripdesc(desc):
    """strip trailing whitespace and leading and trailing empty lines"""
    trimmed = (line.rstrip() for line in desc.splitlines())
    # lines are re-joined with a plain "\n" whatever terminator they had
    return b'\n'.join(trimmed).strip(b'\n')


@attr.s
class _changelogrevision:
    """Plain attribute container used for a revision with empty text.

    ``changelogrevision.__new__`` returns an instance of this class, built
    with only ``extra`` and ``manifest``, when it is given an empty text. The
    attribute names match properties of ``changelogrevision``.
    """

    # Extensions might modify _defaultextra, so let the constructor below pass
    # it in
    extra = attr.ib()
    manifest = attr.ib()
    user = attr.ib(default=b'')
    date = attr.ib(default=(0, 0))
    # a fresh list per instance, so instances never share the default
    files = attr.ib(default=attr.Factory(list))
    filesadded = attr.ib(default=None)
    filesremoved = attr.ib(default=None)
    p1copies = attr.ib(default=None)
    p2copies = attr.ib(default=None)
    description = attr.ib(default=b'')
    # (branch name, closed flag); evaluated once, when the class is defined
    branchinfo = attr.ib(default=(_defaultextra[b'branch'], False))


class changelogrevision:
    """Lazy view over the raw text of a single changelog revision.

    A revision text bundles the manifest node, the user, the date, optional
    extra metadata, the list of files and the description. Only the positions
    of the separating newlines are computed at construction time; each field
    is sliced out of the text and decoded when its property is read.
    """

    __slots__ = (
        '_offsets',
        '_text',
        '_sidedata',
        '_cpsd',
        '_changes',
    )

    def __new__(cls, cl, text, sidedata, cpsd):
        # Nothing to parse in an empty text: hand back a plain default object
        # instead of an instance of this class.
        if not text:
            return _changelogrevision(extra=_defaultextra, manifest=cl.nullid)

        # The setup below could live in an __init__; doing it here gives the
        # same result and saves one function call.

        # format used:
        # nodeid\n        : manifest node in ascii
        # user\n          : user, no \n or \r allowed
        # time tz extra\n : date (time is int or float, timezone is int)
        #                 : extra is metadata, encoded and separated by '\0'
        #                 : older versions ignore it
        # files\n\n       : files modified by the cset, no \n or \r allowed
        # (.*)            : comment (free text, ideally utf-8)
        #
        # changelog v0 doesn't use extra

        first = text.index(b'\n')
        second = text.index(b'\n', first + 1)
        third = text.index(b'\n', second + 1)

        # With an empty file list, the third newline is itself the first half
        # of the blank line that introduces the description.
        if text[third + 1 : third + 2] == b'\n':
            blank = third
        else:
            blank = text.index(b'\n\n', third + 1)

        inst = super().__new__(cls)
        inst._offsets = (first, second, third, blank)
        inst._text = text
        inst._sidedata = sidedata
        inst._cpsd = cpsd
        inst._changes = None
        return inst

    @property
    def manifest(self):
        """Binary manifest node, decoded from the first line."""
        end = self._offsets[0]
        return bin(self._text[:end])

    @property
    def user(self):
        """Content of the second line, passed through ``encoding.tolocal``."""
        begin, end = self._offsets[:2]
        return encoding.tolocal(self._text[begin + 1 : end])

    @property
    def _rawdate(self):
        """The first two space-separated fields of the date line, undecoded."""
        _nl1, begin, end, _blank = self._offsets
        return self._text[begin + 1 : end].split(b' ', 2)[:2]

    @property
    def _rawextra(self):
        """Encoded extra field of the date line, or None when there is none."""
        _nl1, begin, end, _blank = self._offsets
        fields = self._text[begin + 1 : end].split(b' ', 2)
        return fields[2] if len(fields) == 3 else None

    @property
    def date(self):
        """The date as a ``(time, timezone)`` tuple."""
        fields = self._rawdate
        seconds = float(fields[0])
        # Various tools did silly things with the timezone.
        try:
            offset = int(fields[1])
        except ValueError:
            offset = 0
        return seconds, offset

    @property
    def extra(self):
        """Extra metadata dict.

        When the revision has no extra field, this is the shared
        ``_defaultextra`` object itself rather than a copy.
        """
        encoded = self._rawextra
        return _defaultextra if encoded is None else decodeextra(encoded)

    @property
    def changes(self):
        """File change information for this revision, computed once.

        Decoded from the sidedata when copies are stored there, otherwise
        assembled from the files/added/removed/copies properties.
        """
        cached = self._changes
        if cached is not None:
            return cached
        if self._cpsd:
            result = metadata.decode_files_sidedata(self._sidedata)
        else:
            result = metadata.ChangingFiles(
                touched=self.files or (),
                added=self.filesadded or (),
                removed=self.filesremoved or (),
                p1_copies=self.p1copies or {},
                p2_copies=self.p2copies or {},
            )
        self._changes = result
        return result

    @property
    def files(self):
        """List of files recorded for the revision (possibly empty)."""
        if self._cpsd:
            return sorted(self.changes.touched)
        _nl1, _nl2, begin, end = self._offsets
        if begin == end:
            # no file lines between the date line and the blank line
            return []
        return self._text[begin + 1 : end].split(b'\n')

    @property
    def filesadded(self):
        """Added files, or None when ``extra`` has no ``filesadded`` entry."""
        if self._cpsd:
            return self.changes.added
        encoded = self.extra.get(b'filesadded')
        if encoded is None:
            return None
        return metadata.decodefileindices(self.files, encoded)

    @property
    def filesremoved(self):
        """Removed files, or None when ``extra`` has no ``filesremoved``."""
        if self._cpsd:
            return self.changes.removed
        encoded = self.extra.get(b'filesremoved')
        if encoded is None:
            return None
        return metadata.decodefileindices(self.files, encoded)

    @property
    def p1copies(self):
        """Copies relative to p1, or None when ``extra`` has no ``p1copies``."""
        if self._cpsd:
            return self.changes.copied_from_p1
        encoded = self.extra.get(b'p1copies')
        if encoded is None:
            return None
        return metadata.decodecopies(self.files, encoded)

    @property
    def p2copies(self):
        """Copies relative to p2, or None when ``extra`` has no ``p2copies``."""
        if self._cpsd:
            return self.changes.copied_from_p2
        encoded = self.extra.get(b'p2copies')
        if encoded is None:
            return None
        return metadata.decodecopies(self.files, encoded)

    @property
    def description(self):
        """Everything after the blank line, through ``encoding.tolocal``."""
        start = self._offsets[3] + 2
        return encoding.tolocal(self._text[start:])

    @property
    def branchinfo(self):
        """``(branch name, closed)`` pair derived from the extra metadata."""
        meta = self.extra
        branch = encoding.tolocal(meta.get(b"branch"))
        return branch, b'close' in meta


class changelog(revlog.revlog):
    """Revlog specialization holding the changeset entries (``00changelog``).

    On top of the base revlog, this class tracks a set of filtered revisions,
    can delay the visibility of index updates until a transaction is
    finalized, and knows how to parse and build changelog entries.
    """

    def __init__(self, opener, trypending=False, concurrencychecker=None):
        """Load a changelog revlog using an opener.

        If ``trypending`` is true, we attempt to load the index from a
        ``00changelog.i.a`` file instead of the default ``00changelog.i``.
        The ``00changelog.i.a`` file contains index (and possibly inline
        revision) data for a transaction that hasn't been finalized yet.
        It exists in a separate file to facilitate readers (such as
        hooks processes) accessing data before a transaction is finalized.

        ``concurrencychecker`` will be passed to the revlog init function, see
        the documentation there.
        """
        revlog.revlog.__init__(
            self,
            opener,
            target=(revlog_constants.KIND_CHANGELOG, None),
            radix=b'00changelog',
            checkambig=True,
            mmaplargeindex=True,
            persistentnodemap=opener.options.get(b'persistent-nodemap', False),
            concurrencychecker=concurrencychecker,
            trypending=trypending,
            may_inline=False,
        )

        if self._initempty and (self._format_version == revlog.REVLOGV1):
            # changelogs don't benefit from generaldelta.

            self._format_flags &= ~revlog.FLAG_GENERALDELTA
            self.delta_config.general_delta = False

        # Delta chains for changelogs tend to be very small because entries
        # tend to be small and don't delta well with each other. So disable
        # delta chains.
        self._storedeltachains = False

        # True while docket writes are being held back (see delayupdate)
        self._v2_delayed = False
        self._filteredrevs = frozenset()
        self._filteredrevs_hashcache = {}
        self._copiesstorage = opener.options.get(b'copies-storage')

    def __contains__(self, rev):
        """Tell whether ``rev`` is an existing, non-filtered revision number."""
        return (0 <= rev < len(self)) and rev not in self._filteredrevs

    @property
    def filteredrevs(self):
        """The frozenset of revisions currently filtered out."""
        return self._filteredrevs

    @filteredrevs.setter
    def filteredrevs(self, val):
        # Ensure all updates go through this function
        assert isinstance(val, frozenset)
        self._filteredrevs = val
        # cached hashes belong to the previous filtered set
        self._filteredrevs_hashcache = {}

    def _write_docket(self, tr):
        """Write the docket, unless docket writes are currently delayed."""
        if not self._v2_delayed:
            super(changelog, self)._write_docket(tr)

    def delayupdate(self, tr):
        """delay visibility of index updates to other readers"""
        assert not self._inner.is_open
        assert not self._may_inline
        # enforce that older changelog that are still inline are split at the
        # first opportunity.
        if self._inline:
            self._enforceinlinesize(tr)
        if self._docket is not None:
            self._v2_delayed = True
        else:
            new_index = self._inner.delay()
            if new_index is not None:
                self._indexfile = new_index
                tr.registertmp(new_index)
        # use "000" as prefix to make sure we run before the splitting of
        # legacy inline changelog.
        tr.addpending(b'000-cl-%i' % id(self), self._writepending)
        tr.addfinalize(b'000-cl-%i' % id(self), self._finalize)

    def _finalize(self, tr):
        """finalize index updates"""
        assert not self._inner.is_open
        if self._docket is not None:
            self._docket.write(tr)
            self._v2_delayed = False
        else:
            new_index_file = self._inner.finalize_pending()
            self._indexfile = new_index_file
            if self._inline:
                msg = 'changelog should not be inline at that point'
                raise error.ProgrammingError(msg)

    def _writepending(self, tr):
        """create a file containing the unfinalized state for
        pretxnchangegroup"""
        assert not self._inner.is_open
        if self._docket:
            any_pending = self._docket.write(tr, pending=True)
            self._v2_delayed = False
        else:
            new_index, any_pending = self._inner.write_pending()
            if new_index is not None:
                self._indexfile = new_index
                tr.registertmp(new_index)
        return any_pending

    def _enforceinlinesize(self, tr):
        """Run the base class inline-size check, except while delaying."""
        if not self.is_delaying:
            revlog.revlog._enforceinlinesize(self, tr)

    def read(self, nodeorrev):
        """Obtain data from a parsed changelog revision.

        Returns a 6-tuple of:

           - manifest node in binary
           - author/user as a localstr
           - date as a 2-tuple of (time, timezone)
           - list of files
           - commit message as a localstr
           - dict of extra metadata

        Unless you need to access all fields, consider calling
        ``changelogrevision`` instead, as it is faster for partial object
        access.
        """
        d = self._revisiondata(nodeorrev)
        sidedata = self.sidedata(nodeorrev)
        copy_sd = self._copiesstorage == b'changeset-sidedata'
        c = changelogrevision(self, d, sidedata, copy_sd)
        return (c.manifest, c.user, c.date, c.files, c.description, c.extra)

    def changelogrevision(self, nodeorrev):
        """Obtain a ``changelogrevision`` for a node or revision."""
        text = self._revisiondata(nodeorrev)
        sidedata = self.sidedata(nodeorrev)
        return changelogrevision(
            self, text, sidedata, self._copiesstorage == b'changeset-sidedata'
        )

    def readfiles(self, nodeorrev):
        """
        short version of read that only returns the files modified by the cset
        """
        text = self.revision(nodeorrev)
        if not text:
            return []
        # everything before the first blank line is the header: manifest,
        # user and date lines, followed by one line per file
        last = text.index(b"\n\n")
        l = text[:last].split(b'\n')
        return l[3:]

    def add(
        self,
        manifest,
        files,
        desc,
        transaction,
        p1,
        p2,
        user,
        date=None,
        extra=None,
    ):
        """Build a changelog entry and store it as a new revision.

        ``manifest`` is the binary manifest node. ``files`` must provide
        ``touched`` (and ``has_copies_info`` when copies are stored in
        changeset sidedata). ``user`` and ``desc`` are converted with
        ``encoding.fromlocal``. ``date`` is parsed with
        ``dateutil.parsedate``; when it is empty the current date is used.
        ``extra`` is an optional mapping of extra metadata; it is never
        modified by this method.

        Returns the node of the newly added revision.

        Raises ``error.StorageError`` for an empty username, a username
        containing a newline, or a reserved branch name in ``extra``.
        """
        # Convert to UTF-8 encoded bytestrings as the very first
        # thing: calling any method on a localstr object will turn it
        # into a str object and the cached UTF-8 string is thus lost.
        user, desc = encoding.fromlocal(user), encoding.fromlocal(desc)

        user = user.strip()
        # An empty username or a username with a "\n" will make the
        # revision text contain two "\n\n" sequences -> corrupt
        # repository since read cannot unpack the revision.
        if not user:
            raise error.StorageError(_(b"empty username"))
        if b"\n" in user:
            raise error.StorageError(
                _(b"username %r contains a newline") % pycompat.bytestr(user)
            )

        desc = stripdesc(desc)

        if date:
            parseddate = b"%d %d" % dateutil.parsedate(date)
        else:
            parseddate = b"%d %d" % dateutil.makedate()
        if extra:
            branch = extra.get(b"branch")
            if branch in (b"default", b""):
                # The default branch is not written to the entry. Drop the
                # key from a private copy so that the mapping owned by the
                # caller is left untouched.
                extra = dict(extra)
                del extra[b"branch"]
            elif branch in (b".", b"null", b"tip"):
                raise error.StorageError(
                    _(b'the name \'%s\' is reserved') % branch
                )
        sortedfiles = sorted(files.touched)
        flags = 0
        sidedata = None
        if self._copiesstorage == b'changeset-sidedata':
            if files.has_copies_info:
                flags |= flagutil.REVIDX_HASCOPIESINFO
            sidedata = metadata.encode_files_sidedata(files)

        # ``extra`` may have become empty once the branch entry was dropped,
        # in which case nothing is appended to the date line
        if extra:
            extra = encodeextra(extra)
            parseddate = b"%s %s" % (parseddate, extra)
        l = [hex(manifest), user, parseddate] + sortedfiles + [b"", desc]
        text = b"\n".join(l)
        rev = self.addrevision(
            text, transaction, len(self), p1, p2, sidedata=sidedata, flags=flags
        )
        return self.node(rev)

    def branchinfo(self, rev):
        """return the branch name and open/close state of a revision

        This function exists because creating a changectx object
        just to access this is costly."""
        return self.changelogrevision(rev).branchinfo

    def _nodeduplicatecallback(self, transaction, rev):
        """Record ``rev`` under ``revduplicates`` in ``transaction.changes``."""
        # keep track of revisions that got "re-added", eg: unbundle of a
        # known rev.
        #
        # We track them in a list to preserve their order from the source bundle
        duplicates = transaction.changes.setdefault(b'revduplicates', [])
        duplicates.append(rev)