revlog: use an LRU cache for delta chain bases
Profiling using statprof revealed a hotspot during changegroup
application calculating delta chain bases on generaldelta repos.
Essentially, revlog._addrevision() was performing a lot of redundant
work tracing the delta chain as part of determining when the chain
distance was acceptable. This was most pronounced when adding
revisions to manifests, which can have delta chains thousands of
revisions long.
There was a delta chain base cache on revlogs before, but it only
captured a single revision. This was acceptable before generaldelta,
when _addrevision would build deltas from the previous revision and
thus we'd pretty much guarantee a cache hit when resolving the delta
chain base on a subsequent _addrevision call. However, it isn't
suitable for generaldelta because parent revisions aren't necessarily
the last processed revision.
This patch converts the delta chain base cache to an LRU dict cache.
The cache can hold multiple entries, so generaldelta repos have a
higher chance of getting a cache hit.
The impact of this change when processing changegroup additions is
significant. On a generaldelta conversion of the "mozilla-unified"
repo (which contains heads of the main Firefox repositories in
chronological order - this means there are lots of transitions between
heads in revlog order), this change has the following impact when
performing an `hg unbundle` of an uncompressed bundle of the repo:
before: 5:42 CPU time
after: 4:34 CPU time
Most of this time is saved when applying the changelog and manifest
revlogs:
before: 2:30 CPU time
after: 1:17 CPU time
That nearly a 50% reduction in CPU time applying changesets and
manifests!
Applying a gzipped bundle of the same repo (effectively simulating a
`hg clone` over HTTP) showed a similar speedup:
before: 5:53 CPU time
after: 4:46 CPU time
Wall time improvements were basically the same as CPU time.
I didn't measure explicitly, but it feels like most of the time
is saved when processing manifests. This makes sense, as large
manifests tend to have very long delta chains and thus benefit the
most from this cache.
So, this change effectively makes changegroup application (which is
used by `hg unbundle`, `hg clone`, `hg pull`, `hg unshelve`, and
various other commands) significantly faster when delta chains are
long (which can happen on repos with large numbers of files and thus
large manifests).
In theory, this change can result in more memory utilization. However,
we're caching a dict of ints. At most we have 200 ints + Python object
overhead per revlog. And, the cache is really only populated when
performing read-heavy operations, such as adding changegroups or
scanning an individual revlog. For memory bloat to be an issue, we'd
need to scan/read several revisions from several revlogs all while
having active references to several revlogs. I don't think there are
many operations that do this, so I don't think memory bloat from the
cache will be an issue.
# formatter.py - generic output formatting for mercurial
#
# Copyright 2012 Matt Mackall <mpm@selenic.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
from __future__ import absolute_import
import os
from .i18n import _
from .node import (
hex,
short,
)
from . import (
encoding,
error,
templatekw,
templater,
util,
)
pickle = util.pickle
class baseformatter(object):
def __init__(self, ui, topic, opts):
self._ui = ui
self._topic = topic
self._style = opts.get("style")
self._template = opts.get("template")
self._item = None
# function to convert node to string suitable for this output
self.hexfunc = hex
def __nonzero__(self):
'''return False if we're not doing real templating so we can
skip extra work'''
return True
def _showitem(self):
'''show a formatted item once all data is collected'''
pass
def startitem(self):
'''begin an item in the format list'''
if self._item is not None:
self._showitem()
self._item = {}
@staticmethod
def formatdate(date, fmt='%a %b %d %H:%M:%S %Y %1%2'):
'''convert date tuple to appropriate format'''
return date
@staticmethod
def formatdict(data, key='key', value='value', fmt='%s=%s', sep=' '):
'''convert dict or key-value pairs to appropriate dict format'''
# use plain dict instead of util.sortdict so that data can be
# serialized as a builtin dict in pickle output
return dict(data)
@staticmethod
def formatlist(data, name, fmt='%s', sep=' '):
'''convert iterable to appropriate list format'''
return list(data)
def data(self, **data):
'''insert data into item that's not shown in default output'''
self._item.update(data)
def write(self, fields, deftext, *fielddata, **opts):
'''do default text output while assigning data to item'''
fieldkeys = fields.split()
assert len(fieldkeys) == len(fielddata)
self._item.update(zip(fieldkeys, fielddata))
def condwrite(self, cond, fields, deftext, *fielddata, **opts):
'''do conditional write (primarily for plain formatter)'''
fieldkeys = fields.split()
assert len(fieldkeys) == len(fielddata)
self._item.update(zip(fieldkeys, fielddata))
def plain(self, text, **opts):
'''show raw text for non-templated mode'''
pass
def end(self):
'''end output for the formatter'''
if self._item is not None:
self._showitem()
def _iteritems(data):
'''iterate key-value pairs in stable order'''
if isinstance(data, dict):
return sorted(data.iteritems())
return data
class plainformatter(baseformatter):
'''the default text output scheme'''
def __init__(self, ui, topic, opts):
baseformatter.__init__(self, ui, topic, opts)
if ui.debugflag:
self.hexfunc = hex
else:
self.hexfunc = short
def __nonzero__(self):
return False
def startitem(self):
pass
@staticmethod
def formatdate(date, fmt='%a %b %d %H:%M:%S %Y %1%2'):
'''stringify date tuple in the given format'''
return util.datestr(date, fmt)
@staticmethod
def formatdict(data, key='key', value='value', fmt='%s=%s', sep=' '):
'''stringify key-value pairs separated by sep'''
return sep.join(fmt % (k, v) for k, v in _iteritems(data))
@staticmethod
def formatlist(data, name, fmt='%s', sep=' '):
'''stringify iterable separated by sep'''
return sep.join(fmt % e for e in data)
def data(self, **data):
pass
def write(self, fields, deftext, *fielddata, **opts):
self._ui.write(deftext % fielddata, **opts)
def condwrite(self, cond, fields, deftext, *fielddata, **opts):
'''do conditional write'''
if cond:
self._ui.write(deftext % fielddata, **opts)
def plain(self, text, **opts):
self._ui.write(text, **opts)
def end(self):
pass
class debugformatter(baseformatter):
def __init__(self, ui, topic, opts):
baseformatter.__init__(self, ui, topic, opts)
self._ui.write("%s = [\n" % self._topic)
def _showitem(self):
self._ui.write(" " + repr(self._item) + ",\n")
def end(self):
baseformatter.end(self)
self._ui.write("]\n")
class pickleformatter(baseformatter):
def __init__(self, ui, topic, opts):
baseformatter.__init__(self, ui, topic, opts)
self._data = []
def _showitem(self):
self._data.append(self._item)
def end(self):
baseformatter.end(self)
self._ui.write(pickle.dumps(self._data))
def _jsonifyobj(v):
if isinstance(v, dict):
xs = ['"%s": %s' % (encoding.jsonescape(k), _jsonifyobj(u))
for k, u in sorted(v.iteritems())]
return '{' + ', '.join(xs) + '}'
elif isinstance(v, (list, tuple)):
return '[' + ', '.join(_jsonifyobj(e) for e in v) + ']'
elif v is None:
return 'null'
elif v is True:
return 'true'
elif v is False:
return 'false'
elif isinstance(v, (int, float)):
return str(v)
else:
return '"%s"' % encoding.jsonescape(v)
class jsonformatter(baseformatter):
def __init__(self, ui, topic, opts):
baseformatter.__init__(self, ui, topic, opts)
self._ui.write("[")
self._ui._first = True
def _showitem(self):
if self._ui._first:
self._ui._first = False
else:
self._ui.write(",")
self._ui.write("\n {\n")
first = True
for k, v in sorted(self._item.items()):
if first:
first = False
else:
self._ui.write(",\n")
self._ui.write(' "%s": %s' % (k, _jsonifyobj(v)))
self._ui.write("\n }")
def end(self):
baseformatter.end(self)
self._ui.write("\n]\n")
class templateformatter(baseformatter):
def __init__(self, ui, topic, opts):
baseformatter.__init__(self, ui, topic, opts)
self._topic = topic
self._t = gettemplater(ui, topic, opts.get('template', ''))
def _showitem(self):
g = self._t(self._topic, ui=self._ui, **self._item)
self._ui.write(templater.stringify(g))
@staticmethod
def formatdict(data, key='key', value='value', fmt='%s=%s', sep=' '):
'''build object that can be evaluated as either plain string or dict'''
data = util.sortdict(_iteritems(data))
def f():
yield plainformatter.formatdict(data, key, value, fmt, sep)
return templatekw._hybrid(f(), data, lambda k: {key: k, value: data[k]},
lambda d: fmt % (d[key], d[value]))
@staticmethod
def formatlist(data, name, fmt='%s', sep=' '):
'''build object that can be evaluated as either plain string or list'''
# name is mandatory argument for now, but it could be optional if
# we have default template keyword, e.g. {item}
data = list(data)
def f():
yield plainformatter.formatlist(data, name, fmt, sep)
return templatekw._hybrid(f(), data, lambda x: {name: x},
lambda d: fmt % d[name])
def lookuptemplate(ui, topic, tmpl):
# looks like a literal template?
if '{' in tmpl:
return tmpl, None
# perhaps a stock style?
if not os.path.split(tmpl)[0]:
mapname = (templater.templatepath('map-cmdline.' + tmpl)
or templater.templatepath(tmpl))
if mapname and os.path.isfile(mapname):
return None, mapname
# perhaps it's a reference to [templates]
t = ui.config('templates', tmpl)
if t:
return templater.unquotestring(t), None
if tmpl == 'list':
ui.write(_("available styles: %s\n") % templater.stylelist())
raise error.Abort(_("specify a template"))
# perhaps it's a path to a map or a template
if ('/' in tmpl or '\\' in tmpl) and os.path.isfile(tmpl):
# is it a mapfile for a style?
if os.path.basename(tmpl).startswith("map-"):
return None, os.path.realpath(tmpl)
tmpl = open(tmpl).read()
return tmpl, None
# constant string?
return tmpl, None
def gettemplater(ui, topic, spec):
tmpl, mapfile = lookuptemplate(ui, topic, spec)
assert not (tmpl and mapfile)
if mapfile:
return templater.templater.frommapfile(mapfile)
return maketemplater(ui, topic, tmpl)
def maketemplater(ui, topic, tmpl, filters=None, cache=None):
"""Create a templater from a string template 'tmpl'"""
aliases = ui.configitems('templatealias')
t = templater.templater(filters=filters, cache=cache, aliases=aliases)
if tmpl:
t.cache[topic] = tmpl
return t
def formatter(ui, topic, opts):
template = opts.get("template", "")
if template == "json":
return jsonformatter(ui, topic, opts)
elif template == "pickle":
return pickleformatter(ui, topic, opts)
elif template == "debug":
return debugformatter(ui, topic, opts)
elif template != "":
return templateformatter(ui, topic, opts)
# developer config: ui.formatdebug
elif ui.configbool('ui', 'formatdebug'):
return debugformatter(ui, topic, opts)
# deprecated config: ui.formatjson
elif ui.configbool('ui', 'formatjson'):
return jsonformatter(ui, topic, opts)
return plainformatter(ui, topic, opts)