i18n: cache the result of every gettext call
In looking at profiler output for 'hg log' on mozilla-central, I
noticed we spent a _huge_ amount of time in gettext relative to what
it's doing. Caching provides a roughly 15% performance improvement
even on repositories as small as hg.
== hg repo on linux ==
Before:
% cumulative self
time seconds seconds name
5.05 0.19 0.19 i18n.py:62:gettext
4.84 0.18 0.18 revlog.py:88:decompress
2.95 0.17 0.11 changelog.py:201:node
2.32 0.09 0.09 ui.py:577:write
2.11 0.08 0.08 i18n.py:72:gettext
2.11 0.08 0.08 obsolete.py:196:_fm0readmarkers
1.89 0.07 0.07 obsolete.py:569:_load
1.68 0.63 0.06 localrepo.py:29:__get__
real 0m4.026s
user 0m3.993s
sys 0m0.034s
After:
% cumulative self
time seconds seconds name
8.05 0.26 0.26 revlog.py:88:decompress
2.68 0.22 0.09 color.py:395:write
2.20 0.07 0.07 obsolete.py:196:_fm0readmarkers
1.95 0.06 0.06 obsolete.py:174:_fm0readmarkers
1.95 0.06 0.06 ui.py:577:write
1.95 0.06 0.06 util.py:1228:datestr
1.71 0.06 0.06 utf_8.py:16:decode
1.71 0.06 0.06 revlog.py:273:__len__
real 0m3.519s
user 0m3.447s
sys 0m0.073s
== mozilla-central repo on linux ==
Before:
% cumulative self
time seconds seconds name
7.72 2.35 2.35 revlog.py:88:decompress
4.46 1.36 1.36 i18n.py:62:gettext
2.22 0.67 0.67 i18n.py:72:gettext
2.19 1.14 0.67 changelog.py:201:node
2.16 0.66 0.66 ui.py:577:write
1.96 0.60 0.60 utf_8.py:16:decode
1.93 1.97 0.59 color.py:395:write
1.85 0.81 0.56 changelog.py:136:tip
real 0m30.822s
user 0m30.660s
sys 0m0.149s
After:
% cumulative self
time seconds seconds name
9.82 2.49 2.49 revlog.py:88:decompress
2.67 1.31 0.68 localrepo.py:29:__get__
2.57 0.65 0.65 utf_8.py:16:decode
2.48 1.01 0.63 changelog.py:201:node
2.10 0.82 0.53 changelog.py:136:tip
2.01 0.51 0.51 ui.py:577:write
1.91 0.49 0.49 util.py:1232:datestr
1.85 1.65 0.47 color.py:395:write
real 0m25.619s
user 0m25.446s
sys 0m0.166s
== cpython repo on os x =
Before:
% cumulative self
time seconds seconds name
5.05 1.35 1.35 cmdutil.py:982:_show
4.59 1.22 1.22 revlog.py:274:__len__
3.98 1.06 1.06 i18n.py:62:gettext
3.91 1.04 1.04 revlog.py:1016:revision
3.68 0.98 0.98 revlog.py:337:parents
3.45 0.92 0.92 revlog.py:88:decompress
2.91 0.78 0.78 revlog.py:309:rev
2.62 0.70 0.70 revlog.py:1033:revision
real 0m30.414s
user 0m28.145s
sys 0m0.541s
After:
% cumulative self
time seconds seconds name
7.98 1.66 1.66 cmdutil.py:982:_show
6.83 1.42 1.42 changelog.py:46:decodeextra
5.18 1.08 1.08 revlog.py:274:__len__
3.94 0.82 0.82 revlog.py:1016:revision
3.41 0.71 0.71 revlog.py:309:rev
3.32 0.69 0.69 revlog.py:88:decompress
2.99 0.63 0.62 revlog.py:1033:revision
2.69 0.56 0.56 revlog.py:341:start
real 0m22.811s
user 0m21.883s
sys 0m0.397s
import ast
import os
import sys
# Import a minimal set of stdlib modules needed for list_stdlib_modules()
# to work when run from a virtualenv. The modules were chosen empirically
# so that the return value matches the return value without virtualenv.
import BaseHTTPServer
import zlib
def dotted_name_of_path(path, trimpure=False):
"""Given a relative path to a source file, return its dotted module name.
>>> dotted_name_of_path('mercurial/error.py')
'mercurial.error'
>>> dotted_name_of_path('mercurial/pure/parsers.py', trimpure=True)
'mercurial.parsers'
>>> dotted_name_of_path('zlibmodule.so')
'zlib'
"""
parts = path.split('/')
parts[-1] = parts[-1].split('.', 1)[0] # remove .py and .so and .ARCH.so
if parts[-1].endswith('module'):
parts[-1] = parts[-1][:-6]
if trimpure:
return '.'.join(p for p in parts if p != 'pure')
return '.'.join(parts)
def list_stdlib_modules():
"""List the modules present in the stdlib.
>>> mods = set(list_stdlib_modules())
>>> 'BaseHTTPServer' in mods
True
os.path isn't really a module, so it's missing:
>>> 'os.path' in mods
False
sys requires special treatment, because it's baked into the
interpreter, but it should still appear:
>>> 'sys' in mods
True
>>> 'collections' in mods
True
>>> 'cStringIO' in mods
True
"""
for m in sys.builtin_module_names:
yield m
# These modules only exist on windows, but we should always
# consider them stdlib.
for m in ['msvcrt', '_winreg']:
yield m
# These get missed too
for m in 'ctypes', 'email':
yield m
yield 'builtins' # python3 only
stdlib_prefixes = set([sys.prefix, sys.exec_prefix])
# We need to supplement the list of prefixes for the search to work
# when run from within a virtualenv.
for mod in (BaseHTTPServer, zlib):
try:
# Not all module objects have a __file__ attribute.
filename = mod.__file__
except AttributeError:
continue
dirname = os.path.dirname(filename)
for prefix in stdlib_prefixes:
if dirname.startswith(prefix):
# Then this directory is redundant.
break
else:
stdlib_prefixes.add(dirname)
for libpath in sys.path:
# We want to walk everything in sys.path that starts with
# something in stdlib_prefixes. check-code suppressed because
# the ast module used by this script implies the availability
# of any().
if not any(libpath.startswith(p) for p in stdlib_prefixes): # no-py24
continue
if 'site-packages' in libpath:
continue
for top, dirs, files in os.walk(libpath):
for name in files:
if name == '__init__.py':
continue
if not (name.endswith('.py') or name.endswith('.so')):
continue
full_path = os.path.join(top, name)
if 'site-packages' in full_path:
continue
rel_path = full_path[len(libpath) + 1:]
mod = dotted_name_of_path(rel_path)
yield mod
stdlib_modules = set(list_stdlib_modules())
def imported_modules(source, ignore_nested=False):
"""Given the source of a file as a string, yield the names
imported by that file.
Args:
source: The python source to examine as a string.
ignore_nested: If true, import statements that do not start in
column zero will be ignored.
Returns:
A list of module names imported by the given source.
>>> sorted(imported_modules(
... 'import foo ; from baz import bar; import foo.qux'))
['baz.bar', 'foo', 'foo.qux']
>>> sorted(imported_modules(
... '''import foo
... def wat():
... import bar
... ''', ignore_nested=True))
['foo']
"""
for node in ast.walk(ast.parse(source)):
if ignore_nested and getattr(node, 'col_offset', 0) > 0:
continue
if isinstance(node, ast.Import):
for n in node.names:
yield n.name
elif isinstance(node, ast.ImportFrom):
prefix = node.module + '.'
for n in node.names:
yield prefix + n.name
def verify_stdlib_on_own_line(source):
"""Given some python source, verify that stdlib imports are done
in separate statements from relative local module imports.
Observing this limitation is important as it works around an
annoying lib2to3 bug in relative import rewrites:
http://bugs.python.org/issue19510.
>>> list(verify_stdlib_on_own_line('import sys, foo'))
['mixed imports\\n stdlib: sys\\n relative: foo']
>>> list(verify_stdlib_on_own_line('import sys, os'))
[]
>>> list(verify_stdlib_on_own_line('import foo, bar'))
[]
"""
for node in ast.walk(ast.parse(source)):
if isinstance(node, ast.Import):
from_stdlib = {False: [], True: []}
for n in node.names:
from_stdlib[n.name in stdlib_modules].append(n.name)
if from_stdlib[True] and from_stdlib[False]:
yield ('mixed imports\n stdlib: %s\n relative: %s' %
(', '.join(sorted(from_stdlib[True])),
', '.join(sorted(from_stdlib[False]))))
class CircularImport(Exception):
pass
def cyclekey(names):
return tuple(sorted(set(names)))
def check_one_mod(mod, imports, path=None, ignore=None):
if path is None:
path = []
if ignore is None:
ignore = []
path = path + [mod]
for i in sorted(imports.get(mod, [])):
if i not in stdlib_modules and not i.startswith('mercurial.'):
i = mod.rsplit('.', 1)[0] + '.' + i
if i in path:
firstspot = path.index(i)
cycle = path[firstspot:] + [i]
if cyclekey(cycle) not in ignore:
raise CircularImport(cycle)
continue
check_one_mod(i, imports, path=path, ignore=ignore)
def rotatecycle(cycle):
"""arrange a cycle so that the lexicographically first module listed first
>>> rotatecycle(['foo', 'bar', 'foo'])
['bar', 'foo', 'bar']
"""
lowest = min(cycle)
idx = cycle.index(lowest)
return cycle[idx:] + cycle[1:idx] + [lowest]
def find_cycles(imports):
"""Find cycles in an already-loaded import graph.
>>> imports = {'top.foo': ['bar', 'os.path', 'qux'],
... 'top.bar': ['baz', 'sys'],
... 'top.baz': ['foo'],
... 'top.qux': ['foo']}
>>> print '\\n'.join(sorted(find_cycles(imports)))
top.bar -> top.baz -> top.foo -> top.bar -> top.bar
top.foo -> top.qux -> top.foo -> top.foo
"""
cycles = {}
for mod in sorted(imports.iterkeys()):
try:
check_one_mod(mod, imports, ignore=cycles)
except CircularImport, e:
cycle = e.args[0]
cycles[cyclekey(cycle)] = ' -> '.join(rotatecycle(cycle))
return cycles.values()
def _cycle_sortkey(c):
return len(c), c
def main(argv):
if len(argv) < 2:
print 'Usage: %s file [file] [file] ...'
return 1
used_imports = {}
any_errors = False
for source_path in argv[1:]:
f = open(source_path)
modname = dotted_name_of_path(source_path, trimpure=True)
src = f.read()
used_imports[modname] = sorted(
imported_modules(src, ignore_nested=True))
for error in verify_stdlib_on_own_line(src):
any_errors = True
print source_path, error
f.close()
cycles = find_cycles(used_imports)
if cycles:
firstmods = set()
for c in sorted(cycles, key=_cycle_sortkey):
first = c.split()[0]
# As a rough cut, ignore any cycle that starts with the
# same module as some other cycle. Otherwise we see lots
# of cycles that are effectively duplicates.
if first in firstmods:
continue
print 'Import cycle:', c
firstmods.add(first)
any_errors = True
return not any_errors
if __name__ == '__main__':
sys.exit(int(main(sys.argv)))