mercurial/__init__.py
changeset 29550 1c22400db72d
parent 29490 b4d117cee636
child 29800 178c89e8519a
--- a/mercurial/__init__.py	Fri Jul 15 23:54:56 2016 +0900
+++ b/mercurial/__init__.py	Mon Jul 04 11:18:03 2016 -0700
@@ -121,9 +121,238 @@
         sys.modules[name] = mod
         return mod
 
+# Python 3 uses a custom module loader that transforms source code between
+# source file reading and compilation. This is done by registering a custom
+# finder that changes the spec for Mercurial modules to use a custom loader.
+if sys.version_info[0] >= 3:
+    from . import pure
+    import importlib
+    import io
+    import token
+    import tokenize
+
+    class hgpathentryfinder(importlib.abc.MetaPathFinder):
+        """A sys.meta_path finder that uses a custom module loader."""
+        def find_spec(self, fullname, path, target=None):
+            # Only handle Mercurial-related modules.
+            if not fullname.startswith(('mercurial.', 'hgext.', 'hgext3rd.')):
+                return None
+
+            # This assumes Python 3 doesn't support loading C modules.
+            if fullname in _dualmodules:
+                stem = fullname.split('.')[-1]
+                fullname = 'mercurial.pure.%s' % stem
+                target = pure
+                assert len(path) == 1
+                path = [os.path.join(path[0], 'pure')]
+
+            # Try to find the module using other registered finders.
+            spec = None
+            for finder in sys.meta_path:
+                if finder == self:
+                    continue
+
+                spec = finder.find_spec(fullname, path, target=target)
+                if spec:
+                    break
+
+            # This is a Mercurial-related module but we couldn't find it
+            # using the previously-registered finders. This likely means
+            # the module doesn't exist.
+            if not spec:
+                return None
+
+            if fullname.startswith('mercurial.pure.'):
+                spec.name = spec.name.replace('.pure.', '.')
+
+            # TODO need to support loaders from alternate specs, like zip
+            # loaders.
+            spec.loader = hgloader(spec.name, spec.origin)
+            return spec
+
+    def replacetokens(tokens):
+        """Transform a stream of tokens from raw to Python 3.
+
+        It is called by the custom module loading machinery to rewrite
+        source/tokens between source decoding and compilation.
+
+        Returns a generator of possibly rewritten tokens.
+
+        The input token list may be mutated as part of processing. However,
+        its changes do not necessarily match the output token stream.
+
+        REMEMBER TO CHANGE ``BYTECODEHEADER`` WHEN CHANGING THIS FUNCTION
+        OR CACHED FILES WON'T GET INVALIDATED PROPERLY.
+        """
+        for i, t in enumerate(tokens):
+            # Convert most string literals to byte literals. String literals
+            # in Python 2 are bytes. String literals in Python 3 are unicode.
+            # Most strings in Mercurial are bytes and unicode strings are rare.
+            # Rather than rewrite all string literals to use ``b''`` to indicate
+            # byte strings, we apply this token transformer to insert the ``b``
+            # prefix nearly everywhere.
+            if t.type == token.STRING:
+                s = t.string
+
+                # Preserve docstrings as string literals. This is inconsistent
+                # with regular unprefixed strings. However, the
+                # "from __future__" parsing (which allows a module docstring to
+                # exist before it) doesn't properly handle the docstring if it
+                # is b''' prefixed, leading to a SyntaxError. We leave all
+                # docstrings as unprefixed to avoid this. This means Mercurial
+                # components touching docstrings need to handle unicode,
+                # unfortunately.
+                if s[0:3] in ("'''", '"""'):
+                    yield t
+                    continue
+
+                # If the first character isn't a quote, it is likely a string
+                # prefixing character (such as 'b', 'u', or 'r'). Ignore.
+                if s[0] not in ("'", '"'):
+                    yield t
+                    continue
+
+                # String literal. Prefix to make a b'' string.
+                yield tokenize.TokenInfo(t.type, 'b%s' % s, t.start, t.end,
+                                          t.line)
+                continue
+
+            try:
+                nexttoken = tokens[i + 1]
+            except IndexError:
+                nexttoken = None
+
+            try:
+                prevtoken = tokens[i - 1]
+            except IndexError:
+                prevtoken = None
+
+            # This looks like a function call.
+            if (t.type == token.NAME and nexttoken and
+                nexttoken.type == token.OP and nexttoken.string == '('):
+                fn = t.string
+
+                # *attr() builtins don't accept byte strings to 2nd argument.
+                # Rewrite the token to include the unicode literal prefix so
+                # the string transformer above doesn't add the byte prefix.
+                if fn in ('getattr', 'setattr', 'hasattr', 'safehasattr'):
+                    try:
+                        # (NAME, 'getattr')
+                        # (OP, '(')
+                        # (NAME, 'foo')
+                        # (OP, ',')
+                        # (NAME|STRING, foo)
+                        st = tokens[i + 4]
+                        if (st.type == token.STRING and
+                            st.string[0] in ("'", '"')):
+                            rt = tokenize.TokenInfo(st.type, 'u%s' % st.string,
+                                                    st.start, st.end, st.line)
+                            tokens[i + 4] = rt
+                    except IndexError:
+                        pass
+
+                # .encode() and .decode() on str/bytes/unicode don't accept
+                # byte strings on Python 3. Rewrite the token to include the
+                # unicode literal prefix so the string transformer above doesn't
+                # add the byte prefix.
+                if (fn in ('encode', 'decode') and
+                    prevtoken.type == token.OP and prevtoken.string == '.'):
+                    # (OP, '.')
+                    # (NAME, 'encode')
+                    # (OP, '(')
+                    # (STRING, 'utf-8')
+                    # (OP, ')')
+                    try:
+                        st = tokens[i + 2]
+                        if (st.type == token.STRING and
+                            st.string[0] in ("'", '"')):
+                            rt = tokenize.TokenInfo(st.type, 'u%s' % st.string,
+                                                    st.start, st.end, st.line)
+                            tokens[i + 2] = rt
+                    except IndexError:
+                        pass
+
+            # Emit unmodified token.
+            yield t
+
+    # Header to add to bytecode files. This MUST be changed when
+    # ``replacetokens`` or any mechanism that changes semantics of module
+    # loading is changed. Otherwise cached bytecode may get loaded without
+    # the new transformation mechanisms applied.
+    BYTECODEHEADER = b'HG\x00\x01'
+
+    class hgloader(importlib.machinery.SourceFileLoader):
+        """Custom module loader that transforms source code.
+
+        When the source code is converted to a code object, we transform
+        certain patterns to be Python 3 compatible. This allows us to write code
+        that is natively Python 2 and compatible with Python 3 without
+        making the code excessively ugly.
+
+        We do this by transforming the token stream between parse and compile.
+
+        Implementing transformations invalidates caching assumptions made
+        by the built-in importer. The built-in importer stores a header on
+        saved bytecode files indicating the Python/bytecode version. If the
+        version changes, the cached bytecode is ignored. The Mercurial
+        transformations could change at any time. This means we need to check
+        that cached bytecode was generated with the current transformation
+        code or there could be a mismatch between cached bytecode and what
+        would be generated from this class.
+
+        We supplement the bytecode caching layer by wrapping ``get_data``
+        and ``set_data``. These functions are called when the
+        ``SourceFileLoader`` retrieves and saves bytecode cache files,
+        respectively. We simply add an additional header on the file. As
+        long as the version in this file is changed when semantics change,
+        cached bytecode should be invalidated when transformations change.
+
+        The added header has the form ``HG<VERSION>``. That is a literal
+        ``HG`` with 2 binary bytes indicating the transformation version.
+        """
+        def get_data(self, path):
+            data = super(hgloader, self).get_data(path)
+
+            if not path.endswith(tuple(importlib.machinery.BYTECODE_SUFFIXES)):
+                return data
+
+            # There should be a header indicating the Mercurial transformation
+            # version. If it doesn't exist or doesn't match the current version,
+            # we raise an OSError because that is what
+            # ``SourceFileLoader.get_code()`` expects when loading bytecode
+            # paths to indicate the cached file is "bad."
+            if data[0:2] != b'HG':
+                raise OSError('no hg header')
+            if data[0:4] != BYTECODEHEADER:
+                raise OSError('hg header version mismatch')
+
+            return data[4:]
+
+        def set_data(self, path, data, *args, **kwargs):
+            if path.endswith(tuple(importlib.machinery.BYTECODE_SUFFIXES)):
+                data = BYTECODEHEADER + data
+
+            return super(hgloader, self).set_data(path, data, *args, **kwargs)
+
+        def source_to_code(self, data, path):
+            """Perform token transformation before compilation."""
+            buf = io.BytesIO(data)
+            tokens = tokenize.tokenize(buf.readline)
+            data = tokenize.untokenize(replacetokens(list(tokens)))
+            # Python's built-in importer strips frames from exceptions raised
+            # for this code. Unfortunately, that mechanism isn't extensible
+            # and our frame will be blamed for the import failure. There
+            # are extremely hacky ways to do frame stripping. We haven't
+            # implemented them because they are very ugly.
+            return super(hgloader, self).source_to_code(data, path)
+
 # We automagically register our custom importer as a side-effect of loading.
 # This is necessary to ensure that any entry points are able to import
 # mercurial.* modules without having to perform this registration themselves.
-if not any(isinstance(x, hgimporter) for x in sys.meta_path):
+if sys.version_info[0] >= 3:
+    _importercls = hgpathentryfinder
+else:
+    _importercls = hgimporter
+if not any(isinstance(x, _importercls) for x in sys.meta_path):
     # meta_path is used before any implicit finders and before sys.path.
-    sys.meta_path.insert(0, hgimporter())
+    sys.meta_path.insert(0, _importercls())