util: improve iterfile so it chooses code path wisely
authorJun Wu <quark@fb.com>
Tue, 15 Nov 2016 20:25:51 +0000
changeset 30418 1156ec81f709
parent 30417 854190becacb
child 30420 270b077d434b
util: improve iterfile so it chooses code path wisely We have performance concerns on "iterfile" as it is 4X slower on normal files. While modern systems have the nice property that reading a "fast" (on-disk) file cannot be interrupted and should be made use of. This patch dumps the related knowledge in comments. And "iterfile" chooses code paths wisely: 1. If it's CPython 3, or PyPY, use the fast path. 2. If fp is a normal file, use the fast path. 3. If fp is not a normal file and CPython version >= 2.7.4, use the same workaround (4x slower) as before. 4. If fp is not a normal file and CPython version < 2.7.4, use another workaround (2x slower but may block longer then necessary) which basically re-invents the buffer + readline logic in Python. This will give us good confidence on both correctness and performance dealing with EINTR in iterfile(fp) for all known supported Python versions.
mercurial/util.py
--- a/mercurial/util.py	Wed Nov 16 23:29:28 2016 -0500
+++ b/mercurial/util.py	Tue Nov 15 20:25:51 2016 +0000
@@ -24,10 +24,12 @@
 import hashlib
 import imp
 import os
+import platform as pyplatform
 import re as remod
 import shutil
 import signal
 import socket
+import stat
 import string
 import subprocess
 import sys
@@ -2208,10 +2210,77 @@
                             subsequent_indent=hangindent)
     return wrapper.fill(line).encode(encoding.encoding)
 
-def iterfile(fp):
-    """like fp.__iter__ but does not have issues with EINTR. Python 2.7.12 is
-    known to have such issues."""
-    return iter(fp.readline, '')
+if (pyplatform.python_implementation() == 'CPython' and
+    sys.version_info < (3, 0)):
+    # There is an issue in CPython that some IO methods do not handle EINTR
+    # correctly. The following table shows what CPython version (and functions)
+    # are affected (buggy: has the EINTR bug, okay: otherwise):
+    #
+    #                | < 2.7.4 | 2.7.4 to 2.7.12 | >= 3.0
+    #   --------------------------------------------------
+    #    fp.__iter__ | buggy   | buggy           | okay
+    #    fp.read*    | buggy   | okay [1]        | okay
+    #
+    # [1]: fixed by changeset 67dc99a989cd in the cpython hg repo.
+    #
+    # Here we workaround the EINTR issue for fileobj.__iter__. Other methods
+    # like "read*" are ignored for now, as Python < 2.7.4 is a minority.
+    #
+    # Although we can workaround the EINTR issue for fp.__iter__, it is slower:
+    # "for x in fp" is 4x faster than "for x in iter(fp.readline, '')" in
+    # CPython 2, because CPython 2 maintains an internal readahead buffer for
+    # fp.__iter__ but not other fp.read* methods.
+    #
+    # On modern systems like Linux, the "read" syscall cannot be interrupted
+    # when reading "fast" files like on-disk files. So the EINTR issue only
+    # affects things like pipes, sockets, ttys etc. We treat "normal" (S_ISREG)
+    # files approximately as "fast" files and use the fast (unsafe) code path,
+    # to minimize the performance impact.
+    if sys.version_info >= (2, 7, 4):
+        # fp.readline deals with EINTR correctly, use it as a workaround.
+        def _safeiterfile(fp):
+            return iter(fp.readline, '')
+    else:
+        # fp.read* are broken too, manually deal with EINTR in a stupid way.
+        # note: this may block longer than necessary because of bufsize.
+        def _safeiterfile(fp, bufsize=4096):
+            fd = fp.fileno()
+            line = ''
+            while True:
+                try:
+                    buf = os.read(fd, bufsize)
+                except OSError as ex:
+                    # os.read only raises EINTR before any data is read
+                    if ex.errno == errno.EINTR:
+                        continue
+                    else:
+                        raise
+                line += buf
+                if '\n' in buf:
+                    splitted = line.splitlines(True)
+                    line = ''
+                    for l in splitted:
+                        if l[-1] == '\n':
+                            yield l
+                        else:
+                            line = l
+                if not buf:
+                    break
+            if line:
+                yield line
+
+    def iterfile(fp):
+        fastpath = True
+        if type(fp) is file:
+            fastpath = stat.S_ISREG(os.fstat(fp.fileno()).st_mode)
+        if fastpath:
+            return fp
+        else:
+            return _safeiterfile(fp)
+else:
+    # PyPy and CPython 3 do not have the EINTR issue thus no workaround needed.
+    def iterfile(fp):
+        return fp
 
 def iterlines(iterator):
     for chunk in iterator: