comparison mercurial/util.py @ 30418:1156ec81f709

util: improve iterfile so it chooses code path wisely. We have performance concerns about "iterfile", as it is 4x slower on normal files. Modern systems have the nice property that reading a "fast" (on-disk) file cannot be interrupted, and we should make use of that. This patch records the related knowledge in comments, and makes "iterfile" choose code paths wisely: 1. If it's CPython 3 or PyPy, use the fast path. 2. If fp is a normal file, use the fast path. 3. If fp is not a normal file and the CPython version is >= 2.7.4, use the same workaround (4x slower) as before. 4. If fp is not a normal file and the CPython version is < 2.7.4, use another workaround (2x slower, and it may block longer than necessary), which basically re-invents the buffer + readline logic in Python. This gives us good confidence in both correctness and performance when dealing with EINTR in iterfile(fp) on all known supported Python versions.
author Jun Wu <quark@fb.com>
date Tue, 15 Nov 2016 20:25:51 +0000
parents 854190becacb
children 64d7275445d0
comparison
equal deleted inserted replaced
30417:854190becacb 30418:1156ec81f709
22 import errno 22 import errno
23 import gc 23 import gc
24 import hashlib 24 import hashlib
25 import imp 25 import imp
26 import os 26 import os
27 import platform as pyplatform
27 import re as remod 28 import re as remod
28 import shutil 29 import shutil
29 import signal 30 import signal
30 import socket 31 import socket
32 import stat
31 import string 33 import string
32 import subprocess 34 import subprocess
33 import sys 35 import sys
34 import tempfile 36 import tempfile
35 import textwrap 37 import textwrap
2206 wrapper = MBTextWrapper(width=width, 2208 wrapper = MBTextWrapper(width=width,
2207 initial_indent=initindent, 2209 initial_indent=initindent,
2208 subsequent_indent=hangindent) 2210 subsequent_indent=hangindent)
2209 return wrapper.fill(line).encode(encoding.encoding) 2211 return wrapper.fill(line).encode(encoding.encoding)
2210 2212
2211 def iterfile(fp): 2213 if (pyplatform.python_implementation() == 'CPython' and
2212 """like fp.__iter__ but does not have issues with EINTR. Python 2.7.12 is 2214 sys.version_info < (3, 0)):
2213 known to have such issues.""" 2215 # There is an issue in CPython that some IO methods do not handle EINTR
2214 return iter(fp.readline, '') 2216 # correctly. The following table shows what CPython version (and functions)
2217 # are affected (buggy: has the EINTR bug, okay: otherwise):
2218 #
2219 # | < 2.7.4 | 2.7.4 to 2.7.12 | >= 3.0
2220 # --------------------------------------------------
2221 # fp.__iter__ | buggy | buggy | okay
2222 # fp.read* | buggy | okay [1] | okay
2223 #
2224 # [1]: fixed by changeset 67dc99a989cd in the cpython hg repo.
2225 #
2226 # Here we workaround the EINTR issue for fileobj.__iter__. Other methods
2227 # like "read*" are ignored for now, as Python < 2.7.4 is a minority.
2228 #
2229 # Although we can workaround the EINTR issue for fp.__iter__, it is slower:
2230 # "for x in fp" is 4x faster than "for x in iter(fp.readline, '')" in
2231 # CPython 2, because CPython 2 maintains an internal readahead buffer for
2232 # fp.__iter__ but not other fp.read* methods.
2233 #
2234 # On modern systems like Linux, the "read" syscall cannot be interrupted
2235 # when reading "fast" files like on-disk files. So the EINTR issue only
2236 # affects things like pipes, sockets, ttys etc. We treat "normal" (S_ISREG)
2237 # files approximately as "fast" files and use the fast (unsafe) code path,
2238 # to minimize the performance impact.
2239 if sys.version_info >= (2, 7, 4):
2240 # fp.readline deals with EINTR correctly, use it as a workaround.
2241 def _safeiterfile(fp):
2242 return iter(fp.readline, '')
2243 else:
2244 # fp.read* are broken too, manually deal with EINTR in a stupid way.
2245 # note: this may block longer than necessary because of bufsize.
2246 def _safeiterfile(fp, bufsize=4096):
2247 fd = fp.fileno()
2248 line = ''
2249 while True:
2250 try:
2251 buf = os.read(fd, bufsize)
2252 except OSError as ex:
2253 # os.read only raises EINTR before any data is read
2254 if ex.errno == errno.EINTR:
2255 continue
2256 else:
2257 raise
2258 line += buf
2259 if '\n' in buf:
2260 splitted = line.splitlines(True)
2261 line = ''
2262 for l in splitted:
2263 if l[-1] == '\n':
2264 yield l
2265 else:
2266 line = l
2267 if not buf:
2268 break
2269 if line:
2270 yield line
2271
2272 def iterfile(fp):
2273 fastpath = True
2274 if type(fp) is file:
2275 fastpath = stat.S_ISREG(os.fstat(fp.fileno()).st_mode)
2276 if fastpath:
2277 return fp
2278 else:
2279 return _safeiterfile(fp)
2280 else:
2281 # PyPy and CPython 3 do not have the EINTR issue thus no workaround needed.
2282 def iterfile(fp):
2283 return fp
2215 2284
def iterlines(iterator):
    """yield every line of every chunk produced by iterator

    Each chunk is broken up with its splitlines() method, so the yielded
    lines carry no line terminators and an empty chunk yields nothing.
    """
    for block in iterator:
        pieces = block.splitlines()
        for piece in pieces:
            yield piece