comparison mercurial/patch.py @ 46020:210f9b8d7bbd stable

diff: do not concatenate immutable bytes while building a/b bodies (issue6445) Use bytearray instead. I don't know what's changed since Python 2, but bytes concatenation is 100x slower on Python 3. % python2.7 -m timeit -s "s = b''" "for i in range(10000): s += b'line'" 1000 loops, best of 3: 321 usec per loop % python3.9 -m timeit -s "s = b''" "for i in range(10000): s += b'line'" 5 loops, best of 5: 39.2 msec per loop Benchmark using tailwind.css (measuring the fast path, a is empty): % HGRCPATH=/dev/null python2.7 ./hg log -R /tmp/issue6445 -p --time \ --color=always --config diff.word-diff=true >/dev/null (prev) time: real 1.580 secs (user 1.560+0.000 sys 0.020+0.000) (this) time: real 1.610 secs (user 1.570+0.000 sys 0.030+0.000) % HGRCPATH=/dev/null python3.9 ./hg log -R /tmp/issue6445 -p --time \ --color=always --config diff.word-diff=true >/dev/null (prev) time: real 114.500 secs (user 114.460+0.000 sys 0.030+0.000) (this) time: real 2.180 secs (user 2.140+0.000 sys 0.040+0.000) Benchmark using random tabular text data (not the fast path): % dd if=/dev/urandom bs=1k count=1000 | hexdump -v -e '16/1 "%3u," "\n"' > ttf % hg ci -ma % dd if=/dev/urandom bs=1k count=1000 | hexdump -v -e '16/1 "%3u," "\n"' > ttf % hg ci -mb % HGRCPATH=/dev/null python2.7 ./hg log -R /tmp/issue6445 -p --time \ --color=always --config diff.word-diff=true >/dev/null (prev) time: real 3.240 secs (user 3.040+0.000 sys 0.200+0.000) (this) time: real 3.230 secs (user 3.070+0.000 sys 0.160+0.000) % HGRCPATH=/dev/null python3.9 ./hg log -R /tmp/issue6445 -p --time \ --color=always --config diff.word-diff=true >/dev/null (prev) time: real 44.130 secs (user 43.850+0.000 sys 0.270+0.000) (this) time: real 4.170 secs (user 3.850+0.000 sys 0.310+0.000)
author Yuya Nishihara <yuya@tcha.org>
date Sat, 28 Nov 2020 11:15:54 +0900
parents 10f48720ef95
children 2cf61e66c6d0
comparison
equal deleted inserted replaced
46019:fdd54a876213 46020:210f9b8d7bbd
2729 2729
2730 2730
2731 def diffsinglehunkinline(hunklines): 2731 def diffsinglehunkinline(hunklines):
2732 """yield tokens for a list of lines in a single hunk, with inline colors""" 2732 """yield tokens for a list of lines in a single hunk, with inline colors"""
2733 # prepare deleted, and inserted content 2733 # prepare deleted, and inserted content
2734 a = b'' 2734 a = bytearray()
2735 b = b'' 2735 b = bytearray()
2736 for line in hunklines: 2736 for line in hunklines:
2737 if line[0:1] == b'-': 2737 if line[0:1] == b'-':
2738 a += line[1:] 2738 a += line[1:]
2739 elif line[0:1] == b'+': 2739 elif line[0:1] == b'+':
2740 b += line[1:] 2740 b += line[1:]
2744 if not a or not b: 2744 if not a or not b:
2745 for t in diffsinglehunk(hunklines): 2745 for t in diffsinglehunk(hunklines):
2746 yield t 2746 yield t
2747 return 2747 return
2748 # re-split the content into words 2748 # re-split the content into words
2749 al = wordsplitter.findall(a) 2749 al = wordsplitter.findall(bytes(a))
2750 bl = wordsplitter.findall(b) 2750 bl = wordsplitter.findall(bytes(b))
2751 # re-arrange the words to lines since the diff algorithm is line-based 2751 # re-arrange the words to lines since the diff algorithm is line-based
2752 aln = [s if s == b'\n' else s + b'\n' for s in al] 2752 aln = [s if s == b'\n' else s + b'\n' for s in al]
2753 bln = [s if s == b'\n' else s + b'\n' for s in bl] 2753 bln = [s if s == b'\n' else s + b'\n' for s in bl]
2754 an = b''.join(aln) 2754 an = b''.join(aln)
2755 bn = b''.join(bln) 2755 bn = b''.join(bln)