Mercurial > hg
view tests/svnxml.py @ 50400:95acba2c29f6
encoding: avoid quadratic time complexity when json-encoding non-UTF8 strings
Apparently the code uses "+=" with a bytes object, which is linear-time, so the
whole encoding is quadratic-time. This patch makes us use a bytearray object,
instead, which has a(n amortized-)constant-time append operation.
The encoding is still not particularly fast, but at least a 10MB file
takes tens of seconds, not many hours to encode.
author | Arseniy Alekseyev <aalekseyev@janestreet.com> |
---|---|
date | Mon, 06 Mar 2023 11:27:57 +0000 |
parents | 6000f5b25c9b |
children |
line wrap: on
line source
# Read the output of a "svn log --xml" command on stdin, parse it and # print a subset of attributes common to all svn versions tested by # hg. import sys import xml.dom.minidom def xmltext(e): return ''.join(c.data for c in e.childNodes if c.nodeType == c.TEXT_NODE) def parseentry(entry): e = {} e['revision'] = entry.getAttribute('revision') e['author'] = xmltext(entry.getElementsByTagName('author')[0]) e['msg'] = xmltext(entry.getElementsByTagName('msg')[0]) e['date'] = xmltext(entry.getElementsByTagName('date')[0]) e['paths'] = [] paths = entry.getElementsByTagName('paths') if paths: paths = paths[0] for p in paths.getElementsByTagName('path'): action = p.getAttribute('action').encode('utf-8') path = xmltext(p).encode('utf-8') frompath = p.getAttribute('copyfrom-path').encode('utf-8') fromrev = p.getAttribute('copyfrom-rev').encode('utf-8') e['paths'].append((path, action, frompath, fromrev)) return e def parselog(data): entries = [] doc = xml.dom.minidom.parseString(data) for e in doc.getElementsByTagName('logentry'): entries.append(parseentry(e)) return entries def printentries(entries): try: fp = sys.stdout.buffer except AttributeError: fp = sys.stdout for e in entries: for k in ('revision', 'author', 'date', 'msg'): fp.write(('%s: %s\n' % (k, e[k])).encode('utf-8')) for path, action, fpath, frev in sorted(e['paths']): frominfo = b'' if frev: frominfo = b' (from %s@%s)' % (fpath, frev) p = b' %s %s%s\n' % (action, path, frominfo) fp.write(p) if __name__ == '__main__': data = sys.stdin.read() entries = parselog(data) printentries(entries)