tests/svnxml.py
author Jun Wu <quark@fb.com>
Tue, 15 Nov 2016 20:25:51 +0000
changeset 30428 1156ec81f709
parent 28947 812eb3b7dc43
child 40216 c17d73bf6a4d
permissions -rw-r--r--
util: improve iterfile so it chooses code path wisely We have performance concerns on "iterfile" as it is 4X slower on normal files. While modern systems have the nice property that reading a "fast" (on-disk) file cannot be interrupted and should be made use of. This patch dumps the related knowledge in comments. And "iterfile" chooses code paths wisely: 1. If it's CPython 3, or PyPY, use the fast path. 2. If fp is a normal file, use the fast path. 3. If fp is not a normal file and CPython version >= 2.7.4, use the same workaround (4x slower) as before. 4. If fp is not a normal file and CPython version < 2.7.4, use another workaround (2x slower but may block longer then necessary) which basically re-invents the buffer + readline logic in Python. This will give us good confidence on both correctness and performance dealing with EINTR in iterfile(fp) for all known supported Python versions.

# Read the output of a "svn log --xml" command on stdin, parse it and
# print a subset of attributes common to all svn versions tested by
# hg.
from __future__ import absolute_import
import sys
import xml.dom.minidom

def xmltext(e):
    return ''.join(c.data for c
                   in e.childNodes
                   if c.nodeType == c.TEXT_NODE)

def parseentry(entry):
    e = {}
    e['revision'] = entry.getAttribute('revision')
    e['author'] = xmltext(entry.getElementsByTagName('author')[0])
    e['msg'] = xmltext(entry.getElementsByTagName('msg')[0])
    e['paths'] = []
    paths = entry.getElementsByTagName('paths')
    if paths:
        paths = paths[0]
        for p in paths.getElementsByTagName('path'):
            action = p.getAttribute('action')
            path = xmltext(p)
            frompath = p.getAttribute('copyfrom-path')
            fromrev = p.getAttribute('copyfrom-rev')
            e['paths'].append((path, action, frompath, fromrev))
    return e

def parselog(data):
    entries = []
    doc = xml.dom.minidom.parseString(data)
    for e in doc.getElementsByTagName('logentry'):
        entries.append(parseentry(e))
    return entries

def printentries(entries):
    fp = sys.stdout
    for e in entries:
        for k in ('revision', 'author', 'msg'):
            fp.write(('%s: %s\n' % (k, e[k])).encode('utf-8'))
        for path, action, fpath, frev in sorted(e['paths']):
            frominfo = ''
            if frev:
                frominfo = ' (from %s@%s)' % (fpath, frev)
            p = ' %s %s%s\n' % (action, path, frominfo)
            fp.write(p.encode('utf-8'))

if __name__ == '__main__':
    data = sys.stdin.read()
    entries = parselog(data)
    printentries(entries)