worker: avoid potential partial write of pickled data
Previously, the code wrote the pickled data using os.write(). However,
os.write() can write fewer bytes than were passed to it. To trigger the
problem, the pickled data had to be larger than 2147479552 bytes on my system.
Instead, open a file object and pass it to pickle.dump(). This also has the
advantage that it doesn’t buffer the whole pickled data in memory.
Note that the opened file must be buffered: pickle doesn’t support unbuffered
streams, because an unbuffered stream’s write() method might write fewer bytes
than were passed to it (like os.write()), whereas pickle.dump() relies on all
bytes being written (see https://github.com/python/cpython/issues/93050).
A side effect of using a file object and a with statement is that wfd is now
closed explicitly, whereas before it seems to have been closed implicitly at
process exit.
#!/usr/bin/env python3
#
# generate-branchy-bundle - generate a bundle for a "large" branchy repository
#
# Copyright 2018 Octobus, contact@octobus.net
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
#
# This script generates a repository suitable for testing delta computation
# strategies.
#
# The repository updates a single "large" file many times. One fixed part of
# the file always gets updated while the rest of the lines get updated over
# time. These updates happen over many topological branches, some getting
# merged back.
#
# Running with `chg` in your path and `CHGHG` set is recommended for speed.
import hashlib
import os
import shutil
import subprocess
import sys
import tempfile
# file name of the generated bundle (joined to the cache directory in the
# script entry point)
BUNDLE_NAME = 'big-file-churn.hg'

# constants for generating the repository
#
# number of commits created on top of the initial commit
NB_CHANGESET = 5000
# period (in commits) used to decide when to merge with 'min(head())'
PERIOD_MERGING = 8
# period (in commits) used to decide when to move the working copy back in
# history before committing
PERIOD_BRANCHING = 7
# when moving back, the working copy is updated to
# ".~(MOVE_BACK_MIN + idx % MOVE_BACK_RANGE)"
MOVE_BACK_MIN = 3
MOVE_BACK_RANGE = 5

# constants for generating the large file we keep updating
#
# At each revision, the beginning of the file changes,
# and a set of other lines changes too.
FILENAME = 'SPARSE-REVLOG-TEST-FILE'
# number of lines generated for the file
NB_LINES = 10500
# lines whose index is not greater than this value are rewritten every time
ALWAYS_CHANGE_LINES = 500
# past that point, a line is rewritten only when (line index - iteration
# index) is a multiple of this value
OTHER_CHANGES = 300
def nextcontent(previous_content):
    """utility to produce a new file content from the previous one

    The new content is the hexadecimal MD5 digest of ``previous_content``.

    ``previous_content`` may be ``bytes`` or ``str``. On Python 3, hashlib
    only accepts bytes-like input, so text is encoded as UTF-8 before being
    hashed. The digest is always returned as a ``str``.
    """
    if isinstance(previous_content, str):
        # hashlib raises TypeError when handed text on Python 3
        previous_content = previous_content.encode('utf-8')
    return hashlib.md5(previous_content).hexdigest()
def filecontent(iteridx, oldcontent):
    """generate a new file content

    The content is generated according the iteration index and previous
    content

    ``iteridx`` is the iteration index, or None for the initial call.
    ``oldcontent`` is the list of lines (bytes, newline included) of the
    previous version of the file, or None when there is no previous version.

    This is a generator yielding NB_LINES lines as ``bytes``, each one
    terminated by a newline, so the result can be written to a file opened
    in binary mode and mixed with lines read back in binary mode.
    """
    # initial call
    if iteridx is None:
        current = b''
    else:
        current = str(iteridx).encode('ascii')

    for idx in range(NB_LINES):
        do_change_line = True
        # Without previous content every line has to be generated. Otherwise
        # lines up to index ALWAYS_CHANGE_LINES are always regenerated, and
        # later lines only when their distance to the iteration index is a
        # multiple of OTHER_CHANGES.
        if oldcontent is not None and ALWAYS_CHANGE_LINES < idx:
            do_change_line = not ((idx - iteridx) % OTHER_CHANGES)

        if do_change_line:
            to_write = current + b'\n'
            current = nextcontent(current)
            if not isinstance(current, bytes):
                # the digest may come back as text; keep working with bytes
                # so every yielded line has the same type
                current = current.encode('ascii')
        else:
            to_write = oldcontent[idx]
        yield to_write
def updatefile(filename, idx):
    """rewrite <filename> with the content matching iteration <idx>

    When <idx> is None the file is created from scratch. Otherwise the
    current lines of the file are read first (in binary mode) and handed to
    filecontent() so that unchanged lines can be carried over.
    """
    previous_lines = None
    if idx is not None:
        with open(filename, 'rb') as source:
            previous_lines = source.readlines()

    with open(filename, 'wb') as sink:
        sink.writelines(filecontent(idx, previous_lines))
def hg(command, *args):
    """run mercurial <command> with <args> in a subprocess

    ``chg`` is used as the executable when ``CHGHG`` is set in the
    environment, ``hg`` otherwise. The command always runs with ``--quiet``
    and an empty ``HGRCPATH``.

    The value of ``subprocess.check_call`` is returned; that function raises
    ``subprocess.CalledProcessError`` when the command exits with a non-zero
    status.
    """
    child_env = os.environ.copy()
    executable = 'chg' if 'CHGHG' in child_env else 'hg'
    child_env['HGRCPATH'] = ''

    # extra options inserted between the command name and the caller's args
    command_options = {
        # reproducible commit metadata
        'commit': ['--date', '0 0', '--user', 'test'],
        # avoid conflicts by picking the local variant
        'merge': ['--tool', ':merge-local'],
    }

    argv = [executable, '--quiet', command]
    argv += command_options.get(command, [])
    argv += args
    return subprocess.check_call(argv, env=child_env)
def run(target):
    """generate the repository and write its bundle to <target>

    The repository is built in a temporary directory, which becomes the
    current working directory of the process and is removed before
    returning, whether or not the generation succeeded.

    Besides the bundle itself, the hexadecimal MD5 digest of the bundle is
    written (followed by a newline) to ``<target>.md5``.

    Returns 0. Errors from the mercurial commands propagate as exceptions.
    """
    tmpdir = tempfile.mkdtemp(prefix='tmp-hg-test-big-file-bundle-')
    try:
        os.chdir(tmpdir)
        hg('init')
        updatefile(FILENAME, None)
        hg('commit', '--addremove', '--message', 'initial commit')
        for idx in range(1, NB_CHANGESET + 1):
            if sys.stdout.isatty():
                print("generating commit #%d/%d" % (idx, NB_CHANGESET))
            if (idx % PERIOD_BRANCHING) == 0:
                # go back a few changesets so the next commit adds a new head
                move_back = MOVE_BACK_MIN + (idx % MOVE_BACK_RANGE)
                hg('update', ".~%d" % move_back)
            if (idx % PERIOD_MERGING) == 0:
                hg('merge', 'min(head())')
            updatefile(FILENAME, idx)
            hg('commit', '--message', 'commit #%d' % idx)
        hg('bundle', '--all', target, '--config', 'devel.bundle.delta=p1')
        with open(target, 'rb') as bundle:
            data = bundle.read()
            digest = hashlib.md5(data).hexdigest()
        with open(target + '.md5', 'wb') as md5file:
            # the file is opened in binary mode, so the text digest must be
            # encoded before being written
            md5file.write(digest.encode('ascii') + b'\n')
        if sys.stdout.isatty():
            print('bundle generated at "%s" md5: %s' % (target, digest))
    finally:
        shutil.rmtree(tmpdir)
    return 0
if __name__ == '__main__':
    # The bundle goes into the "cache" directory located next to the
    # directory holding this script.
    script_dir = os.path.dirname(sys.argv[0])
    bundle_path = os.path.join(
        os.path.realpath(script_dir), os.pardir, 'cache', BUNDLE_NAME
    )
    sys.exit(run(bundle_path))