comparison mercurial/server.py @ 37212:f09a2eab11cf

server: add an error feedback mechanism for when the daemon fails to launch. There's a recurring problem on Windows where `hg serve -d` will randomly fail to spawn a detached process. The reason for the failure is completely hidden, and it takes hours to get a single failure on my laptop. All this change does is redirect stdout/stderr of the child to a file until the lock file is freed, and then the parent dumps that output if the child fails to spawn. I chose to put the output into the lock file because that is always cleaned up. There's no way to report errors after that anyway. On Windows, killdaemons.py is roughly `kill -9`, so this ensures that junk won't pile up. This may end up being a case of EADDRINUSE. At least that's what I saw spit out a few times (among other odd errors and missing output on Windows). But I also managed to get the same thing on Fedora 26 by running test-hgwebdir.t with --loop -j10 for several hours. Running `netstat` immediately after killing that run printed a wall of sockets in the TIME_WAIT state, which were gone a couple of seconds later. I couldn't match up the ports that failed, because --loop doesn't print out the message about the port that was used. So maybe the fix is to rotate the use of HGPORT[12] in the tests. But let's collect some more data first.
author Matt Harbison <matt_harbison@yahoo.com>
date Wed, 28 Mar 2018 00:11:09 -0400
parents a8a902d7176e
children 73a60281a861
comparison
equal deleted inserted replaced
37211:77f9e95fe3c4 37212:f09a2eab11cf
27 ) 27 )
28 28
29 def runservice(opts, parentfn=None, initfn=None, runfn=None, logfile=None, 29 def runservice(opts, parentfn=None, initfn=None, runfn=None, logfile=None,
30 runargs=None, appendpid=False): 30 runargs=None, appendpid=False):
31 '''Run a command as a service.''' 31 '''Run a command as a service.'''
32
33 # When daemonized on Windows, redirect stdout/stderr to the lockfile (which
34 # gets cleaned up after the child is up and running), so that the parent can
35 # read and print the error if this child dies early. See 594dd384803c. On
36 # other platforms, the child can write to the parent's stdio directly, until
37 # it is redirected prior to runfn().
38 if pycompat.iswindows and opts['daemon_postexec']:
39 for inst in opts['daemon_postexec']:
40 if inst.startswith('unlink:'):
41 lockpath = inst[7:]
42 if os.path.exists(lockpath):
43 procutil.stdout.flush()
44 procutil.stderr.flush()
45
46 fd = os.open(lockpath,
47 os.O_WRONLY | os.O_APPEND | os.O_BINARY)
48 try:
49 os.dup2(fd, 1)
50 os.dup2(fd, 2)
51 finally:
52 os.close(fd)
32 53
33 def writepid(pid): 54 def writepid(pid):
34 if opts['pid_file']: 55 if opts['pid_file']:
35 if appendpid: 56 if appendpid:
36 mode = 'ab' 57 mode = 'ab'
59 break 80 break
60 def condfn(): 81 def condfn():
61 return not os.path.exists(lockpath) 82 return not os.path.exists(lockpath)
62 pid = procutil.rundetached(runargs, condfn) 83 pid = procutil.rundetached(runargs, condfn)
63 if pid < 0: 84 if pid < 0:
85 # If the daemonized process managed to write out an error msg,
86 # report it.
87 if pycompat.iswindows and os.path.exists(lockpath):
88 with open(lockpath) as log:
89 for line in log:
90 procutil.stderr.write(line)
64 raise error.Abort(_('child process failed to start')) 91 raise error.Abort(_('child process failed to start'))
65 writepid(pid) 92 writepid(pid)
66 finally: 93 finally:
67 util.tryunlink(lockpath) 94 util.tryunlink(lockpath)
68 if parentfn: 95 if parentfn:
79 if opts['daemon_postexec']: 106 if opts['daemon_postexec']:
80 try: 107 try:
81 os.setsid() 108 os.setsid()
82 except AttributeError: 109 except AttributeError:
83 pass 110 pass
111
112 lockpath = None
84 for inst in opts['daemon_postexec']: 113 for inst in opts['daemon_postexec']:
85 if inst.startswith('unlink:'): 114 if inst.startswith('unlink:'):
86 lockpath = inst[7:] 115 lockpath = inst[7:]
87 os.unlink(lockpath)
88 elif inst.startswith('chdir:'): 116 elif inst.startswith('chdir:'):
89 os.chdir(inst[6:]) 117 os.chdir(inst[6:])
90 elif inst != 'none': 118 elif inst != 'none':
91 raise error.Abort(_('invalid value for --daemon-postexec: %s') 119 raise error.Abort(_('invalid value for --daemon-postexec: %s')
92 % inst) 120 % inst)
105 if nullfd not in (0, 1, 2): 133 if nullfd not in (0, 1, 2):
106 os.close(nullfd) 134 os.close(nullfd)
107 if logfile and logfilefd not in (0, 1, 2): 135 if logfile and logfilefd not in (0, 1, 2):
108 os.close(logfilefd) 136 os.close(logfilefd)
109 137
138 # Only unlink after redirecting stdout/stderr, so Windows doesn't
139 # complain about a sharing violation.
140 if lockpath:
141 os.unlink(lockpath)
142
110 if runfn: 143 if runfn:
111 return runfn() 144 return runfn()
112 145
113 _cmdservicemap = { 146 _cmdservicemap = {
114 'chgunix': chgserver.chgunixservice, 147 'chgunix': chgserver.chgunixservice,