comparison contrib/testparseutil.py @ 40093:726cfc47f17a

contrib: add an utility module to parse test scripts This patch centralizes the logic to pick up code fragments embedded in *.t script, in order to: - apply checking with patterns in check-code.py on such embedded code fragments Now, check-code.py completely ignores embedded code fragments. I'll post another patch series to check them. - replace similar code path in contrib/import-checker.py Current import-checker.py has problems below. Fixing each of them is a little difficult, because parsing logic and pattern strings are tightly coupled. - overlook (or mis-detect) the end of inline script in doctest style 8a8dd6e4a97a fixed a part of this issue, but not enough. - it overlooks inline script in doctest style at the end of file (and ignores invalid un-closed heredoc at the end of file, too) - it overlooks code fragment in styles below - "python <<EOF" (heredoc should be "cat > file <<EOF" style) - "cat > foobar.py << ANYLIMIT" (limit mark should be "EOF") - "cat << EOF > foobar.py" (filename should be placed before limit mark) - "cat >> foobar.py << EOF" (appending is ignored) - it is not extensible for other than python code fragments (e.g. shell script, hgrc file, and so on) This new module can detect python code fragments in styles below: - inline script in doctest style (starting by " >>> " line) - python invocation with heredoc script ("python <<EOF") - python script in heredoc style (redirected into ".py" file) As an example of extensibility of new module, this patch also contains implementation to pick up code fragment below. This will be useful to add additional restriction for them, for example. - shell script in heredoc style (redirected into ".sh" file) - hgrc configuration in heredoc style (redirected into hgrc or $HGRCPATH)
author FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
date Thu, 23 Aug 2018 12:25:54 +0900
parents
children 99b4c6d73a72
comparison
equal deleted inserted replaced
40092:58786930ea27 40093:726cfc47f17a
1 # testparseutil.py - utilities to parse test script for check tools
2 #
3 # Copyright 2018 FUJIWARA Katsunori <foozy@lares.dti.ne.jp> and others
4 #
5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version.
7
8 from __future__ import absolute_import, print_function
9
10 import abc
11 import re
12 import sys
13
14 ####################
15 # for Python3 compatibility (almost comes from mercurial/pycompat.py)
16
# True if this script is running on Python 3 (or later)
ispy3 = sys.version_info[0] >= 3
18
def identity(a):
    """Return the given object itself, without any conversion"""
    return a
21
def _rapply(f, xs):
    """Apply f on each non-container element of xs recursively

    list, set, tuple and dict (and subclasses of them) are rebuilt as
    the same type, after converting their elements (both keys and
    values, for dict). None is returned as it is.
    """
    if xs is None:
        # assume None means non-value of optional data
        return xs
    if isinstance(xs, dict):
        pairs = ((_rapply(f, key), _rapply(f, value))
                 for key, value in xs.items())
        return type(xs)(pairs)
    if isinstance(xs, (list, set, tuple)):
        return type(xs)(_rapply(f, item) for item in xs)
    return f(xs)
31
def rapply(f, xs):
    """Apply f on elements of xs recursively, and return the result

    If f is "identity", xs itself is returned without traversing it
    (fast path mainly for py2).
    """
    return xs if f is identity else _rapply(f, xs)
37
if not ispy3:
    # standard streams and "str" are used as they are
    stdin = sys.stdin
    stdout = sys.stdout
    stderr = sys.stderr

    bytestr = str
    sysstr = identity

    opentext = open
else:
    import builtins

    # TODO: .buffer might not exist if std streams were replaced; we'll need
    # a silly wrapper to make a bytes stream backed by a unicode one.
    stdin = sys.stdin.buffer
    stdout = sys.stdout.buffer
    stderr = sys.stderr.buffer

    def bytestr(s):
        """Encode s into bytes as latin1 (tiny version of pycompat.bytestr)
        """
        return s.encode('latin1')

    def sysstr(s):
        """Return s as "str", decoding it as latin-1 if it isn't "str" yet
        """
        return s if isinstance(s, builtins.str) else s.decode(u'latin-1')

    def opentext(f):
        """Open file f for reading in binary mode"""
        return open(f, 'rb')
67
def b2s(x):
    """Convert BYTES elements in "x" to SYSSTR recursively"""
    converted = rapply(sysstr, x)
    return converted
71
def writeout(data):
    """Write "data" in BYTES into stdout"""
    stdout.write(data)
75
def writeerr(data):
    """Write "data" in BYTES into stderr"""
    stderr.write(data)
79
80 ####################
81
# Intermediate base class created via abc.ABCMeta. "__metaclass__"
# attribute in a class body is honored only by Python 2, and silently
# ignored by Python 3. On the other hand, a metaclass is inherited from
# a base class on both, and this makes "abc.abstractmethod" effective
# regardless of Python version.
_abstractbase = abc.ABCMeta('_abstractbase', (object,), {})

class embeddedmatcher(_abstractbase):
    """Base class to detect embedded code fragments in *.t test script

    All methods other than "__init__" are abstract. A subclass can be
    instantiated, only if it implements all of them.
    """

    def __init__(self, desc):
        # description of this matcher, which is embedded into error
        # messages by "embedded()"
        self.desc = desc

    @abc.abstractmethod
    def startsat(self, line):
        """Examine whether embedded code starts at line

        This can return arbitrary object, and it is used as 'ctx' for
        subsequent method invocations. "embedded()" treats a false-ish
        value as "embedded code doesn't start at line".
        """

    @abc.abstractmethod
    def endsat(self, ctx, line):
        """Examine whether embedded code ends at line"""

    @abc.abstractmethod
    def isinside(self, ctx, line):
        """Examine whether line is inside embedded code, if not yet endsat
        """

    @abc.abstractmethod
    def ignores(self, ctx):
        """Examine whether detected embedded code should be ignored"""

    @abc.abstractmethod
    def filename(self, ctx):
        """Return filename of embedded code

        If filename isn't specified for embedded code explicitly, this
        returns None.
        """

    @abc.abstractmethod
    def codeatstart(self, ctx, line):
        """Return actual code at the start line of embedded code

        This might return None, if the start line doesn't contain
        actual code.
        """

    @abc.abstractmethod
    def codeatend(self, ctx, line):
        """Return actual code at the end line of embedded code

        This might return None, if the end line doesn't contain actual
        code.
        """

    @abc.abstractmethod
    def codeinside(self, ctx, line):
        """Return actual code at line inside embedded code"""
138
def embedded(basefile, lines, errors, matchers):
    """Extract embedded code fragments from given lines

    This is common parsing logic, which examines specified matchers on
    given lines. A line, at which exactly one matcher reports the
    start of embedded code, opens a fragment, and that matcher decides
    where the fragment ends.

    :basefile: a name of a file, from which lines to be parsed come
               (this is used only as a part of error messages)
    :lines: to be parsed (might be a value returned by "open(basefile)")
    :errors: an array, into which messages for detected error are stored
    :matchers: an array of embeddedmatcher objects

    This function yields '(filename, starts, ends, code)' tuple.

    :filename: a name of embedded code, if it is explicitly specified
               (e.g.  "foobar" of "cat >> foobar <<EOF").
               Otherwise, this is None
    :starts: line number (1-origin), at which embedded code starts (inclusive)
    :ends: line number (1-origin), at which embedded code ends (exclusive)
    :code: extracted embedded code, which is single-stringified

    >>> class ambigmatcher(object):
    ...     # mock matcher class to examine implementation of
    ...     # "ambiguous matching" corner case
    ...     def __init__(self, desc, matchfunc):
    ...         self.desc = desc
    ...         self.matchfunc = matchfunc
    ...     def startsat(self, line):
    ...         return self.matchfunc(line)
    >>> ambig1 = ambigmatcher(b'ambiguous #1',
    ...                       lambda l: l.startswith(b' $ cat '))
    >>> ambig2 = ambigmatcher(b'ambiguous #2',
    ...                       lambda l: l.endswith(b'<< EOF\\n'))
    >>> lines = [b' $ cat > foo.py << EOF\\n']
    >>> errors = []
    >>> matchers = [ambig1, ambig2]
    >>> list(t for t in embedded(b'<dummy>', lines, errors, matchers))
    []
    >>> b2s(errors)
    ['<dummy>:1: ambiguous line for "ambiguous #1", "ambiguous #2"']

    """
    active = None # matcher of the fragment being collected now, if any
    ctx = filename = code = startline = None # for pyflakes

    for lineno, line in enumerate(lines, 1):
        if not line.endswith(b'\n'):
            line += b'\n' # to normalize EOF line

        if active: # now, inside embedded code
            closing = active.endsat(ctx, line)
            if not closing and active.isinside(ctx, line):
                # an ordinary line inside embedded code
                code.append(active.codeinside(ctx, line))
                continue

            if closing:
                tail = active.codeatend(ctx, line)
                if tail is not None:
                    code.append(tail)
                if not active.ignores(ctx):
                    yield (filename, startline, lineno, b''.join(code))
            else:
                # this is an error of basefile
                # (if matchers are implemented correctly)
                errors.append(b'%s:%d: unexpected line for "%s"'
                              % (basefile, lineno, active.desc))
                # extracting by current matcher is stopped below,
                # because appearance of unexpected line might mean
                # that expected end-of-embedded-code line might never
                # appear

            # in both cases above, current line is examined below,
            # too, because it might start next fragment
            active = None

        # examine whether current line starts embedded code or not
        assert not active

        probed = ((m, m.startsat(line)) for m in matchers)
        candidates = [(m, c) for m, c in probed if c]
        if not candidates:
            continue
        if len(candidates) > 1:
            # this is an error of matchers, maybe
            descs = b', '.join([b'"%s"' % m.desc for m, c in candidates])
            errors.append(b'%s:%d: ambiguous line for %s' %
                          (basefile, lineno, descs))
            # omit extracting embedded code, because choosing
            # arbitrary matcher from matched ones might fail to
            # detect the end of embedded code as expected.
            continue

        active, ctx = candidates[0]
        filename = active.filename(ctx)
        head = active.codeatstart(ctx, line)
        if head is None:
            # actual code starts at the next line
            code = []
            startline = lineno + 1
        else:
            code = [head]
            startline = lineno

    if not active:
        return

    # examine whether EOF ends embedded code, because embedded code
    # isn't yet ended explicitly
    if active.endsat(ctx, b'\n'):
        tail = active.codeatend(ctx, b'\n')
        if tail is not None:
            code.append(tail)
        if not active.ignores(ctx):
            yield (filename, startline, lineno + 1, b''.join(code))
    else:
        # this is an error of basefile
        # (if matchers are implemented correctly)
        errors.append(b'%s:%d: unexpected end of file for "%s"'
                      % (basefile, lineno, active.desc))
253
# heredoc limit mark to ignore embedded code at check-code.py or so
heredocignorelimit = b'NO_CHECK_EOF'

# the pattern to match against cases below, and to return a limit mark
# string as 'limit' group (a quote character around the limit mark is
# captured as 'lquote' group, and the same one is required after it)
#
# - << LIMITMARK
# - << "LIMITMARK"
# - << 'LIMITMARK'
heredoclimitpat = br'\s*<<\s*(?P<lquote>["\']?)(?P<limit>\w+)(?P=lquote)'
264
class fileheredocmatcher(embeddedmatcher):
    """Detect "cat > FILE << LIMIT" style embedded code

    'ctx' of this matcher is a (filename, END-LINE-OF-EMBEDDED-CODE)
    tuple.

    >>> matcher = fileheredocmatcher(b'heredoc .py file', br'[^<]+\\.py')
    >>> b2s(matcher.startsat(b' $ cat > file.py << EOF\\n'))
    ('file.py', ' > EOF\\n')
    >>> b2s(matcher.startsat(b' $ cat >>file.py <<EOF\\n'))
    ('file.py', ' > EOF\\n')
    >>> b2s(matcher.startsat(b' $ cat> \\x27any file.py\\x27<< "EOF"\\n'))
    ('any file.py', ' > EOF\\n')
    >>> b2s(matcher.startsat(b" $ cat > file.py << 'ANYLIMIT'\\n"))
    ('file.py', ' > ANYLIMIT\\n')
    >>> b2s(matcher.startsat(b' $ cat<<ANYLIMIT>"file.py"\\n'))
    ('file.py', ' > ANYLIMIT\\n')
    >>> start = b' $ cat > file.py << EOF\\n'
    >>> ctx = matcher.startsat(start)
    >>> matcher.codeatstart(ctx, start)
    >>> b2s(matcher.filename(ctx))
    'file.py'
    >>> matcher.ignores(ctx)
    False
    >>> inside = b' > foo = 1\\n'
    >>> matcher.endsat(ctx, inside)
    False
    >>> matcher.isinside(ctx, inside)
    True
    >>> b2s(matcher.codeinside(ctx, inside))
    'foo = 1\\n'
    >>> end = b' > EOF\\n'
    >>> matcher.endsat(ctx, end)
    True
    >>> matcher.codeatend(ctx, end)
    >>> matcher.endsat(ctx, b' > EOFEOF\\n')
    False
    >>> ctx = matcher.startsat(b' $ cat > file.py << NO_CHECK_EOF\\n')
    >>> matcher.ignores(ctx)
    True
    """
    _prefix = b' > '

    def __init__(self, desc, namepat):
        super(fileheredocmatcher, self).__init__(desc)

        # wrap given NAMEPAT into the redirection pattern, which
        # matches against cases below (and ">>" variants), and returns
        # a target filename string as 'name' group
        #
        # - > NAMEPAT
        # - > "NAMEPAT"
        # - > 'NAMEPAT'
        redirect = (br'\s*>>?\s*(?P<nquote>["\']?)(?P<name>%s)(?P=nquote)'
                    % namepat)
        invocation = br' \$ \s*cat'
        self._fileres = [
            # "cat > NAME << LIMIT" case
            re.compile(invocation + redirect + heredoclimitpat),
            # "cat << LIMIT > NAME" case
            re.compile(invocation + heredoclimitpat + redirect),
        ]

    def startsat(self, line):
        # the first pattern matching against line decides ctx
        for filere in self._fileres:
            found = filere.match(line)
            if found is None:
                continue
            endline = b' > %s\n' % found.group('limit')
            return (found.group('name'), endline)
        return None

    def endsat(self, ctx, line):
        # embedded code ends at the line consisting of only limit mark
        return line == ctx[1]

    def isinside(self, ctx, line):
        return line.startswith(self._prefix)

    def ignores(self, ctx):
        # ignored, if the limit mark is the special one
        return ctx[1] == b' > %s\n' % heredocignorelimit

    def filename(self, ctx):
        name = ctx[0]
        return name

    def codeatstart(self, ctx, line):
        return None # start line is a command line, not embedded code

    def codeatend(self, ctx, line):
        return None # end line is a limit mark, not embedded code

    def codeinside(self, ctx, line):
        # actual code follows after the prefix
        return line[len(self._prefix):]
352
353 ####
354 # for embedded python script
355
class pydoctestmatcher(embeddedmatcher):
    """Detect ">>> code" style embedded python code

    'ctx' of this matcher is always "True".

    >>> matcher = pydoctestmatcher()
    >>> startline = b' >>> foo = 1\\n'
    >>> matcher.startsat(startline)
    True
    >>> matcher.startsat(b' ... foo = 1\\n')
    False
    >>> ctx = matcher.startsat(startline)
    >>> matcher.filename(ctx)
    >>> matcher.ignores(ctx)
    False
    >>> b2s(matcher.codeatstart(ctx, startline))
    'foo = 1\\n'
    >>> inside = b' >>> foo = 1\\n'
    >>> matcher.endsat(ctx, inside)
    False
    >>> matcher.isinside(ctx, inside)
    True
    >>> b2s(matcher.codeinside(ctx, inside))
    'foo = 1\\n'
    >>> inside = b' ... foo = 1\\n'
    >>> matcher.endsat(ctx, inside)
    False
    >>> matcher.isinside(ctx, inside)
    True
    >>> b2s(matcher.codeinside(ctx, inside))
    'foo = 1\\n'
    >>> inside = b' expected output\\n'
    >>> matcher.endsat(ctx, inside)
    False
    >>> matcher.isinside(ctx, inside)
    True
    >>> b2s(matcher.codeinside(ctx, inside))
    '\\n'
    >>> inside = b' \\n'
    >>> matcher.endsat(ctx, inside)
    False
    >>> matcher.isinside(ctx, inside)
    True
    >>> b2s(matcher.codeinside(ctx, inside))
    '\\n'
    >>> end = b' $ foo bar\\n'
    >>> matcher.endsat(ctx, end)
    True
    >>> matcher.codeatend(ctx, end)
    >>> end = b'\\n'
    >>> matcher.endsat(ctx, end)
    True
    >>> matcher.codeatend(ctx, end)
    """
    _prefix = b' >>> '
    _prefixre = re.compile(br' (>>>|\.\.\.) ')

    # A line, which matches against _outputre but not _prefixre, is
    # "an expected output line" (= not a part of code fragment).
    #
    # Strictly speaking, a line matching against "(#if|#else|#endif)"
    # is also treated similarly in "inline python code" semantics by
    # run-tests.py. But "directive line inside inline python code"
    # should be rejected by Mercurial reviewers. Therefore, this
    # regexp does not match against such directive lines.
    _outputre = re.compile(br' $| [^$]')

    def __init__(self):
        super(pydoctestmatcher, self).__init__(b"doctest style python code")

    def startsat(self, line):
        # only ">>>" line (not "..." one) starts embedded code
        return line.startswith(self._prefix)

    def endsat(self, ctx, line):
        # ended by a line, which is neither code nor expected output
        return not self._prefixre.match(line) and not self._outputre.match(line)

    def isinside(self, ctx, line):
        return True # every line is inside, until endsat tells the end

    def ignores(self, ctx):
        return False # this style is never ignored

    def filename(self, ctx):
        return None # this style has no filename

    def codeatstart(self, ctx, line):
        # actual code follows after the prefix ' >>> '
        return line[len(self._prefix):]

    def codeatend(self, ctx, line):
        return None # end line is outside of embedded code

    def codeinside(self, ctx, line):
        if self._prefixre.match(line) is None:
            # an expected output line is treated as an empty line
            return b'\n'
        # ' >>> ' and ' ... ' have the same length
        return line[len(self._prefix):]
450
class pyheredocmatcher(embeddedmatcher):
    """Detect "python << LIMIT" style embedded python code

    'ctx' of this matcher is END-LINE-OF-EMBEDDED-CODE.

    >>> matcher = pyheredocmatcher()
    >>> b2s(matcher.startsat(b' $ python << EOF\\n'))
    ' > EOF\\n'
    >>> b2s(matcher.startsat(b' $ $PYTHON <<EOF\\n'))
    ' > EOF\\n'
    >>> b2s(matcher.startsat(b' $ "$PYTHON"<< "EOF"\\n'))
    ' > EOF\\n'
    >>> b2s(matcher.startsat(b" $ $PYTHON << 'ANYLIMIT'\\n"))
    ' > ANYLIMIT\\n'
    >>> matcher.startsat(b' $ "$PYTHON" < EOF\\n')
    >>> start = b' $ python << EOF\\n'
    >>> ctx = matcher.startsat(start)
    >>> matcher.codeatstart(ctx, start)
    >>> matcher.filename(ctx)
    >>> matcher.ignores(ctx)
    False
    >>> inside = b' > foo = 1\\n'
    >>> matcher.endsat(ctx, inside)
    False
    >>> matcher.isinside(ctx, inside)
    True
    >>> b2s(matcher.codeinside(ctx, inside))
    'foo = 1\\n'
    >>> end = b' > EOF\\n'
    >>> matcher.endsat(ctx, end)
    True
    >>> matcher.codeatend(ctx, end)
    >>> matcher.endsat(ctx, b' > EOFEOF\\n')
    False
    >>> ctx = matcher.startsat(b' $ python << NO_CHECK_EOF\\n')
    >>> matcher.ignores(ctx)
    True
    """
    _prefix = b' > '

    # python invocation, which is followed by heredoc redirection
    _startre = re.compile(br' \$ (\$PYTHON|"\$PYTHON"|python).*' +
                          heredoclimitpat)

    def __init__(self):
        super(pyheredocmatcher, self).__init__(b"heredoc python invocation")

    def startsat(self, line):
        found = self._startre.match(line)
        if found is None:
            return None
        # the line, at which embedded code ends, is used as ctx
        return b' > %s\n' % found.group('limit')

    def endsat(self, ctx, line):
        # embedded code ends at the line consisting of only limit mark
        return line == ctx

    def isinside(self, ctx, line):
        return line.startswith(self._prefix)

    def ignores(self, ctx):
        # ignored, if the limit mark is the special one
        return ctx == b' > %s\n' % heredocignorelimit

    def filename(self, ctx):
        return None # this style has no filename

    def codeatstart(self, ctx, line):
        return None # start line is a command line, not embedded code

    def codeatend(self, ctx, line):
        return None # end line is a limit mark, not embedded code

    def codeinside(self, ctx, line):
        # actual code follows after the prefix
        return line[len(self._prefix):]
521
# matchers to detect embedded python code
_pymatchers = [
    pydoctestmatcher(),
    pyheredocmatcher(),
    # filename pattern is '[^<]+' based instead of '\S+' based, in
    # order to match against paths including whitespaces
    fileheredocmatcher(b'heredoc .py file', br'[^<]+\.py'),
]

def pyembedded(basefile, lines, errors):
    """Pick embedded python code fragments up from given lines

    Arguments and yielded values are same as ones of "embedded()".
    """
    return embedded(basefile, lines, errors, matchers=_pymatchers)
532
533 ####
534 # for embedded shell script
535
# matchers to detect embedded shell script
_shmatchers = [
    # filename pattern is '[^<]+' based instead of '\S+' based, in
    # order to match against paths including whitespaces
    fileheredocmatcher(b'heredoc .sh file', br'[^<]+\.sh'),
]

def shembedded(basefile, lines, errors):
    """Pick embedded shell script fragments up from given lines

    Arguments and yielded values are same as ones of "embedded()".
    """
    return embedded(basefile, lines, errors, matchers=_shmatchers)
544
545 ####
546 # for embedded hgrc configuration
547
# matchers to detect embedded hgrc configuration
_hgrcmatchers = [
    # filename pattern is '[^/<]+' based instead of '\S+' based, in
    # order to match against paths including whitespaces
    fileheredocmatcher(b'heredoc hgrc file',
                       br'(([^/<]+/)+hgrc|\$HGRCPATH|\${HGRCPATH})'),
]

def hgrcembedded(basefile, lines, errors):
    """Pick embedded hgrc configuration fragments up from given lines

    Arguments and yielded values are same as ones of "embedded()".
    """
    return embedded(basefile, lines, errors, matchers=_hgrcmatchers)
557
558 ####
559
if __name__ == "__main__":
    # "sys" is already imported at the top of this file
    import optparse

    def showembedded(basefile, lines, embeddedfunc, opts):
        """Show embedded code fragments detected in lines by embeddedfunc

        Start/end of each fragment are written into stdout (with
        actual code, if opts.verbose), and detected errors are written
        into stderr. This returns the number of detected errors.
        """
        errors = []
        for name, starts, ends, code in embeddedfunc(basefile, lines, errors):
            if not name:
                name = b'<anonymous>'
            writeout(b"%s:%d: %s starts\n" % (basefile, starts, name))
            if opts.verbose and code:
                writeout(b" |%s\n" % b"\n |".join(code.splitlines()))
            writeout(b"%s:%d: %s ends\n" % (basefile, ends, name))
        for e in errors:
            writeerr(b"%s\n" % e)
        return len(errors)

    def applyembedded(args, embeddedfunc, opts):
        """Apply embeddedfunc on each file in args (or stdin, if empty)

        This returns 1, if any error is detected. Otherwise, 0.
        """
        ret = 0
        if args:
            for f in args:
                with opentext(f) as fp:
                    if showembedded(bytestr(f), fp, embeddedfunc, opts):
                        ret = 1
        else:
            lines = stdin.readlines()
            if showembedded(b'<stdin>', lines, embeddedfunc, opts):
                ret = 1
        return ret

    # map from command name to (description, function) tuple
    commands = {}
    def command(name, desc):
        """Decorator to register a function as the command named name"""
        def wrap(func):
            commands[name] = (desc, func)
            # return func, in order to keep the decorated name bound
            # to the function (instead of None)
            return func
        return wrap

    @command("pyembedded", "detect embedded python script")
    def pyembeddedcmd(args, opts):
        return applyembedded(args, pyembedded, opts)

    @command("shembedded", "detect embedded shell script")
    def shembeddedcmd(args, opts):
        return applyembedded(args, shembedded, opts)

    @command("hgrcembedded", "detect embedded hgrc configuration")
    def hgrcembeddedcmd(args, opts):
        return applyembedded(args, hgrcembedded, opts)

    availablecommands = "\n".join([" - %s: %s" % (key, value[0])
                                   for key, value in commands.items()])

    parser = optparse.OptionParser("""%prog COMMAND [file ...]

Pick up embedded code fragments from given file(s) or stdin, and list
up start/end lines of them in standard compiler format
("FILENAME:LINENO:").

Available commands are:
""" + availablecommands + """
""")
    parser.add_option("-v", "--verbose",
                      help="enable additional output (e.g. actual code)",
                      action="store_true")
    (opts, args) = parser.parse_args()

    # the first argument chooses the command, and the rest of them are
    # files to be examined
    if not args or args[0] not in commands:
        parser.print_help()
        sys.exit(255)

    sys.exit(commands[args[0]][1](args[1:], opts))
629
630 sys.exit(commands[args[0]][1](args[1:], opts))