Mercurial > hg
changeset 9156:c9c7e8cdac9c
minimal reStructuredText parser
author | Martin Geisler <mg@lazybytes.net> |
---|---|
date | Thu, 16 Jul 2009 23:25:25 +0200 |
parents | b46063eabe98 |
children | 9261667e9b82 |
files | mercurial/minirst.py tests/test-minirst.py tests/test-minirst.py.out |
diffstat | 3 files changed, 646 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/mercurial/minirst.py Thu Jul 16 23:25:25 2009 +0200
#
# minirst.py - minimal reStructuredText parser
#
# Copyright 2009 Matt Mackall <mpm@selenic.com> and others
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2, incorporated herein by reference.

"""simplified reStructuredText parser.

This parser knows just enough about reStructuredText to parse the
Mercurial docstrings.

It cheats in a major way: nested blocks are not really nested. They
are just indented blocks that look like they are nested. This relies
on the user to keep the right indentation for the blocks.

It only supports a small subset of reStructuredText:

- paragraphs

- definition lists (must use '  ' to indent definitions)

- lists (items must start with '-')

- literal blocks

- option lists (supports only long options without arguments)

- inline markup is not recognized at all.
"""

import re
import sys
import textwrap


def findblocks(text):
    """Find continuous blocks of lines in text.

    Returns a list of dictionaries representing the blocks. Each block
    has an 'indent' field (the smallest indentation of any of its
    lines) and a 'lines' field (the lines with that common indentation
    removed). Blank lines separate blocks and are not kept.
    """
    blocks = [[]]
    for line in text.splitlines():
        if line.strip():
            blocks[-1].append(line)
        elif blocks[-1]:
            # A blank line closes the current (non-empty) block.
            blocks.append([])
    if not blocks[-1]:
        # Trailing blank lines (or empty input) leave an empty block.
        del blocks[-1]

    for i, block in enumerate(blocks):
        indent = min((len(l) - len(l.lstrip())) for l in block)
        blocks[i] = dict(indent=indent, lines=[l[indent:] for l in block])
    return blocks


def findliteralblocks(blocks):
    """Finds literal blocks and adds a 'type' field to the blocks.

    Literal blocks are given the type 'literal', all other blocks are
    given the type 'paragraph'.

    The list is modified in place (a paragraph consisting only of
    '::' is removed, and the '::' marker is stripped from the
    introducing paragraph) and is also returned.
    """
    i = 0
    while i < len(blocks):
        # Searching for a block that looks like this:
        #
        # +------------------------------+
        # | paragraph                    |
        # | (ends with "::")             |
        # +------------------------------+
        #    +---------------------------+
        #    | indented literal block    |
        #    +---------------------------+
        blocks[i]['type'] = 'paragraph'
        if blocks[i]['lines'][-1].endswith('::') and i + 1 < len(blocks):
            indent = blocks[i]['indent']
            adjustment = blocks[i + 1]['indent'] - indent
            removed = False

            if blocks[i]['lines'] == ['::']:
                # Expanded form: remove block. After this, blocks[i]
                # is the block *before* the marker (or, when the
                # marker was the first block, i is -1).
                del blocks[i]
                i -= 1
                removed = True
            elif blocks[i]['lines'][-1].endswith(' ::'):
                # Partially minimized form: remove space and both
                # colons.
                blocks[i]['lines'][-1] = blocks[i]['lines'][-1][:-3]
            else:
                # Fully minimized form: remove just one colon.
                blocks[i]['lines'][-1] = blocks[i]['lines'][-1][:-1]

            # List items are formatted with a hanging indent. We must
            # correct for this here while we still have the original
            # information on the indentation of the subsequent literal
            # blocks available.
            #
            # This only applies when the introducing paragraph itself
            # is the list item. If the '::' paragraph was removed,
            # blocks[i] is an unrelated neighbour (possibly blocks[-1]
            # through index wrap-around) and must not influence the
            # indentation of the literal block.
            if not removed and blocks[i]['lines'][0].startswith('- '):
                indent += 2
                adjustment -= 2

            # Mark the following indented blocks.
            while i + 1 < len(blocks) and blocks[i + 1]['indent'] > indent:
                blocks[i + 1]['type'] = 'literal'
                blocks[i + 1]['indent'] -= adjustment
                i += 1
        i += 1
    return blocks


def findsections(blocks):
    """Finds sections.

    The blocks must have a 'type' field, i.e., they should have been
    run through findliteralblocks first.

    A two-line paragraph whose second line consists of exactly as many
    '-' characters as the first line is long gets the type 'section'.
    """
    for block in blocks:
        # Searching for a block that looks like this:
        #
        # +------------------------------+
        # | Section title                |
        # | -------------                |
        # +------------------------------+
        if (block['type'] == 'paragraph' and
            len(block['lines']) == 2 and
            block['lines'][1] == '-' * len(block['lines'][0])):
            block['type'] = 'section'
    return blocks


def findbulletlists(blocks):
    """Finds bullet lists.

    The blocks must have a 'type' field, i.e., they should have been
    run through findliteralblocks first.

    A paragraph whose first line starts with '- ' is replaced by one
    'bullet' block per item. The '- ' prefix is removed from the item
    text and the item's indent is increased by two to account for it.
    """
    i = 0
    while i < len(blocks):
        # Searching for a paragraph that looks like this:
        #
        # +------+-----------------------+
        # | "- " | list item             |
        # +------| (body elements)+      |
        #        +-----------------------+
        if (blocks[i]['type'] == 'paragraph' and
            blocks[i]['lines'][0].startswith('- ')):
            items = []
            for line in blocks[i]['lines']:
                if line.startswith('- '):
                    items.append(dict(type='bullet', lines=[],
                                      indent=blocks[i]['indent'] + 2))
                    line = line[2:]
                items[-1]['lines'].append(line)
            blocks[i:i + 1] = items
            # Skip over the items that were just inserted.
            i += len(items) - 1
        i += 1
    return blocks


# A long option, an optional argument, and at least two spaces before
# the description.
_optionre = re.compile(r'^(--[a-z-]+)((?:[ =][a-zA-Z][\w-]*)?  +)(.*)$')
def findoptionlists(blocks):
    """Finds option lists.

    The blocks must have a 'type' field, i.e., they should have been
    run through findliteralblocks first.

    A paragraph whose first line matches _optionre is replaced by one
    'option' block per matching line. Each option block records in
    'width' the length of the option plus its separator, which is used
    as the hanging indent of the description.
    """
    i = 0
    while i < len(blocks):
        # Searching for a paragraph that looks like this:
        #
        # +----------------------------+-------------+
        # | "--" option "  "           | description |
        # +-------+--------------------+             |
        #         | (body elements)+                 |
        #         +----------------------------------+
        if (blocks[i]['type'] == 'paragraph' and
            _optionre.match(blocks[i]['lines'][0])):
            options = []
            for line in blocks[i]['lines']:
                m = _optionre.match(line)
                if m:
                    option, arg, rest = m.groups()
                    width = len(option) + len(arg)
                    options.append(dict(type='option', lines=[],
                                        indent=blocks[i]['indent'],
                                        width=width))
                # Non-matching lines continue the previous option.
                options[-1]['lines'].append(line)
            blocks[i:i + 1] = options
            i += len(options) - 1
        i += 1
    return blocks


def finddefinitionlists(blocks):
    """Finds definition lists.

    The blocks must have a 'type' field, i.e., they should have been
    run through findliteralblocks first.

    A paragraph whose first line is not indented and whose second line
    is indented by at least two spaces is replaced by one 'definition'
    block per term. The 'hang' field holds the indentation of the last
    line of the block and is used to indent the definition text.
    """
    i = 0
    while i < len(blocks):
        # Searching for a paragraph that looks like this:
        #
        # +----------------------------+
        # | term                       |
        # +--+-------------------------+--+
        #    | definition                 |
        #    | (body elements)+           |
        #    +----------------------------+
        if (blocks[i]['type'] == 'paragraph' and
            len(blocks[i]['lines']) > 1 and
            not blocks[i]['lines'][0].startswith('  ') and
            blocks[i]['lines'][1].startswith('  ')):
            definitions = []
            for line in blocks[i]['lines']:
                if not line.startswith('  '):
                    definitions.append(dict(type='definition', lines=[],
                                            indent=blocks[i]['indent']))
                definitions[-1]['lines'].append(line)
                # The last line seen determines the hanging indent.
                definitions[-1]['hang'] = len(line) - len(line.lstrip())
            blocks[i:i + 1] = definitions
            i += len(definitions) - 1
        i += 1
    return blocks


def addmargins(blocks):
    """Adds empty blocks for vertical spacing.

    This groups bullets, options, and definitions together with no vertical
    space between them, and adds an empty block between all other blocks.
    """
    i = 1
    while i < len(blocks):
        if (blocks[i]['type'] == blocks[i - 1]['type'] and
            blocks[i]['type'] in ('bullet', 'option', 'definition')):
            i += 1
        else:
            blocks.insert(i, dict(lines=[''], indent=0, type='margin'))
            # Step over both the margin and the block it precedes.
            i += 2
    return blocks


def formatblock(block, width):
    """Format a block according to width.

    Literal blocks and sections are emitted line by line without
    wrapping, margins become an empty string, and all other blocks are
    re-flowed with textwrap.fill to fit within width columns.
    """
    indent = ' ' * block['indent']
    if block['type'] == 'margin':
        return ''
    elif block['type'] in ('literal', 'section'):
        return indent + ('\n' + indent).join(block['lines'])
    elif block['type'] == 'definition':
        term = indent + block['lines'][0]
        defindent = indent + block['hang'] * ' '
        text = ' '.join(map(str.strip, block['lines'][1:]))
        return "%s\n%s" % (term, textwrap.fill(text, width=width,
                                               initial_indent=defindent,
                                               subsequent_indent=defindent))
    else:
        initindent = subindent = indent
        text = ' '.join(map(str.strip, block['lines']))
        if block['type'] == 'bullet':
            # The bullet indent includes the two columns taken by '- '.
            initindent = indent[:-2] + '- '
            subindent = indent
        elif block['type'] == 'option':
            subindent = indent + block['width'] * ' '

        return textwrap.fill(text, width=width,
                             initial_indent=initindent,
                             subsequent_indent=subindent)


def format(text, width):
    """Parse and format the text according to width."""
    blocks = findblocks(text)
    blocks = findliteralblocks(blocks)
    blocks = findsections(blocks)
    blocks = findbulletlists(blocks)
    blocks = findoptionlists(blocks)
    blocks = finddefinitionlists(blocks)
    blocks = addmargins(blocks)
    return '\n'.join(formatblock(b, width) for b in blocks)


if __name__ == "__main__":
    from pprint import pprint

    def debug(func, blocks):
        """Run one parsing pass and dump the resulting blocks."""
        blocks = func(blocks)
        # Single-argument print(...) calls produce the same output
        # under both the print statement and the print function.
        print("*** after %s:" % func.__name__)
        pprint(blocks)
        print('')
        return blocks

    fp = open(sys.argv[1])
    try:
        text = fp.read()
    finally:
        fp.close()

    blocks = debug(findblocks, text)
    for step in (findliteralblocks, findsections, findbulletlists,
                 findoptionlists, finddefinitionlists, addmargins):
        blocks = debug(step, blocks)
    print('\n'.join(formatblock(b, 30) for b in blocks))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/test-minirst.py Thu Jul 16 23:25:25 2009 +0200 @@ -0,0 +1,138 @@ +#!/usr/bin/env python + +from mercurial import minirst + +def debugformat(title, text, width): + print "%s formatted to fit within %d characters:" % (title, width) + print "-" * 70 + print minirst.format(text, width) + print "-" * 70 + print + +paragraphs = """ +This is some text in the first paragraph. + + An indented paragraph + with just two lines. + + +The third paragraph. It is followed by some +random lines with spurious spaces. + + + + + +No indention + here, despite +the uneven left + margin. + + Only the + left-most line + (this line!) + is significant + for the indentation + +""" + +debugformat('paragraphs', paragraphs, 60) +debugformat('paragraphs', paragraphs, 30) + + +definitions = """ +A Term + Definition. The indented + lines make up the definition. +Another Term + Another definition. The final line in the + definition determines the indentation, so + this will be indented with four spaces. + + A Nested/Indented Term + Definition. +""" + +debugformat('definitions', definitions, 60) +debugformat('definitions', definitions, 30) + + +literals = r""" +The fully minimized form is the most +convenient form:: + + Hello + literal + world + +In the partially minimized form a paragraph +simply ends with space-double-colon. :: + + //////////////////////////////////////// + long un-wrapped line in a literal block + \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ + +:: + + This literal block is started with '::', + the so-called expanded form. The paragraph + with '::' disappears in the final output. +""" + +debugformat('literals', literals, 60) +debugformat('literals', literals, 30) + + +lists = """ +- This is the first list item. + + Second paragraph in the first list item. + +- List items need not be separated + by a blank line. +- And will be rendered without + one in any case. 
+ +We can have indented lists: + + - This is an indented list item + + - Another indented list item:: + + - A literal block in the middle + of an indented list. + + (The above is not a list item since we are in the literal block.) + +:: + + Literal block with no indentation. +""" + +debugformat('lists', lists, 60) +debugformat('lists', lists, 30) + + +options = """ +There is support for simple option lists, +but only with long options: + +--all Output all. +--both Output both (this description is + quite long). +--long Output all day long. + +--par This option has two paragraphs in its description. + This is the first. + + This is the second. Blank lines may be omitted between + options (as above) or left in (as here). + +The next paragraph looks like an option list, but lacks the two-space +marker after the option. It is treated as a normal paragraph: + +--foo bar baz +""" + +debugformat('options', options, 60) +debugformat('options', options, 30)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/test-minirst.py.out Thu Jul 16 23:25:25 2009 +0200 @@ -0,0 +1,209 @@ +paragraphs formatted to fit within 60 characters: +---------------------------------------------------------------------- +This is some text in the first paragraph. + + An indented paragraph with just two lines. + +The third paragraph. It is followed by some random lines +with spurious spaces. + +No indention here, despite the uneven left margin. + + Only the left-most line (this line!) is significant for + the indentation +---------------------------------------------------------------------- + +paragraphs formatted to fit within 30 characters: +---------------------------------------------------------------------- +This is some text in the first +paragraph. + + An indented paragraph with + just two lines. + +The third paragraph. It is +followed by some random lines +with spurious spaces. + +No indention here, despite the +uneven left margin. + + Only the left-most line + (this line!) is significant + for the indentation +---------------------------------------------------------------------- + +definitions formatted to fit within 60 characters: +---------------------------------------------------------------------- +A Term + Definition. The indented lines make up the definition. +Another Term + Another definition. The final line in the definition + determines the indentation, so this will be indented + with four spaces. + A Nested/Indented Term + Definition. +---------------------------------------------------------------------- + +definitions formatted to fit within 30 characters: +---------------------------------------------------------------------- +A Term + Definition. The indented + lines make up the + definition. +Another Term + Another definition. The + final line in the + definition determines the + indentation, so this will + be indented with four + spaces. + A Nested/Indented Term + Definition. 
+---------------------------------------------------------------------- + +literals formatted to fit within 60 characters: +---------------------------------------------------------------------- +The fully minimized form is the most convenient form: + +Hello + literal + world + +In the partially minimized form a paragraph simply ends with +space-double-colon. + +//////////////////////////////////////// +long un-wrapped line in a literal block +\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ + +This literal block is started with '::', + the so-called expanded form. The paragraph + with '::' disappears in the final output. +---------------------------------------------------------------------- + +literals formatted to fit within 30 characters: +---------------------------------------------------------------------- +The fully minimized form is +the most convenient form: + +Hello + literal + world + +In the partially minimized +form a paragraph simply ends +with space-double-colon. + +//////////////////////////////////////// +long un-wrapped line in a literal block +\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ + +This literal block is started with '::', + the so-called expanded form. The paragraph + with '::' disappears in the final output. +---------------------------------------------------------------------- + +lists formatted to fit within 60 characters: +---------------------------------------------------------------------- +- This is the first list item. + + Second paragraph in the first list item. + +- List items need not be separated by a blank line. +- And will be rendered without one in any case. + +We can have indented lists: + + - This is an indented list item + - Another indented list item: + + - A literal block in the middle + of an indented list. + + (The above is not a list item since we are in the literal block.) + +Literal block with no indentation. 
+---------------------------------------------------------------------- + +lists formatted to fit within 30 characters: +---------------------------------------------------------------------- +- This is the first list item. + + Second paragraph in the + first list item. + +- List items need not be + separated by a blank line. +- And will be rendered without + one in any case. + +We can have indented lists: + + - This is an indented list + item + - Another indented list + item: + + - A literal block in the middle + of an indented list. + + (The above is not a list item since we are in the literal block.) + +Literal block with no indentation. +---------------------------------------------------------------------- + +options formatted to fit within 60 characters: +---------------------------------------------------------------------- +There is support for simple option lists, but only with long +options: + +--all Output all. +--both Output both (this description is quite long). +--long Output all day long. +--par This option has two paragraphs in its + description. This is the first. + + This is the second. Blank lines may be omitted + between options (as above) or left in (as here). + +The next paragraph looks like an option list, but lacks the +two-space marker after the option. It is treated as a normal +paragraph: + +--foo bar baz +---------------------------------------------------------------------- + +options formatted to fit within 30 characters: +---------------------------------------------------------------------- +There is support for simple +option lists, but only with +long options: + +--all Output all. +--both Output both (this + description is + quite long). +--long Output all day + long. +--par This option has two + paragraphs in its + description. This + is the first. + + This is the second. + Blank lines may be + omitted between + options (as above) + or left in (as + here). 
+ +The next paragraph looks like +an option list, but lacks the +two-space marker after the +option. It is treated as a +normal paragraph: + +--foo bar baz +---------------------------------------------------------------------- +