#!/usr/bin/pypy

import sys, re, argparse

# Verbose regular expression recognising one SGML/XML tag, directive or
# comment at the start of a line of the vertical file.
SGML_TAG = r"""
    (?:                         # make enclosing parantheses non-grouping
    <!-- .*? -->                # XML/SGML comment
    |                           # -- OR --
    <[!?/]?(?!\d)\w[-\.:\w]*    # Start of tag/directive
    (?:                         # Attributes
        [^>'"]*                 # - attribute name (+whitespace +equal sign)
        (?:'[^']*'|"[^"]*")     # - attribute value
    )* 
    \s*                         # Spaces at the end
    /?                          # Forward slash at the end of singleton tags
    \s*                         # More spaces at the end
    >                           # +End of tag/directive
    )"""
SGML_TAG_RE = re.compile(SGML_TAG, re.UNICODE | re.VERBOSE | re.DOTALL)
# Opening or closing sentence tag (<s>, <s ...>, </s>).
SENT_TAG_RE = re.compile(r'^</?s[ >]')
# Marker line saying "no space between the previous and the next token".
GLUE_TAG = '<g/>'


def build_parser():
    """Return the command-line argument parser of this script."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="Converts a (tagged) vertical to untagged plain text file. SGML tags are preserved."
    )
    parser.add_argument("-l", "--max-line-width", help="maximum line width (0 = no limit)", type=int, default=80)
    parser.add_argument("-s", "--no-sentences", help="strip sentence (<s>) tags", action="store_true")
    return parser


def vertical_to_text(lines, max_line_width=80, no_sentences=False):
    """Convert lines of a vertical file into chunks of plain text.

    Each input line is either an SGML tag or a token optionally followed by
    tab-separated attributes (only the first column is kept).  Tokens are
    joined by single spaces and wrapped at ``max_line_width`` characters;
    tags are emitted on lines of their own.  A glue tag suppresses the space
    before the token that follows it.

    Args:
        lines: iterable of input lines (e.g. an open text file).
        max_line_width: maximum width of a text line; 0 disables wrapping.
        no_sentences: when true, sentence tags are dropped from the output.

    Yields:
        Pieces of output text, to be written in order without any separator.
    """
    glue = False
    line_width = 0  # number of characters already written on the current text line
    for line in lines:
        line = line.strip()
        if not line:
            # A blank line carries no token; treating it as one would emit
            # a stray separator space and distort the width accounting.
            continue
        if SGML_TAG_RE.match(line):
            if no_sentences and SENT_TAG_RE.match(line):
                # Stripped tags are invisible: keep both the current text
                # line and a pending glue flag untouched.
                continue
            if line == GLUE_TAG:
                glue = True
                continue
            if line_width > 0:
                # Terminate the running text line so the tag stands alone.
                yield '\n'
                line_width = 0
            yield line + '\n'
        else:
            token = line.split('\t')[0]
            if line_width > 0 and not glue:
                # The +1 accounts for the separating space.
                if max_line_width and line_width + len(token) + 1 > max_line_width:
                    yield '\n'
                    line_width = 0
                else:
                    yield ' '
                    line_width += 1
            yield token
            line_width += len(token)
        glue = False
    if line_width > 0:
        # Input ended in the middle of a text line: terminate it so that the
        # output always ends with a newline.
        yield '\n'


def main(argv=None):
    """Read a vertical file from stdin and write plain text to stdout.

    Args:
        argv: list of command-line arguments; defaults to ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)
    write = sys.stdout.write
    # The generator is consumed lazily, so output is streamed as input arrives.
    for chunk in vertical_to_text(sys.stdin, args.max_line_width, args.no_sentences):
        write(chunk)


if __name__ == '__main__':
    main()
