#!/usr/libexec/platform-python
#
# makeman -- compile netpbm's stereotyped HTML to troff markup
#
# This approach works because we control the entire document universe 
# this is going to convert and can reinforce useful stereotypes.
#
# The output of this tool uses cliches parseable by doclifter,
# which should thus be able to recover all the semantic information
# it looks like this thing is losing.
#
# Known bugs:
#  * Ordered lists are smashed into unordered lists
#
# Limitations:
#  * IMG tags are issued as .IMG preceded by a bolded caption containing
#    the alt content.  This will only work if the page is formatted with
#    mwww macros.
#  * Loses summary information from tables.
#  * Only permits one <HR> in the HTML, right before the index.
#
# You can use the <?makeman ?> PI to pass text directly through to the
# generated manual page.  A major use is to insert format lines for tables.
#
# By Eric S. Raymond <esr@thyrsus.com>
# Version 1.0, July 26 2004
#
# Modified by Akira F. Urushibata <afu@wta.att.ne.jp>
# Version 1.1, February 11 2016
#
#   Added ability to process &mdash; &minus;
#   Added footer message to clarify original source. 
#

import os, sys, re

source = "netpbm documentation"
section = 1

warning = r'''\
.\" This man page was generated by the Netpbm tool 'makeman' from HTML source.
.\" Do not hand-hack it!  If you have bug fixes or improvements, please find
.\" the corresponding HTML page on the Netpbm website, generate a patch
.\" against that, and send it to the Netpbm maintainer.
'''

footerprefix = '''.SH DOCUMENT SOURCE
This manual page was generated by the Netpbm tool 'makeman' from HTML
source.  The master documentation is at
.IP
.B http://netpbm.sourceforge.net/doc/'''

class LiftException(Exception):
    def __init__(self, message, retval=1):
        self.message = message
        self.retval = retval

def makeman(name, file, indoc):
    "Transform a string representing an HTML document into man markup."
    global section, sectmap
    # Dot at left margin confuses troff.
    # This program generates these,
    indoc = indoc.replace("\n.", "\n@%@%@")
    # Protect escapes before we try generating font changes.
    indoc = indoc.replace("\\", r"\e")
    # Header-bashing
    indoc = re.sub('(?i)<!DOCTYPE html[^>]*>', "", indoc)
    indoc = indoc.replace('<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">', "")
    indoc = indoc.replace('<meta http-equiv="Content-Type" content="text/html; charset=us-ascii"/>', "")
    indoc = indoc.replace('<?xml version="1.1" encoding="iso-8859-1" ?>\n',"")
    indoc = indoc.replace('<html xmlns="http://www.w3.org/1999/xhtml">', "")
    indoc = indoc.replace('<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">', "")
    indoc = indoc.replace("<HEAD>", "").replace("</HEAD>", "")
    indoc = indoc.replace("<head>", "").replace("</head>", "")
    indoc = re.sub('(?i)<A HREF="#index">Table Of Contents</A>', "", indoc)
    datematch = re.compile("Updated: (.*)\n")
    match = datematch.search(indoc)
    if match:
        date = match.group(1)
    else:
        date = ""
    indoc = datematch.sub("", indoc)
    namematch = re.compile("<H1>(.*)</H1>", re.I)
    match = namematch.search(indoc)
    if match:
        name = match.group(1)
    else:
        name = None
    section = 1
    meta = re.compile('(?i)<META NAME="manual_section" CONTENT="([0-9])">')
    match = meta.search(indoc)
    if match:
        section = int(match.group(1))
        indoc = meta.sub("", indoc)
    else:
        section = sectmap.get(name, 0)
    indoc = namematch.sub("", indoc)
    indoc = re.sub("(?i)<BODY[^>]*>", "", indoc)
    indoc = re.sub("(?i)<HTML>", "", indoc)
    # Remove more superfluous headers
    titlematch = re.compile("<TITLE>(.*)</TITLE>\n+", re.I)
    match = titlematch.search(indoc)
    if match:
        title = match.group(1)
    else:
        title = None
    indoc = titlematch.sub("", indoc)
    indoc = re.sub("(?i)\n*<BR>\n+", "\n", indoc)
    indoc = re.sub("(?i)<BR>", "\n", indoc)
    indoc = ('.TH "%s" %d "%s" "%s"\n' % (title,section,date,source)) + indoc
    # Literal layout
    indoc = re.sub("(?i)\n *<PRE>", "\n.nf", indoc)
    indoc = re.sub("(?i)\n *</PRE>", "\n.fi", indoc)
    indoc = re.sub("(?i)\n *<BLOCKQUOTE>", "\n.RS", indoc)
    indoc = re.sub("(?i)\n *</BLOCKQUOTE>", "\n.RE", indoc)
    # Highlight processing
    indoc = re.sub("(?i)<B>", r"\\fB", indoc)
    indoc = re.sub("(?i)</B>", r"\\fP", indoc)
    indoc = re.sub("(?i)<EM>", r"\\fI", indoc)
    indoc = re.sub("(?i)</EM>", r"\\fP", indoc)
    indoc = re.sub("(?i)<CITE>", r"\\fI", indoc)
    indoc = re.sub("(?i)</CITE>", r"\\fP", indoc)
    indoc = re.sub("(?i)<I>", r"\\fI", indoc)
    indoc = re.sub("(?i)</I>", r"\\fP", indoc)
    indoc = re.sub("(?i)<TT>", r"\\f(CW", indoc)
    indoc = re.sub("(?i)</TT>", r"\\fP", indoc)
    indoc = re.sub("(?i)<KBD>", r"\\f(CW", indoc)
    indoc = re.sub("(?i)</KBD>", r"\\fP", indoc)
    indoc = re.sub("(?i)<CODE>", r"\\f(CW", indoc)
    indoc = re.sub("(?i)</CODE>", r"\\fP", indoc)
    indoc = re.sub("(?i)<STRONG>", r"\\fB", indoc)
    indoc = re.sub("(?i)</STRONG>", r"\\fP", indoc)
    indoc = re.sub("(?i)<SUP>", r"\\u", indoc)
    indoc = re.sub("(?i)</SUP>", r"\\d", indoc)
    # Paragraph handling
    indoc = re.sub("(?i)\n*<P>\n*", r"\n.PP\n", indoc)
    indoc = re.sub("(?i)<br */>", r"\n.PP\n", indoc)
    indoc = re.sub("(?i)</P>", "", indoc)
    indoc = re.sub("(?i)<!--[^>]*-->", "", indoc)
    indoc = re.sub("(?i)<meta[^>]*>", "", indoc)
    lines = indoc.split("\n")
    listdepth = 0
    for i in range(len(lines)):
        lowered = lines[i].lower()
        if "<dl" in lowered or "<ol" in lowered or "<ul" in lowered:
            listdepth += 1
        if listdepth:
            lines[i] = lines[i].replace(".PP", ".sp")
        if "</dl>" in lowered or "</ol>" in lowered or "</ul>" in lowered:
            listdepth -= 1
    indoc = "\n".join(lines)
    indoc = re.sub(r"\s*\.sp", "\n.sp", indoc)
    # Format email addresses as italic
    indoc = re.sub('(?i)<A[ \n]+HREF="mailto:[^>]+">([^<]+)</A>', r'\\fI\1\\fP', indoc)    
    # Format manual crossreferences
    def xrefmatch(match):
        xrefto = match.group(2)
        xrefurl = match.group(1)
        xrefsection = sectmap.get(xrefurl, 1)
        if xrefsection == 0:
            return "\n.I " + xrefto
        else:
            return '\n.BR "%s" (%d)\\c\n\\&' % (xrefto, xrefsection)
    indoc = re.sub(r'(?i)\n* *(?:\\fB)?<A[ \n]+HREF="?([^>]+.html)"?>([^<]+)</A>(?:\\fP)?',
                   xrefmatch, indoc)
    # Format URLs
    def urlmatch(match):
        url = match.group(1).replace('\n', ' ')
        txt = match.group(2).replace('\n', ' ')
        return "\n.UR %s\n%s\n.UE\n\\&" % (url, txt)
    indoc = re.sub(r'(?i)\n*(?:&lt;)?<A[ \n]+HREF *= *"([^>]+)">([^<]+)</A>(?:&gt;)?',
                  urlmatch, indoc)
    # Turn some entities into harmless cookies
    indoc = indoc.replace("&lt;", "@#!#@").replace("&gt;", "#@!@#").replace("&amp;", "#!@!@!#")
    indoc = indoc.replace("&#215;", r"\(mu")
    indoc = indoc.replace("&#174;", r"\*R")
    indoc = indoc.replace("&copy;", r"\(co")
    # Turn anchors into .UN tags
    indoc = re.sub('(?i)<A NAME *= *"#?([a-zA-Z_][a-zA-Z_0-9.-]+)">(?:&nbsp;)*</A>\s*', ".UN \\1\n", indoc)
    # Strip off the index trailer
    trailer = re.compile('<HR */*>.*', re.DOTALL | re.IGNORECASE)
    indoc = re.sub(trailer, "", indoc)
    # If there was no index trailer, we still need to strip these
    indoc = indoc.replace("</BODY>", "").replace("</HTML>", "")
    indoc = indoc.replace("</body>", "").replace("</html>", "")
    # Recognize sections with IDs
    indoc = re.sub('(?i)<H2><A (?:ID|NAME)="([a-zA-Z][_a-zA-Z0-9-]+)">([^><]*)</A></H2>',
                   ".UN \\1\n.SH \\2", indoc)
    indoc = re.sub('(?i)<H3><A (?:ID|NAME)="([a-zA-Z][_a-zA-Z0-9-]+)">([^><]*)</A></H3>',
                   ".UN \\1\n.SS \\2", indoc)
    indoc = re.sub('(?i)<H4><A (?:ID|NAME)="([a-zA-Z][_a-zA-Z0-9-]+)">([^><]*)</A></H4>',
                   ".UN \\1\n.B \\2", indoc)
    indoc = re.sub('(?i)<H2 (?:ID|NAME)="([a-zA-Z][_a-zA-Z0-9-]+)">([^><]*)</H2>',
                   ".UN \\1\n.SH \\2", indoc)
    indoc = re.sub('(?i)<H3 (?:ID|NAME)="([a-zA-Z][_a-zA-Z0-9-]+)">([^><]*)</H3>',
                   ".UN \\1\n.SS \\2", indoc)
    indoc = re.sub('(?i)<H4 (?:ID|NAME)="([a-zA-Z][_a-zA-Z0-9-]+)">([^><]*)</H4>',
                   ".UN \\1\n.B \\2", indoc)
    # Sections without IDs
    indoc = re.sub('(?i)<H2>([^><]*)</H2>', ".SH \\1", indoc)
    indoc = re.sub('(?i)<H3>([^><]*)</H3>', ".SS \\1", indoc)
    indoc = re.sub('(?i)<H4>([^><]*)</H4>', ".B \\1", indoc)
    # 
    # Process definition lists -- just turn them into .TPs
    indoc = re.sub("(?i) *<DL *(COMPACT)?>", "", indoc)
    indoc = re.sub("(?i) *</DL>", "", indoc)
    indoc = re.sub("(?i) *<DT>", ".TP\n", indoc)
    indoc = re.sub("(?i) *</DT>", "", indoc)
    indoc = re.sub("(?i)\n*<DD>\n*", "\n", indoc)
    indoc = re.sub("(?i) *</DD>", "", indoc)
    # Process unordered lists -- just turn them into .TPs
    indoc = re.sub("(?i)</?[UO]L *(COMPACT)?>", "", indoc)
    indoc = re.sub("(?i) *<LI>", ".IP \(bu\n", indoc)
    indoc = re.sub("(?i) *</LI>", "", indoc)
    # No-print tags
    indoc = re.sub("<!--no_print-->.*", "", indoc)
    # Passthrough
    indoc = re.sub(r"<\?makeman (.*) \?>", r'\1', indoc)
    # Comments
    indoc = re.sub("<!--([^\n])*-->", r'.\"\1', indoc)
    # Acronyms
    indoc = re.sub('<acronym [a-zA-Z0-9:= \n"]*>', "", indoc)
    indoc = re.sub("</acronym>", "", indoc)
    # Image tags
    indoc = re.sub(' *<img src="([^"]*)" alt="([^"]*)"( *[a-z]*="?[0-9]*"?)*>', ".B \\2\n.IMG -C \\1", indoc)
    # Special characters
    indoc = indoc.replace("&quot;", "'")
    indoc = indoc.replace("&nbsp;", "\\ ")
    indoc = indoc.replace("&minus;", "-")
    indoc = indoc.replace("&mdash;", "-")
    indoc = indoc.replace("&mu;", "mu")
    indoc = indoc.replace("&sigma;", "sigma")
    # Tables
    # This will not handle rowspan
    indoc = re.sub('(?i) *<table[^>]*>.*', ".TS", indoc)
    indoc = re.sub("(?i) *</table>.*", ".TE", indoc)
    # First the single-line case
    indoc = re.sub("(?i)</td> *<td>", "\t", indoc)
    indoc = re.sub("(?i)<tr> *<td>", "", indoc)
    indoc = re.sub("(?i)</td> *</tr>", "", indoc)
    # Then the multiline case
    indoc = re.sub(r'(?i)\s*<t[hd][^>]*>([^<\n]*)</t[dh]>\s*', '\t\\1', indoc)
    indoc = re.sub(r'(?i)\s*<t[hd][^>]*>([^<]*)</t[dh]>\s*', '\tT{\n\\1T}', indoc)
    indoc = indoc.replace("\n\\&T}", "\nT}")
    indoc = re.sub("(?i) *</tr>", "", indoc)
    indoc = re.sub("(?i) *<tr[^>]*>\t*", "", indoc)
    indoc = re.sub(r"\.TS\s+<[Cc][Aa][Pp][Tt][Ii][Oo][Nn]>([^<]*)</[Cc][Aa][Pp][Tt][Ii][Oo][Nn]>\s*", ".B \\1\n.TS\n", indoc)
    # Debugging
    #sys.stderr.write("Name: %s, Title: %s, Date: %s\n" % (name, title, date))
    # Time for error checking now
    badlines = []
    for line in indoc.split("\n"):
        if "<" in line or ">" in line.replace(" >", "") or re.search(r'(?<!^\\)&.*;', line):
            badlines.append(line)
    if badlines:
        sys.stderr.write(("Bad lines from %s:\n-----------------\n" % file) + "\n".join(badlines) + "\n-----------------\n")
    # Goes after bad-line check so we don't misinterpret it as an error
    indoc = indoc.replace("@#!#@", "<").replace("#@!@#", ">").replace("#!@!@!#", "&")
    indoc = re.sub("\n+$", "\n", indoc)
    # Single-quote at left margin confuses troff.
    # This program never generates these.
    indoc = indoc.replace("\n'", "\n\\&'")
    # Finish guarding against leading dots.
    indoc = indoc.replace("\n@%@%@", "\n\\&.")
    # Mark these generated pages so people won't hand-hack them.
    indoc = warning + indoc
    indoc = indoc + footerprefix + os.path.basename(file) +"\n.PP"
    return indoc

def main(args, mainout=sys.stdout, mainerr=sys.stderr):
    global sectmap
    import getopt
    (options, arguments) = getopt.getopt(args, "vd:")
    dirprefix = ""
    verbosity = 0
    for (switch, val) in options:
        if switch == '-d':	# Set HTML input directory
            dirprefix = val
        elif switch == '-v':	# Enable verbose error reporting
            verbosity += 1
    try:
        # First pass: gather locations for crossreferences:
        sectmap = {}
        for file in arguments:
            try: 
                infp = open(os.path.join(dirprefix, file))
            except:
                sys.stderr.write("makeman: can't open %s\n" % file)
                continue
            indoc = infp.read()
            infp.close()
            namere = re.compile("<H1>(.*)</H1>", re.I)
            namematch = namere.search(indoc)
            titlere = re.compile("<TITLE>(.*)</TITLE>", re.I)
            titlematch = titlere.search(indoc)
            if not namematch:
                raise LiftException("name missing from %s" % file)
            if not titlematch:
                raise LiftException("title missing from %s" % file)
            else:
                title = titlematch.group(1)
                name = titlematch.group(1)
            meta = re.compile('(?i)<META NAME="manual_section" CONTENT="([0-9])">')
            match = meta.search(indoc)
            if match:
                section = int(match.group(1))
                sectmap[title] = sectmap[file] = sectmap[name] = section
            else:
                sectmap[title] = sectmap[file] = sectmap[name] = 1
            hr = re.compile("(?i)<HR>")
            firsthr = hr.search(indoc)
            if firsthr and hr.search(indoc[firsthr.start(0)+4:]):
                LiftException("%s has two <HR> tags!" % file)
        # Second pass: do formatting
        for file in arguments:
            try: 
                infp = open(os.path.join(dirprefix, file))
            except:
                sys.stderr.write("makeman: can't open %s\n" % file)
                continue
            indoc = infp.read()
            infp.close()
            tempfile = file + ".~%s-%d~" % (name, os.getpid())
            try:
                outfp = open(tempfile, "w")
            except OSError:
                sys.stderr.write("%s: can't open tempfile" % name)
                return True
            try:
                if verbosity:
                    sys.stderr.write("makeman: %s\n" % file)
                outdoc = makeman(name, file, indoc)
            except:
                os.remove(tempfile)
                raise
            if outdoc == indoc:
                os.remove(tempfile)
            if outdoc is None:
                continue
            else:
                outfp.write(outdoc)
                outfp.close()	# under Windows you can't rename an open file
                stem = file[:file.find(".")]
                os.rename(tempfile, stem + "." + repr(sectmap[file]))
    except LiftException as e:
        mainerr.write("makeman: " + e.message + "\n")
        return e.retval
    except IOError as e:
        mainerr.write("makeman: file I/O error: %s\n" % e)
        return 3
    except KeyboardInterrupt:
        mainerr.write("makeman: bailing out...\n")
        return 4
    except:
        if verbosity:
            raise
        else:
            mainerr.write("makeman: internal error!\n")
            return 5

if __name__ == "__main__":
    # Run the main sequence
    raise SystemExit(main(sys.argv[1:]))

# The following sets edit modes for GNU EMACS
# Local Variables:
# mode:python
# End: