#!/usr/bin/python
#
# makeman -- compile netpbm's stereotyped HTML to troff markup
#
# This approach works because we control the entire document universe
# this is going to convert and can reinforce useful stereotypes.
#
# The output of this tool uses cliches parseable by doclifter,
# which should thus be able to recover all the semantic information
# it looks like this thing is losing.
#
# Known bugs:
# * Ordered lists are smashed into unordered lists
#
# Limitations:
# * IMG tags are issued as .IMG preceded by a bolded caption containing
# the alt content. This will only work if the page is formatted with
# mwww macros.
# * Loses summary information from tables.
# * Only permits one <HR> in the HTML, right before the index.
#
# You can use the <?makeman ?> PI to pass text directly through to the
# generated manual page. A major use is to insert format lines for tables.
#
# By Eric S. Raymond <esr@thyrsus.com>
# Version 1.0, July 26 2004
#
# Modified by Akira F. Urushibata <afu@wta.att.ne.jp>
# Version 1.1, February 11 2016
#
# Added ability to process &mdash; &minus;
# Added footer message to clarify original source.
#
import os, sys, re
# Manual section number; makeman() reassigns this global for each page
# it converts.
section = 1

# Last field of every generated .TH line.
source = "netpbm documentation"

# Troff comment block that makeman() puts at the very top of each page.
# The text deliberately starts with a lone backslash followed by a newline
# (that is what the original raw triple-quoted literal contained).
warning = (
    "\\\n"
    ".\\\" This man page was generated by the Netpbm tool 'makeman' from HTML source.\n"
    ".\\\" Do not hand-hack it! If you have bug fixes or improvements, please find\n"
    ".\\\" the corresponding HTML page on the Netpbm website, generate a patch\n"
    ".\\\" against that, and send it to the Netpbm maintainer.\n"
)

# Closing section appended by makeman(); the base name of the HTML file is
# glued directly onto the end of the URL on the last line.
footerprefix = (
    ".SH DOCUMENT SOURCE\n"
    "This manual page was generated by the Netpbm tool 'makeman' from HTML\n"
    "source. The master documentation is at\n"
    ".IP\n"
    ".B http://netpbm.sourceforge.net/doc/"
)
class LiftException(Exception):
    """Fatal conversion error carrying a message and an exit status.

    message -- human-readable description of what went wrong
    retval  -- value main() returns (and the script exits with) when it
               catches this exception; defaults to 1
    """
    def __init__(self, message, retval=1):
        # Initialise the base class so that str(e), repr(e) and e.args
        # carry the message instead of being empty.
        Exception.__init__(self, message)
        self.message = message
        self.retval = retval
def makeman(name, file, indoc):
    """Transform a string representing an HTML document into man markup.

    name  -- ignored on entry; it is immediately replaced by the content of
             the document's <H1> element (or None if there is none)
    file  -- name of the HTML file; used in the bad-line report and, via
             its base name, in the footer URL
    indoc -- the HTML text to convert

    Returns the troff text: the module-level 'warning' header, the
    converted body, then 'footerprefix' plus the base name of FILE.

    Side effects: assigns the module-level 'section', reads the
    module-level 'sectmap' (which main() sets up), and reports lines that
    still look like HTML on sys.stderr.
    """
    global section, sectmap
    # Dots at the left margin confuse troff, and the passes below generate
    # request lines of their own, so hide the document's own leading dots
    # behind a cookie until the very end.
    indoc = indoc.replace("\n.", "\n@%@%@")
    # Protect escapes before we try generating font changes.
    indoc = indoc.replace("\\", r"\e")
    # Header-bashing
    indoc = re.sub('(?i)<!DOCTYPE html[^>]*>', "", indoc)
    for boilerplate in (
            '<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">',
            '<meta http-equiv="Content-Type" content="text/html; charset=us-ascii"/>',
            '<?xml version="1.1" encoding="iso-8859-1" ?>\n',
            '<html xmlns="http://www.w3.org/1999/xhtml">',
            '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">',
            "<HEAD>", "</HEAD>",
            "<head>", "</head>",
    ):
        indoc = indoc.replace(boilerplate, "")
    indoc = re.sub('(?i)<A HREF="#index">Table Of Contents</A>', "", indoc)
    # The "Updated:" line supplies the date field of the .TH line.
    datematch = re.compile("Updated: (.*)\n")
    match = datematch.search(indoc)
    if match:
        date = match.group(1)
    else:
        date = ""
    indoc = datematch.sub("", indoc)
    # The <H1> content is the page name, used below as a sectmap key.
    namematch = re.compile("<H1>(.*)</H1>", re.I)
    match = namematch.search(indoc)
    if match:
        name = match.group(1)
    else:
        name = None
    # An explicit manual_section meta tag wins; otherwise fall back on
    # whatever sectmap knows about this page name, or 0.
    meta = re.compile('(?i)<META NAME="manual_section" CONTENT="([0-9])">')
    match = meta.search(indoc)
    if match:
        section = int(match.group(1))
        indoc = meta.sub("", indoc)
    else:
        section = sectmap.get(name, 0)
    indoc = namematch.sub("", indoc)
    indoc = re.sub("(?i)<BODY[^>]*>", "", indoc)
    indoc = re.sub("(?i)<HTML>", "", indoc)
    # Remove more superfluous headers
    titlematch = re.compile("<TITLE>(.*)</TITLE>\n+", re.I)
    match = titlematch.search(indoc)
    if match:
        title = match.group(1)
    else:
        title = None
    indoc = titlematch.sub("", indoc)
    indoc = re.sub("(?i)\n*<BR>\n+", "\n", indoc)
    indoc = re.sub("(?i)<BR>", "\n", indoc)
    indoc = ('.TH "%s" %d "%s" "%s"\n' % (title, section, date, source)) + indoc
    # Literal layout
    indoc = re.sub("(?i)\n *<PRE>", "\n.nf", indoc)
    indoc = re.sub("(?i)\n *</PRE>", "\n.fi", indoc)
    indoc = re.sub("(?i)\n *<BLOCKQUOTE>", "\n.RS", indoc)
    indoc = re.sub("(?i)\n *</BLOCKQUOTE>", "\n.RE", indoc)
    # Highlight processing: (tag, replacement for <tag>, replacement for
    # </tag>).  The replacements are re.sub templates, hence the doubled
    # backslashes.
    highlights = (
        ("B", r"\\fB", r"\\fP"),
        ("EM", r"\\fI", r"\\fP"),
        ("CITE", r"\\fI", r"\\fP"),
        ("I", r"\\fI", r"\\fP"),
        ("TT", r"\\f(CW", r"\\fP"),
        ("KBD", r"\\f(CW", r"\\fP"),
        ("CODE", r"\\f(CW", r"\\fP"),
        ("STRONG", r"\\fB", r"\\fP"),
        ("SUP", r"\\u", r"\\d"),
    )
    for (tag, opener, closer) in highlights:
        indoc = re.sub("(?i)<%s>" % tag, opener, indoc)
        indoc = re.sub("(?i)</%s>" % tag, closer, indoc)
    # Paragraph handling
    indoc = re.sub("(?i)\n*<P>\n*", r"\n.PP\n", indoc)
    indoc = re.sub("(?i)<br */>", r"\n.PP\n", indoc)
    indoc = re.sub("(?i)</P>", "", indoc)
    indoc = re.sub("(?i)<!--[^>]*-->", "", indoc)
    indoc = re.sub("(?i)<meta[^>]*>", "", indoc)
    # Inside lists a paragraph break becomes .sp instead of .PP.  The depth
    # is tracked line by line: it goes up on a line containing a list
    # opener and down (after the line is processed) on one containing a
    # list closer.
    lines = indoc.split("\n")
    listdepth = 0
    for (i, line) in enumerate(lines):
        lowered = line.lower()
        if "<dl" in lowered or "<ol" in lowered or "<ul" in lowered:
            listdepth += 1
        if listdepth:
            lines[i] = line.replace(".PP", ".sp")
        if "</dl>" in lowered or "</ol>" in lowered or "</ul>" in lowered:
            listdepth -= 1
    indoc = "\n".join(lines)
    indoc = re.sub(r"\s*\.sp", "\n.sp", indoc)
    # Format email addresses as italic
    indoc = re.sub('(?i)<A[ \n]+HREF="mailto:[^>]+">([^<]+)</A>', r'\\fI\1\\fP', indoc)
    # Format manual crossreferences
    def xrefmatch(match):
        "Render a link to another .html page as a man-page reference."
        xrefto = match.group(2)
        xrefurl = match.group(1)
        xrefsection = sectmap.get(xrefurl, 1)
        if xrefsection == 0:
            return "\n.I " + xrefto
        else:
            return '\n.BR "%s" (%d)\\c\n\\&' % (xrefto, xrefsection)
    indoc = re.sub(r'(?i)\n* *(?:\\fB)?<A[ \n]+HREF="?([^>]+.html)"?>([^<]+)</A>(?:\\fP)?',
                   xrefmatch, indoc)
    # Format URLs
    def urlmatch(match):
        "Render any remaining link as a .UR/.UE pair."
        url = match.group(1).replace('\n', ' ')
        txt = match.group(2).replace('\n', ' ')
        return "\n.UR %s\n%s\n.UE\n\\&" % (url, txt)
    # A link may be wrapped in an escaped pair of angle brackets; swallow
    # those along with the link.
    indoc = re.sub(r'(?i)\n*(?:&lt;)?<A[ \n]+HREF *= *"([^>]+)">([^<]+)</A>(?:&gt;)?',
                   urlmatch, indoc)
    # Turn some entities into harmless cookies.  These must be the entity
    # names, not the bare characters: the real tags are still needed by the
    # passes below, and the bad-line check at the end relies on escaped
    # angle brackets and ampersands being hidden.
    indoc = indoc.replace("&lt;", "@#!#@").replace("&gt;", "#@!@#").replace("&amp;", "#!@!@!#")
    indoc = indoc.replace("&times;", r"\(mu")
    indoc = indoc.replace("&reg;", r"\*R")
    indoc = indoc.replace("&copy;", r"\(co")
    # Turn anchors into .UN tags
    indoc = re.sub(r'(?i)<A NAME *= *"#?([a-zA-Z_][a-zA-Z_0-9.-]+)">(?:&nbsp;)*</A>\s*',
                   ".UN \\1\n", indoc)
    # Strip off the index trailer: everything from the first <HR> onward.
    trailer = re.compile('<HR */*>.*', re.DOTALL | re.IGNORECASE)
    indoc = trailer.sub("", indoc)
    # If there was no index trailer, we still need to strip these
    indoc = indoc.replace("</BODY>", "").replace("</HTML>", "")
    indoc = indoc.replace("</body>", "").replace("</html>", "")
    # Recognize sections with IDs
    indoc = re.sub('(?i)<H2><A (?:ID|NAME)="([a-zA-Z][_a-zA-Z0-9-]+)">([^><]*)</A></H2>',
                   ".UN \\1\n.SH \\2", indoc)
    indoc = re.sub('(?i)<H3><A (?:ID|NAME)="([a-zA-Z][_a-zA-Z0-9-]+)">([^><]*)</A></H3>',
                   ".UN \\1\n.SS \\2", indoc)
    indoc = re.sub('(?i)<H4><A (?:ID|NAME)="([a-zA-Z][_a-zA-Z0-9-]+)">([^><]*)</A></H4>',
                   ".UN \\1\n.B \\2", indoc)
    indoc = re.sub('(?i)<H2 (?:ID|NAME)="([a-zA-Z][_a-zA-Z0-9-]+)">([^><]*)</H2>',
                   ".UN \\1\n.SH \\2", indoc)
    indoc = re.sub('(?i)<H3 (?:ID|NAME)="([a-zA-Z][_a-zA-Z0-9-]+)">([^><]*)</H3>',
                   ".UN \\1\n.SS \\2", indoc)
    indoc = re.sub('(?i)<H4 (?:ID|NAME)="([a-zA-Z][_a-zA-Z0-9-]+)">([^><]*)</H4>',
                   ".UN \\1\n.B \\2", indoc)
    # Sections without IDs
    indoc = re.sub('(?i)<H2>([^><]*)</H2>', ".SH \\1", indoc)
    indoc = re.sub('(?i)<H3>([^><]*)</H3>', ".SS \\1", indoc)
    indoc = re.sub('(?i)<H4>([^><]*)</H4>', ".B \\1", indoc)
    #
    # Process definition lists -- just turn them into .TPs
    indoc = re.sub("(?i) *<DL *(COMPACT)?>", "", indoc)
    indoc = re.sub("(?i) *</DL>", "", indoc)
    indoc = re.sub("(?i) *<DT>", ".TP\n", indoc)
    indoc = re.sub("(?i) *</DT>", "", indoc)
    indoc = re.sub("(?i)\n*<DD>\n*", "\n", indoc)
    indoc = re.sub("(?i) *</DD>", "", indoc)
    # Process unordered lists -- turn each item into a bulleted .IP.
    # Ordered lists get the same treatment (see "Known bugs").
    indoc = re.sub("(?i)</?[UO]L *(COMPACT)?>", "", indoc)
    indoc = re.sub("(?i) *<LI>", ".IP \\(bu\n", indoc)
    indoc = re.sub("(?i) *</LI>", "", indoc)
    # No-print tags
    indoc = re.sub("<!--no_print-->.*", "", indoc)
    # Passthrough
    indoc = re.sub(r"<\?makeman (.*) \?>", r'\1', indoc)
    # Comments that survived the earlier stripping become troff comments.
    # The group has to span the whole comment text; with the repeat outside
    # the group only its last character would be kept.
    indoc = re.sub("<!--([^\n]*)-->", r'.\"\1', indoc)
    # Acronyms
    indoc = re.sub('<acronym [a-zA-Z0-9:= \n"]*>', "", indoc)
    indoc = re.sub("</acronym>", "", indoc)
    # Image tags
    indoc = re.sub(' *<img src="([^"]*)" alt="([^"]*)"( *[a-z]*="?[0-9]*"?)*>',
                   ".B \\2\n.IMG -C \\1", indoc)
    # Special characters
    indoc = indoc.replace("&quot;", "'")
    indoc = indoc.replace("&nbsp;", "\\ ")
    indoc = indoc.replace("&minus;", "-")
    indoc = indoc.replace("&mdash;", "-")
    indoc = indoc.replace("&mu;", "mu")
    indoc = indoc.replace("&sigma;", "sigma")
    # Tables
    # This will not handle rowspan
    indoc = re.sub('(?i) *<table[^>]*>.*', ".TS", indoc)
    indoc = re.sub("(?i) *</table>.*", ".TE", indoc)
    # First the single-line case
    indoc = re.sub("(?i)</td> *<td>", "\t", indoc)
    indoc = re.sub("(?i)<tr> *<td>", "", indoc)
    indoc = re.sub("(?i)</td> *</tr>", "", indoc)
    # Then the multiline case
    indoc = re.sub(r'(?i)\s*<t[hd][^>]*>([^<\n]*)</t[dh]>\s*', '\t\\1', indoc)
    indoc = re.sub(r'(?i)\s*<t[hd][^>]*>([^<]*)</t[dh]>\s*', '\tT{\n\\1T}', indoc)
    indoc = indoc.replace("\n\\&T}", "\nT}")
    indoc = re.sub("(?i) *</tr>", "", indoc)
    indoc = re.sub("(?i) *<tr[^>]*>\t*", "", indoc)
    indoc = re.sub(r"\.TS\s+<[Cc][Aa][Pp][Tt][Ii][Oo][Nn]>([^<]*)</[Cc][Aa][Pp][Tt][Ii][Oo][Nn]>\s*",
                   ".B \\1\n.TS\n", indoc)
    # Debugging
    #sys.stderr.write("Name: %s, Title: %s, Date: %s\n" % (name, title, date))
    # Time for error checking now: anything that still contains an angle
    # bracket or something shaped like an entity was not understood.
    badlines = []
    for line in indoc.split("\n"):
        if "<" in line or ">" in line.replace(" >", "") or re.search(r'(?<!^\\)&.*;', line):
            badlines.append(line)
    if badlines:
        sys.stderr.write(("Bad lines from %s:\n-----------------\n" % file)
                         + "\n".join(badlines) + "\n-----------------\n")
    # Goes after bad-line check so we don't misinterpret it as an error
    indoc = indoc.replace("@#!#@", "<").replace("#@!@#", ">").replace("#!@!@!#", "&")
    indoc = re.sub("\n+$", "\n", indoc)
    # Single-quote at left margin confuses troff.
    # This program never generates these.
    indoc = indoc.replace("\n'", "\n\\&'")
    # Finish guarding against leading dots.
    indoc = indoc.replace("\n@%@%@", "\n\\&.")
    # Mark these generated pages so people won't hand-hack them.
    indoc = warning + indoc
    indoc = indoc + footerprefix + os.path.basename(file) + "\n.PP"
    return indoc
def main(args, mainout=sys.stdout, mainerr=sys.stderr):
    """Convert the HTML pages named in ARGS into man pages.

    args    -- argv-style list without the program name:
                 -d DIR  read the HTML files from DIR
                 -v      name each file on MAINERR as it is converted, and
                         let unexpected exceptions propagate instead of
                         reporting "internal error"
               followed by the HTML file names
    mainout -- unused; kept for interface compatibility
    mainerr -- stream that receives all of this function's diagnostics
               (makeman()'s own bad-line report still goes to sys.stderr)

    Two passes are made over the files.  The first fills the module-level
    'sectmap' with the manual section of every page, keyed by title, file
    name and <H1> name, so that cross-references can be resolved.  The
    second converts each page and writes it to STEM.SECTION, where STEM is
    the file name up to its first dot (DIR is not prepended to the output
    name).

    Returns None on success, the exception's retval for a LiftException,
    1 if a temporary output file cannot be created, 3 on an I/O error,
    4 on keyboard interrupt and 5 on any other error (unless -v is given).
    getopt errors are not caught here.
    """
    global sectmap
    import getopt
    (options, arguments) = getopt.getopt(args, "vd:")
    dirprefix = ""
    verbosity = 0
    for (switch, val) in options:
        if switch == '-d':      # Set HTML input directory
            dirprefix = val
        elif switch == '-v':    # Enable verbose error reporting
            verbosity += 1

    namere = re.compile("<H1>(.*)</H1>", re.I)
    titlere = re.compile("<TITLE>(.*)</TITLE>", re.I)
    meta = re.compile('(?i)<META NAME="manual_section" CONTENT="([0-9])">')
    hr = re.compile("(?i)<HR>")

    def openinput(file):
        "Open an input page; complain and return None if that fails."
        try:
            return open(os.path.join(dirprefix, file))
        except (IOError, OSError):
            mainerr.write("makeman: can't open %s\n" % file)
            return None

    try:
        # First pass: gather locations for crossreferences:
        sectmap = {}
        names = {}      # file name -> content of that page's <H1>
        for file in arguments:
            infp = openinput(file)
            if infp is None:
                continue
            with infp:
                indoc = infp.read()
            namematch = namere.search(indoc)
            titlematch = titlere.search(indoc)
            if not namematch:
                raise LiftException("name missing from %s" % file)
            if not titlematch:
                raise LiftException("title missing from %s" % file)
            title = titlematch.group(1)
            # The name is the <H1> content.  makeman() looks pages up in
            # sectmap under that name, so it must not be the title again.
            name = namematch.group(1)
            names[file] = name
            match = meta.search(indoc)
            if match:
                section = int(match.group(1))
            else:
                section = 1
            sectmap[title] = sectmap[file] = sectmap[name] = section
            # makeman() discards everything from the first <HR> on, so a
            # second one means content would be lost.
            firsthr = hr.search(indoc)
            if firsthr and hr.search(indoc[firsthr.start(0)+4:]):
                raise LiftException("%s has two <HR> tags!" % file)
        # Second pass: do formatting
        for file in arguments:
            infp = openinput(file)
            if infp is None:
                continue
            with infp:
                indoc = infp.read()
            if verbosity:
                mainerr.write("makeman: %s\n" % file)
            outdoc = makeman(names.get(file), file, indoc)
            if outdoc is None or outdoc == indoc:
                # Nothing was produced or nothing changed: leave no output.
                continue
            # Write to a scratch file first and rename it into place, so a
            # failure cannot leave a truncated man page behind.
            tmppath = "%s.~makeman-%d~" % (file, os.getpid())
            try:
                outfp = open(tmppath, "w")
            except (IOError, OSError):
                mainerr.write("makeman: can't open tempfile %s\n" % tmppath)
                return 1
            try:
                # The file must be closed before the rename: under Windows
                # you can't rename an open file.
                with outfp:
                    outfp.write(outdoc)
            except Exception:
                os.remove(tmppath)
                raise
            # A name without any dot is used whole; find() returning -1
            # must not be allowed to chop off its last character.
            dot = file.find(".")
            if dot >= 0:
                stem = file[:dot]
            else:
                stem = file
            os.rename(tmppath, "%s.%d" % (stem, sectmap[file]))
    except LiftException as e:
        mainerr.write("makeman: " + e.message + "\n")
        return e.retval
    except IOError as e:
        mainerr.write("makeman: file I/O error: %s\n" % e)
        return 3
    except KeyboardInterrupt:
        mainerr.write("makeman: bailing out...\n")
        return 4
    except Exception:
        if verbosity:
            raise
        else:
            mainerr.write("makeman: internal error!\n")
            return 5
if __name__ == "__main__":
    # Script entry point: pass the command-line arguments (without the
    # program name) to main() and exit with the status it returns.
    sys.exit(main(sys.argv[1:]))
# The following sets edit modes for GNU EMACS
# Local Variables:
# mode:python
# End: