# dtdgport.awk
# Reads the XML document named by the first command-line argument and
# prints a DTD for this document to standard output.
# http://saxon.sourceforge.net/dtdgen.html
# JK 2004-10-09
# JK 2006-03-05
# JK 2007-08-05 Converted for portability from the original dtd_generator.awk

# The program makes an internal list of all the elements
# and attributes that appear in your document, noting how
# they are nested, and noting which elements contain
# character data.

# Pass 1: pull every parser event from the file named by ARGV[1] and
# collect statistics about start tags.
#   name[d]     -- element most recently opened at nesting depth d
#   child[p, c] -- how often element c appeared directly inside element p
#   elem[e]     -- how often element e occurred
#   attr[e, a]  -- how often attribute a occurred on element e
BEGIN {
  while (getXMLEVENT(ARGV[1])) {
    # Everything except start tags is irrelevant for the DTD.
    if (XMLEVENT != "STARTELEM")
      continue

    # Record the element at its depth so children can look up their parent.
    name[XMLDEPTH] = XMLSTARTELEM
    if (XMLDEPTH > 1)
      child[name[XMLDEPTH-1], XMLSTARTELEM]++

    # Occurrence counter for the element itself.
    elem[XMLSTARTELEM]++

    # Occurrence counter for each attribute seen on this start tag.
    for (a in XMLATTR)
      attr[XMLSTARTELEM, a]++
  }
}

# Pass 2: emit the declarations, starting at the depth-1 element (the root).
END {
  print_elem(1, name[1])
}

# Print one element (including sub-elements) but only once.
# print_elem(depth, element)
#   Emit the DTD declarations for `element`, then recurse into each of its
#   child elements. Every element is declared at most once.
# Parameters:
#   depth    -- nesting level, used only to indent the output (2 spaces/level)
#   element  -- name of the element to declare
# Globals read:    child[parent, kid], attr[element, attribute], elem[element]
# Globals written: already_printed[element]
# The names after the gap in the parameter list are locals.
function print_elem(depth, element,   c, atn, chl, n, i, myChildren, indent, a) {
  # Guard against declaring the same element twice (and against cycles).
  if (already_printed[element]++)
    return
  indent = sprintf("%*s", 2*depth-2, "")

  # Collect the names of all elements ever seen directly inside this one.
  myChildren = ""
  for (c in child) {
    split(c, chl, SUBSEP)
    if (element == chl[1]) {
      if (myChildren == "")
        myChildren = chl[2]
      else
        myChildren = myChildren " | " chl[2]
    }
  }

  # If an element has no child nodes, declare it as character data only;
  # otherwise declare it as any sequence of its observed children.
  if (myChildren == "")
    print indent "<!ELEMENT", element, "( #PCDATA ) >"
  else
    print indent "<!ELEMENT", element, "(", myChildren, ")* >"

  # After the element itself, list its attributes.
  for (a in attr) {
    split(a, atn, SUBSEP)
    # Treat only those attributes that belong to the current element.
    if (element == atn[1]) {
      # An attribute present on every occurrence of the element is REQUIRED,
      # anything seen less often is IMPLIED (optional).
      if (attr[element, atn[2]] == elem[element])
        print indent "<!ATTLIST", element, atn[2], "CDATA #REQUIRED>"
      else
        print indent "<!ATTLIST", element, atn[2], "CDATA #IMPLIED>"
    }
  }

  # Now go through the child nodes of this element and print them.
  # Turn the "a | b | c" list back into whitespace-separated names.
  gsub(/[\|]/, " ", myChildren)
  n = split(myChildren, chl)
  for (i = 1; i <= n; i++) {
    print_elem(depth+1, chl[i])
    # NOTE(review): chl is re-split after each recursive call; presumably a
    # guard for awk implementations that do not isolate local arrays across
    # recursion -- confirm before removing.
    split(myChildren, chl)
  }
}

##
# getXMLEVENT( file ): # read next xml-data into XMLEVENT,XMLNAME,XMLATTR
#                      # referenced entities are not resolved
# Parameters:
#   file       -- path to xml file
# External variables:
#   XMLEVENT   -- type of item read, e.g. "STARTELEM"(tag), "ENDELEM"(end tag),
#                 "COMMENT"(comment), "CHARDATA"(data)
#   XMLNAME    -- value of item, e.g. tagname if type is "STARTELEM" or "ENDELEM"
#   XMLATTR    -- Map of attributes, only set if XMLEVENT=="STARTELEM"
#   XMLPATH    -- Path to current tag, e.g. /TopLevelTag/SubTag1/SubTag2
#   XMLROW     -- current line number in input file
#   XMLERROR   -- error text, set on parse error
# Returns:
#    1         on successful read: XMLEVENT, XMLNAME, XMLATTR are set accordingly
#    ""        at end of file or parse error, XMLERROR is set on error
# Private Data:
#   _XMLIO     -- buffer, XMLROW, XMLPATH for open files
##

# getXMLEVENT(file): read the next XML item from `file`.
#   Sets XMLEVENT to one of "STARTELEM", "ENDELEM", "CHARDATA", "COMMENT",
#   "STARTDOCT", "ENDDOCT", "CDA", "DEC" or "PROCINST"; XMLNAME to the item's
#   value; XMLSTARTELEM / XMLENDELEM on element start / on any event finished
#   by the generic terminator branch; XMLATTR to the attribute map of a start
#   tag; XMLDEPTH, XMLPATH and XMLROW to the current nesting depth, path and
#   input line. On a start tag the attribute names are also copied into
#   $1..$NF.
#   Returns 1 when an event was read. At end of file or on a parse error the
#   file is closed, its saved state is dropped, XMLERROR is set if something
#   went wrong, and the function falls off the end (empty return value).
#   The unread remainder of the current line and the parser position are kept
#   in _XMLIO[file, ...] between calls.
#   All names after `file` in the parameter list are locals.
function getXMLEVENT( file            ,end,p,q,tag,att,accu,mline,mode,S0,ex,dtd) {
    XMLEVENT=XMLNAME=XMLERROR=XMLSTARTELEM=XMLENDELEM = ""
    split("", XMLATTR)      # portable way to empty an array
    # Restore the per-file state saved by the previous call.
    S0 = _XMLIO[file,"S0"]
    XMLROW = _XMLIO[file,"line"];
    XMLPATH = _XMLIO[file,"path"];
    XMLDEPTH=_XMLIO[file,"depth"]+0;
    dtd = _XMLIO[file,"dtd"];
    while (!XMLEVENT) {
        # Refill the buffer with the next input line when it runs dry.
        if (S0 == "") {
            if (1 != (getline S0 < file))
                break;
            XMLROW ++;
            S0 = S0 RS;     # getline strips the separator; put it back
        }
        if (mode == "") {
            # Start of a new item: decide what it is from its first characters.
            mline = XMLROW
            accu=""
            p = substr(S0,1,1)
            if (p != "<" && !(dtd && p=="]"))
                mode="CHARDATA"
            else if (p == "]") {            # end of the DOCTYPE internal subset
                S0 = substr(S0,2)
                mode="ENDDOCT"
                end=">"
                dtd=0
            } else if ( substr(S0,1,4) == "<!--" ) {
                S0=substr(S0,5)
                mode="COMMENT"
                end="-->"
            } else if ( substr(S0,1,9) == "<!DOCTYPE" ) {
                S0 = substr(S0,10)
                mode = "STARTDOCT"
                end = ">"
            } else if (substr(S0,1,9) == "<![CDATA[" ) {
                S0 = substr(S0,10)
                mode = "CDA"
                end = "]]>"
            } else if ( substr(S0,1,2) == "<!" ) {
                S0 = substr(S0,3)
                mode = "DEC"
                end = ">"
            } else if (substr(S0,1,2) == "<?" ) {
                S0 = substr(S0,3)
                mode = "PROCINST"
                end = "?>"
            } else if ( substr(S0,1,2)=="</" ) {
                S0 = substr(S0,3)
                mode = "ENDELEM"
                end = ">";
                tag = S0
                sub(/[ \n\r\t>].*$/,"",tag)
                S0 = substr(S0,length(tag)+1)
                # Pop the last path component and check it matches this tag.
                ex = XMLPATH
                sub(/\/[^\/]*$/,"",XMLPATH)
                ex = substr(ex, length(XMLPATH)+2)
                if (tag != ex) {
                    XMLERROR = "unexpected close tag <" ex ">..</" tag ">"
                    break
                }
            } else {
                S0 = substr(S0,2)
                mode = "STARTELEM"
                tag = S0
                sub(/[ \n\r\t\/>].*$/,"",tag)
                S0 = substr(S0, length(tag)+1)
                if (tag !~ /^[A-Za-z:_][0-9A-Za-z:_.-]*$/ ) { # /^[[:alpha:]:_][[:alnum:]:_.-]*$/
                    XMLERROR = "invalid tag name '" tag "'"
                    break
                }
                XMLPATH = XMLPATH "/" tag;
            }
        } else if (mode == "CHARDATA") {    # terminated by "<" or EOF
            p = index(S0, "<")
            # Inside a DOCTYPE subset a "]" that comes first also ends the data.
            if (dtd && (q=index(S0,"]")) && (!p || q<p))
                p = q
            if (p) {
                XMLEVENT = "CHARDATA"
                XMLNAME = accu unescapeXML(substr(S0, 1, p-1))
                S0 = substr(S0, p)
                mode = ""
            } else {
                accu = accu unescapeXML(S0)
                S0 = ""
            }
        } else if ( mode == "STARTELEM" ) {
            sub(/^[ \n\r\t]*/,"",S0)
            if (S0 == "")
                continue
            if (substr(S0, 1, 2) == "/>" ) {
                S0 = substr(S0, 3)
                mode = ""
                XMLEVENT = "STARTELEM"
                XMLNAME = XMLSTARTELEM = tag
                XMLDEPTH ++
                # Empty element: queue a synthetic end tag for the next call.
                S0 = "</" tag ">" S0
            } else if (substr(S0, 1, 1) == ">" ) {
                S0 = substr(S0, 2)
                mode = ""
                XMLEVENT = "STARTELEM"
                XMLNAME = XMLSTARTELEM = tag
                XMLDEPTH ++
            } else {
                att = S0
                sub(/[= \n\r\t\/>].*$/,"",att)
                S0 = substr(S0, length(att) + 1)
                mode = "ATTR"
                if (att !~ /^[A-Za-z:_][0-9A-Za-z:_.-]*$/ ) { # /^[[:alpha:]:_][[:alnum:]:_.-]*$/
                    XMLERROR = "invalid attribute name '" att "'"
                    break
                }
            }
        } else if (mode == "ATTR") {
            sub(/^[ \n\r\t]*/, "", S0)
            if (S0 == "")
                continue
            if (substr(S0,1,1) == "=" ) {
                S0 = substr(S0,2)
                mode = "EQ"
            } else {
                # Attribute without "=": its value is its own name.
                XMLATTR[att] = att
                mode = "STARTELEM"
            }
        } else if (mode == "EQ") {
            sub(/^[ \n\r\t]*/,"",S0)
            if (S0 == "")
                continue
            end = substr(S0,1,1)
            if (end == "\"" || end == "'") {
                S0 = substr(S0,2)
                accu = ""
                mode = "VALUE"
            } else {
                # Unquoted value: runs up to whitespace, "/" or ">".
                accu = S0
                sub(/[ \n\r\t\/>].*$/,"", accu)
                S0 = substr(S0, length(accu)+1)
                XMLATTR[att] = unescapeXML(accu)
                mode = "STARTELEM"
            }
        } else if (mode == "VALUE") {       # terminated by end
            if (p = index(S0, end)) {
                XMLATTR[att] = accu unescapeXML(substr(S0,1,p-1))
                S0 = substr(S0, p+length(end))
                mode = "STARTELEM"
            } else {
                accu = accu unescapeXML(S0)
                S0=""
            }
        } else if (mode == "STARTDOCT") {   # terminated by "[" or ">"
            if ((q = index(S0, "[")) && (!(p = index(S0,end)) || q<p)) {
                XMLEVENT = mode
                XMLNAME = accu substr(S0, 1, q-1)
                S0 = substr(S0, q+1)
                mode = ""
                dtd = 1
            } else if (p = index(S0,end)) {
                XMLEVENT = mode
                XMLNAME = accu substr(S0, 1, p-1)
                # No internal subset: fake a "]" so ENDDOCT is reported next.
                S0 = "]" substr(S0, p)
                mode = ""
                dtd = 1
            } else {
                accu = accu S0
                S0 = ""
            }
        } else if (p = index(S0,end)) {     # terminated by end
            XMLEVENT = mode
            XMLNAME = XMLENDELEM = ( mode=="ENDELEM" ? tag : accu substr(S0,1,p-1))
            if (mode=="ENDELEM") XMLDEPTH --
            S0 = substr(S0, p+length(end))
            mode = ""
        } else {
            accu = accu S0
            S0 = ""
        }
    }
    # Save the per-file state for the next call.
    _XMLIO[file, "S0"] = S0;
    _XMLIO[file, "line"] = XMLROW;
    _XMLIO[file, "path"] = XMLPATH;
    _XMLIO[file, "depth"] = XMLDEPTH;
    _XMLIO[file, "dtd"] = dtd;
    # Character data still pending when the input ended.
    # NOTE(review): here the text goes to $0 and XMLNAME is cleared, whereas
    # the in-loop CHARDATA branch puts it in XMLNAME -- confirm this is intended.
    if (mode == "CHARDATA") {
        mode = ""
        if (accu != "")
            XMLEVENT = "CHARDATA"
        XMLNAME = ""
        $0 = accu
    }
    if (XMLEVENT) {
        if (XMLEVENT == "STARTELEM") {
            # Copy attributes into $0.
            NF=0
            for (ex in XMLATTR) {
                NF ++
                $NF = ex
            }
        }
        return 1
    }
    # End of file or parse error: release the file and its saved state.
    close(file);
    delete _XMLIO[file, "S0"];
    delete _XMLIO[file, "line"];
    delete _XMLIO[file, "path"];
    delete _XMLIO[file, "depth"];
    delete _XMLIO[file, "dtd"];
    if (XMLERROR)
        XMLERROR = file ":" XMLROW": " XMLERROR
    else if (mode) XMLERROR=file ":" mline ": " "unterminated " mode
    else if (XMLPATH) XMLERROR=file ":" XMLROW": " "unclosed tag(s) " XMLPATH
} # func. getXMLEVENT

# unescape data and attribute values, used by getXMLEVENT
# unescapeXML(text): return `text` with the five predefined XML entity
# references replaced by the characters they stand for. Other entity and
# character references are left untouched.
function unescapeXML(text) {
    gsub( "&apos;", "'",  text )
    gsub( "&quot;", "\"", text )
    gsub( "&gt;",   ">",  text )
    gsub( "&lt;",   "<",  text )
    # "&amp;" must be handled last so that e.g. "&amp;lt;" becomes "&lt;"
    # rather than "<". In the replacement, "\\&" yields a literal "&"
    # (a bare "&" would stand for the matched text).
    gsub( "&amp;",  "\\&", text )
    return text
}

# Close the XML file and discard the parser state kept for it.
# closeXMLEVENT(file): close `file` and remove every _XMLIO entry stored
# under it, so a later read of the same path starts from scratch.
function closeXMLEVENT(file) {
    close(file)

    # Reader position and buffered input.
    delete _XMLIO[file, "S0"]
    delete _XMLIO[file, "line"]
    delete _XMLIO[file, "path"]
    delete _XMLIO[file, "depth"]
    delete _XMLIO[file, "dtd"]

    # Keys not set anywhere in the visible code; removed as well.
    delete _XMLIO[file, "open"]
    delete _XMLIO[file, "IND"]
}
|