# dtdgport.awk
# Reads the XML document named by the first command-line argument and
# prints a DTD for this document to standard output.
# http://saxon.sourceforge.net/dtdgen.html
# JK 2004-10-09
# JK 2006-03-05
# JK 2007-08-05 Converted for portability from the original dtd_generator.awk

# The program makes an internal list of all the elements
# and attributes that appear in your document, noting how
# they are nested, and noting which elements contain
# character data.

BEGIN {
  # Pull parse events from the file named by the first command-line
  # argument until getXMLEVENT() reports end of file or a parse error.
  # Only STARTELEM events matter here; they fill these global tables:
  #   name[depth]          -- element currently open at that nesting depth
  #   child[parent, elem]  -- how often elem occurred directly inside parent
  #   elem[elem]           -- how often elem occurred at all
  #   attr[elem, attname]  -- how often attname occurred on elem
  while (getXMLEVENT(ARGV[1])) {
    # Remember each element.
    if ( XMLEVENT == "STARTELEM" ) {
      # Remember the parent names of each child node.
      name[XMLDEPTH] = XMLSTARTELEM
      # The root element (depth 1) has no parent to record.
      if (XMLDEPTH>1)
        child[name[XMLDEPTH-1], XMLSTARTELEM] ++
      # Count how often the element occurs.
      elem[XMLSTARTELEM] ++
      # Remember all the attributes with the element.
      for (a in XMLATTR)
        attr[XMLSTARTELEM,a] ++
    }
  }
}

# When awk reaches END, print the DTD collected by the BEGIN rule.
END {
  # name[1] is the element recorded at depth 1, i.e. the root.
  print_elem(1, name[1])
}

# Print one element (including sub-elements) but only once.
function print_elem(depth, element,   c, atn, chl, n, i, myChildren, indent, a) {
  # Print the DTD declarations for one element and, recursively, for the
  # elements that occurred inside it.
  #   depth    -- nesting level, used only for indentation (1 = no indent)
  #   element  -- element name to declare
  # Reads the global tables child[], attr[] and elem[]; uses the global
  # already_printed[] so that every element is declared at most once.
  # All remaining parameters are local variables.
  if (already_printed[element]++)
    return
  # Two spaces of indentation per nesting level below the root.
  indent=sprintf("%*s", 2*depth-2, "")
  # Collect the names of all elements seen directly below this one,
  # joined with " | " so they can be used as a DTD choice list.
  myChildren=""
  for (c in child) {
    split(c, chl, SUBSEP)
    if (element == chl[1]) {
      if (myChildren=="")
        myChildren = chl[2]
      else
        myChildren = myChildren " | " chl[2]
    }
  }
  # If an element has no child nodes, declare it as such.
  if (myChildren=="")
    print indent "<!ELEMENT", element , "( #PCDATA ) >"
  else
    print indent "<!ELEMENT", element , "(", myChildren, ")* >"
  # After the element name itself, list its attributes.
  for (a in attr) {
    split(a, atn, SUBSEP)
    # Treat only those attributes that belong to the current element.
    if (element == atn[1]) {
      # If an attribute occurred each time with its element, notice this.
      if (attr[element, atn[2]] == elem[element])
        print indent "<!ATTLIST", element, atn[2], "CDATA #REQUIRED>"
      else
        print indent "<!ATTLIST", element, atn[2], "CDATA #IMPLIED>"
    }
  }
  # Now go through the child nodes of this element and print them.
  # Turning the "|" separators into blanks lets split() use default
  # whitespace splitting to recover the individual names.
  gsub(/[\|]/, " ", myChildren)
  n=split(myChildren, chl)
  for(i=1; i<=n; i++) {
    print_elem(depth+1, chl[i])
    # Rebuild chl after the recursive call; presumably a safeguard for awk
    # implementations that mishandle local arrays across recursion.
    split(myChildren, chl)
  }
}

##
# getXMLEVENT( file ): # read next xml-data into XMLEVENT,XMLNAME,XMLATTR
#                      # referenced entities are not resolved
# Parameters:
#   file       -- path to xml file
# External variables:
#   XMLEVENT   -- type of item read, e.g. "STARTELEM"(tag), "ENDELEM"(end tag),
#                 "COMMENT"(comment), "CHARDATA"(data)
#   XMLNAME    -- value of item, e.g. tagname if type is "STARTELEM" or "ENDELEM"
#   XMLATTR    -- Map of attributes, only set if XMLEVENT=="STARTELEM"
#   XMLPATH    -- Path to current tag, e.g. /TopLevelTag/SubTag1/SubTag2
#   XMLROW     -- current line number in input file
#   XMLERROR   -- error text, set on parse error
# Returns:
#    1         on successful read: XMLEVENT, XMLNAME, XMLATTR are set accordingly
#    ""        at end of file or parse error, XMLERROR is set on error
# Private Data:
#   _XMLIO     -- buffer, XMLROW, XMLPATH for open files
##

function getXMLEVENT( file            ,end,p,q,tag,att,accu,mline,mode,S0,ex,dtd) {
    # Read the next XML item from `file` and publish it through the globals
    # XMLEVENT, XMLNAME, XMLSTARTELEM, XMLENDELEM, XMLATTR, XMLPATH,
    # XMLDEPTH, XMLROW and XMLERROR.  Returns 1 when an item was read; at
    # end of file or on a parse error it closes the file, sets XMLERROR if
    # something went wrong, and falls off the end (empty return value).
    #
    # Everything after `file` in the parameter list is a local variable:
    #   S0    -- unparsed remainder of the input read so far
    #   mode  -- current scanner state ("" = between items)
    #   end   -- terminator string of the item being scanned
    #   accu  -- text accumulated for an item that spans several lines
    #   mline -- line number on which the current item started
    #   dtd   -- non-zero once a DOCTYPE has been seen and until its
    #            closing "]" has been consumed
    # The per-file state is kept between calls in _XMLIO[file, ...].
    XMLEVENT=XMLNAME=XMLERROR=XMLSTARTELEM=XMLENDELEM = ""
    split("", XMLATTR)
    S0    = _XMLIO[file,"S0"]
    XMLROW  = _XMLIO[file,"line"];
    XMLPATH = _XMLIO[file,"path"];
    XMLDEPTH=_XMLIO[file,"depth"]+0;
    dtd   = _XMLIO[file,"dtd"];
    while (!XMLEVENT) {
        # Refill the buffer one record at a time; the record separator is
        # appended again so that line breaks stay part of the data.
        if (S0 == "") {
            if (1 != (getline S0 < file))
                break;
             XMLROW ++;
             S0 = S0 RS;
        }
        if (mode == "") {
            # Between items: the leading characters decide what comes next.
            mline = XMLROW
            accu=""
            p = substr(S0,1,1)
            if (p != "<" && !(dtd && p=="]"))
                mode="CHARDATA"
            else if (p == "]") {
                S0 = substr(S0,2)
                mode="ENDDOCT"
                end=">"
                dtd=0
            } else if ( substr(S0,1,4) == "<!--" ) {
                S0=substr(S0,5)
                mode="COMMENT"
                end="-->"
            } else if ( substr(S0,1,9) == "<!DOCTYPE" ) {
                S0 = substr(S0,10)
                mode = "STARTDOCT"
                end  = ">"
            } else if (substr(S0,1,9) == "<![CDATA[" ) {
                S0 = substr(S0,10)
                mode = "CDA"
                end = "]]>"
            } else if ( substr(S0,1,2) == "<!" ) {
                S0 = substr(S0,3)
                mode = "DEC"
                end = ">"
            } else if (substr(S0,1,2) == "<?" ) {
                S0 = substr(S0,3)
                mode = "PROCINST"
                end = "?>"
            } else if ( substr(S0,1,2)=="</" ) {
                S0 = substr(S0,3)
                mode = "ENDELEM"
                end = ">";
                tag = S0
                sub(/[ \n\r\t>].*$/,"",tag)
                S0 = substr(S0,length(tag)+1)
                # Pop the last component off XMLPATH; it must match the
                # name in the close tag.
                ex = XMLPATH
                sub(/\/[^\/]*$/,"",XMLPATH)
                ex = substr(ex, length(XMLPATH)+2)
                if (tag != ex) {
                    XMLERROR = "unexpected close tag <" ex ">..</" tag ">"
                    break
                }
            } else {
                S0 = substr(S0,2)
                mode = "STARTELEM"
                tag = S0
                sub(/[ \n\r\t\/>].*$/,"",tag)
                S0 = substr(S0, length(tag)+1)
                if (tag !~ /^[A-Za-z:_][0-9A-Za-z:_.-]*$/ ) { # /^[[:alpha:]:_][[:alnum:]:_.-]*$/
                    XMLERROR = "invalid tag name '" tag "'"
                    break
                }
                XMLPATH = XMLPATH "/" tag;
            }
        } else if (mode == "CHARDATA") {                            # terminated by "<" or EOF
            p = index(S0, "<")
            # Inside a DOCTYPE's internal subset a "]" also ends the data
            # if it comes before the next "<".
            if (dtd && (q=index(S0,"]")) && (!p || q<p))
                p = q
            if (p) {
                XMLEVENT = "CHARDATA"
                XMLNAME = accu unescapeXML(substr(S0, 1, p-1))
                S0 = substr(S0, p)
                mode = ""
            } else {
                accu = accu unescapeXML(S0)
                S0 = ""
            }
        } else if ( mode == "STARTELEM" ) {
            # Inside a start tag, after the name or after an attribute.
            sub(/^[ \n\r\t]*/,"",S0)
            if (S0 == "")
                continue
            if (substr(S0, 1, 2) == "/>" ) {
                S0 = substr(S0, 3)
                mode = ""
                XMLEVENT = "STARTELEM"
                XMLNAME = XMLSTARTELEM = tag
                XMLDEPTH ++
                # Push a synthetic close tag back into the buffer so the
                # next call reports the matching ENDELEM for an empty element.
                S0 = "</" tag ">" S0
            } else if (substr(S0, 1, 1) == ">" ) {
                S0 = substr(S0, 2)
                mode = ""
                XMLEVENT = "STARTELEM"
                XMLNAME = XMLSTARTELEM = tag
                XMLDEPTH ++
            } else {
                att = S0
                sub(/[= \n\r\t\/>].*$/,"",att)
                S0 = substr(S0, length(att) + 1)
                mode = "ATTR"
                if (att !~ /^[A-Za-z:_][0-9A-Za-z:_.-]*$/ ) { # /^[[:alpha:]:_][[:alnum:]:_.-]*$/
                    XMLERROR = "invalid attribute name '" att "'"
                    break
                }
            }
        } else if (mode == "ATTR") {
            # After an attribute name: either "=" follows or the attribute
            # has no value, in which case its own name is stored as value.
            sub(/^[ \n\r\t]*/, "", S0)
            if (S0 == "")
                continue
            if (substr(S0,1,1) == "=" ) {
                S0 = substr(S0,2)
                mode = "EQ"
            } else {
                XMLATTR[att] = att
                mode = "STARTELEM"
            }
        } else if (mode == "EQ") {
            # After "=": a quoted value (the quote character becomes the
            # terminator) or an unquoted value ending at whitespace, "/" or ">".
            sub(/^[ \n\r\t]*/,"",S0)
            if (S0 == "")
              continue
            end = substr(S0,1,1)
            if (end == "\"" || end == "'") {
                S0 = substr(S0,2)
                accu = ""
                mode = "VALUE"
            } else {
                accu = S0
                sub(/[ \n\r\t\/>].*$/,"", accu)
                S0 = substr(S0, length(accu)+1)
                XMLATTR[att] = unescapeXML(accu)
                mode = "STARTELEM"
            }
        } else if (mode == "VALUE") {                          # terminated by end
            if (p = index(S0, end)) {
                XMLATTR[att] = accu unescapeXML(substr(S0,1,p-1))
                S0 = substr(S0, p+length(end))
                mode = "STARTELEM"
            } else {
                accu = accu unescapeXML(S0)
                S0=""
            }
        } else if (mode == "STARTDOCT") {                      # terminated by "[" or ">"
            if ((q = index(S0, "[")) && (!(p = index(S0,end)) || q<p)) {
                # An internal subset follows; it is scanned with dtd set.
                XMLEVENT = mode
                XMLNAME = accu substr(S0, 1, q-1)
                S0 = substr(S0, q+1)
                mode = ""
                dtd = 1
            } else if (p = index(S0,end)) {
                # No internal subset: insert a "]" in front of the ">" so
                # the next call still produces an ENDDOCT event.
                XMLEVENT = mode
                XMLNAME = accu substr(S0, 1, p-1)
                S0 = "]" substr(S0, p)
                mode = ""
                dtd = 1
            } else {
                accu = accu S0
                S0 = ""
            }
        } else if (p = index(S0,end)) {  # terminated by end
            # Every remaining mode simply runs up to its terminator; the
            # mode name itself becomes the event type.
            XMLEVENT = mode
            XMLNAME = XMLENDELEM = ( mode=="ENDELEM" ? tag : accu substr(S0,1,p-1))
            if (mode=="ENDELEM") XMLDEPTH --
            S0 = substr(S0, p+length(end))
            mode = ""
        } else {
            accu = accu S0
            S0 = ""
        }
    }
    # Save the scanner state for the next call on this file.
    _XMLIO[file, "S0"]   = S0;
    _XMLIO[file, "line"] = XMLROW;
    _XMLIO[file, "path"] = XMLPATH;
    _XMLIO[file, "depth"] = XMLDEPTH;
    _XMLIO[file, "dtd"]  = dtd;
    # Character data still pending when the input ran out: report what was
    # accumulated (if anything) and place the text in $0.
    if (mode == "CHARDATA") {
        mode = ""
        if (accu != "")
            XMLEVENT = "CHARDATA"
        XMLNAME = ""
        $0 = accu
    }
    if (XMLEVENT) {
        if (XMLEVENT == "STARTELEM") {
            # Copy attributes into $0.
            NF=0
            for (ex in XMLATTR) {
                NF ++
                $NF = ex
            }
        }
        return 1
    }
    # No event: end of file or parse error.  Release the file and its state.
    close(file);
    delete _XMLIO[file, "S0"];
    delete _XMLIO[file, "line"];
    delete _XMLIO[file, "path"];
    delete _XMLIO[file, "depth"];
    delete _XMLIO[file, "dtd"];
    if (XMLERROR)
        XMLERROR = file ":" XMLROW": " XMLERROR
    else if (mode) XMLERROR=file ":" mline ": " "unterminated " mode
    else if (XMLPATH) XMLERROR=file ":" XMLROW": "  "unclosed tag(s) " XMLPATH
} # func. getXMLEVENT

# Unescape data and attribute values; used by getXMLEVENT.
function unescapeXML(text) {
    # Replace the five predefined XML entity references in `text` with the
    # characters they stand for and return the result.  Numeric character
    # references and user-defined entities are left untouched.
    gsub( "&apos;", "'",  text )
    gsub( "&quot;", "\"", text )
    gsub( "&gt;",   ">",  text )
    gsub( "&lt;",   "<",  text )
    # "&amp;" must be handled last so that e.g. "&amp;lt;" yields "&lt;"
    # rather than "<".  In a gsub replacement a bare "&" means "the matched
    # text", so a literal ampersand has to be written as "\\&".
    gsub( "&amp;",  "\\&",  text)
    return text
}

# Close an XML file and discard its saved parser state.
function closeXMLEVENT(file) {
    # Close `file` and remove every _XMLIO entry kept for it, so that a
    # later getXMLEVENT(file) starts again from the beginning of the file.
    close(file);
    delete _XMLIO[file,"S0"]
    delete _XMLIO[file,"line"]
    delete _XMLIO[file,"path"];
    delete _XMLIO[file,"depth"];
    delete _XMLIO[file,"dtd"]
    delete _XMLIO[file,"open"]
    delete _XMLIO[file,"IND"]
}