From d27c7ec42518604f6b8c0ef08a01c99b5bb7efca Mon Sep 17 00:00:00 2001 From: Packit Date: Sep 16 2020 14:19:19 +0000 Subject: perl-libxml-perl-0.08 base --- diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..4783fe1 --- /dev/null +++ b/ChangeLog @@ -0,0 +1,284 @@ +2003-10-21 Ken MacLeod + + * t/stream.t: fixed test 11 for Perl >= 5.6 + +2001-07-23 Ken MacLeod + + * lib/XML/SAX2Perl.pm (startElement): typo; reported by + mhershb@mcdermott.com (Mark A. Hershberger) + +2000-03-30 Ken MacLeod + + * doc/index.html (Contributors): added Clark Cooper + + * MANIFEST (doc/sax-2.0.html, doc/sax-2.0-adv.html): added + +2000-03-20 Ken MacLeod + + * lib/Data/Grove/Visitor.pm (_children_accept_name): add return + @return; reported by Laurent CAPRANI + +2000-03-07 Ken MacLeod + + * doc/sax-2.0.html, doc/sax-2.0-adv.html: added + +2000-03-02 Ken MacLeod + + * lib/XML/ESISParser.pm: add -E0 to NSGMLS_FLAGS to not limit the + number of errors reported; suggested by Charles Thayer + + (parse_fh): report line and line number on command character + errors; also suggested by Charles + +2000-02-22 Ken MacLeod + + * doc/index.html (Contributors): added Michael Koehne, KangChan + Lee, and Colin Muller + + * doc/mirror.sh, doc/index.html: added + +2000-02-17 Ken MacLeod + + * doc/modules.xml: fixed several well-formedness errors; reported + by KangChan Lee + +1999-12-22 Ken MacLeod + + * lib/Data/Grove.pm, lib/Data/Grove/Parent.pm, + lib/Data/Grove/Visitor.pm, lib/XML/Handler/XMLWriter.pm, + lib/XML/Handler/CanonXMLWriter.pm, lib/XML/Handler/Subs.pm, + lib/XML/SAX2Perl.pm, lib/XML/Perl2SAX.pm, lib/XML/ESISParser.pm, + lib/XML/Parser/PerlSAX.pm, lib/XML/PatAct/Amsterdam.pm, + lib/XML/PatAct/MatchName.pm, lib/XML/PatAct/ToObjects.pm: added + $VERSION + + * lib/XML/Parser/PerlSAX.pm (_handle_start): support + UseAttributeOrder option + (_handle_attlist): Changed EntityName to ElementName (re 9/28 + entry) + +1999-09-28 Ken MacLeod + + * lib/XML/Parser/PerlSAX.pm 
(_handle_attlist): typo: was calling + entity_decl + +1999-09-09 Ken MacLeod + + * lib/XML/Parser/PerlSAX.pm: add start_cdata, end_cdata, and + entity_reference events + +1999-08-28 Ken MacLeod + + * lib/XML/PatAct/Amsterdam.pm: added Output and AsString options, + added support for attribute replacement + + * t/amsterdam.t: added + +1999-08-18 Ken MacLeod + + * lib/Data/Grove.pm: added Data::Grove::Characters + + * lib/XML/ESISParser.pm (parse_fh): report newline as characters + if no record_end() handler + + * lib/XML/PatAct/ToObjects.pm (_parse_action): removed debugging + statement + +1999-08-16 Ken MacLeod + + * README: updated + + * doc/modules.xml (libxml-perl): updated + + * doc/PerlSAX.pod (Parameters): missing '>' + + * release 0.05 + + * lib/XML/Parser/PerlSAX.pm (_handle_init): call set_document_locator + + * lib/XML/PatAct/ActionTempl.pm, lib/XML/PatAct/Amsterdam.pm, + lib/XML/PatAct/MatchName.pm, lib/XML/PatAct/PatternTempl.pm (new): + Accept both key, value pairs and hash options + + * lib/XML/PatAct/ToObjects.pm (new): + + * lib/XML/Handler/Subs.pm: added + + * t/subs.t: added + + * t/stream.t: added + +1999-08-15 Ken MacLeod + + * lib/XML/Handler/XMLWriter.pm: added + + * lib/XML/Handler/Sample.pm: Placed in public domain + +1999-08-14 Ken MacLeod + + * doc/PerlSAX.pod: added an introduction, a ``Deviations from the Java version'' section, added `set_document_locator()' handler method + + * lib/XML/PatAct/ToObjects.pm: add CopyAttributes option, add + -grove-contents action + +1999-08-12 Ken MacLeod + + * lib/XML/ESISParser.pm (parse_fh): dynamically test event handler + existance + + * lib/XML/Parser/PerlSAX.pm (parse): wasn't capturing XML::Parser + Element events + +1999-08-10 Ken MacLeod + + * README, doc/modules.xml: updated with PatAct modules + + * lib/XML/PatAct/ActionTempl.pm, lib/XML/PatAct/Amsterdam.pm, + lib/XML/PatAct/MatchName.pm, lib/XML/PatAct/PatternTempl.pm, + lib/XML/PatAct/ToObjects.pm: added + + * t/xp_sax.t, 
t/canon_xml_writer.t: added CVS ID + + * t/schema.t: added + + * examples/schema.xml, examples/schema.pl: added + + * doc/UsingPatActModules.pod, doc/CreatingPatActModules.pod: added + + * lib/XML/Parser/PerlSAX.pm (_handle_extern_ent): change "Perl + SAX" to "PerlSAX" in doc + +1999-08-09 Ken MacLeod + + * lib/XML/ESISParser.pm (parse_fh): was not passing an empty hash + + * lib/XML/Parser/PerlSAX.pm (_handle_init, _handle_final): was not + passing an empty hash + +1999-05-26 Ken MacLeod + + * lib/XML/Handler/CanonXMLWriter.pm, t/canon_xml_writer.t: added + +1999-05-23 Ken MacLeod + + * lib/Data/Grove/Tied.pm: renamed to Parent.pm + + * README (DOCUMENTS): added + renamed libxml to libxml-perl + + * libxml.spec: renamed libxml-perl.spec + +1999-05-17 Ken MacLeod + + * libxml.spec: files in `doc/' go into top-dir of /usr/doc/$PKG + + * PerlSAX.pod: moved to doc/PerlSAX.pod + +1999-05-09 Ken MacLeod + + * doc/modules.xml: added + +1999-05-08 Ken MacLeod + + * doc/UsingPerlSAX.pod, examples/MyHandler.pm, + examples/myhandler.pl, examples/myhandler.xml: added + +1999-05-07 Ken MacLeod + + * lib/XML/ESISParser.pm, lib/Data/Grove.pm, + lib/XML/Handler/Sample.pm: added POD + +1999-05-06 Ken MacLeod + + * lib/Data/Grove/Visitor.pm: remove XML::Grove extensions and make + generic + + * lib/XML/Parser/SAXPerl.pm: renamed PerlSAX.pm + + * lib/XML/Handler/Sample.pm: added + + * examples/perlsax-test.pl: added + + * examples/esis-test.pl: updated for new XML::ESISParser, moved + handler (Receiver) to XML::Handler::Sample, added command line + option for SGML + +1999-04-30 Ken MacLeod + + * Makefile.PL: added PREREQ_PM for XML::Parser + +1999-04-15 Ken MacLeod + + * lib/Data/Grove/Visitor.pm (accept): change XML:: to Data:: + + * lib/Data/Grove.pm (new): %{ shift } was being read as %shift + +1999-02-18 Ken MacLeod + + * lib/Data/Grove/Visitor.pm: was XML::Grove::Visitor + + * lib/Data/Grove/Tied.pm: was XML::Grove::Node + + * lib/Data/Grove.pm: created from XML::Grove + 
+1999-02-15 Ken MacLeod + + * lib/XML/Parser/SAXPerl.pm (parse): add comments + + * lib/XML/ESISParser.pm: major changes for support of both XML and + SGML, and ongoing Perl SAX updates + + * SAX.pod (end_document): noted that the return value of + end_document() is the return value of parse() + + * README: added reference to FAQ, added module statuses, more + cleary described ESISParser, require Perl 5.005 + +1999-02-13 Ken MacLeod + + * lib/XML/ESISParser.pm: start move to Perl SAX + +1999-02-12 Ken MacLeod + + * lib/XML/SAX2Perl.pm, lib/XML/Perl2SAX.pm, lib/XML/ESISParser.pm: + update to new Perl SAX + + * lib/XML/Parser/SAXPerl.pm (new): allow hash or key/value pairs + +1999-02-12 Ken MacLeod + + * interface-style.pod: note still undecided items + + * lib/XML/Parser/SAXPerl.pm: fixes shown by xp_sax.t + + * t/xp_sax.t: added + + * lib/XML/Parser/SAXPerl.pm: added pod + many changes for Perl SAX and XML::Parser::Expat + +1999-02-11 Ken MacLeod + + * SAX.pod: suggestions from Eric Prud'hommeaux and Enno Derksen + + * interface-style.pod: suggestions from Larry Wall + +1999-02-01 Ken MacLeod + + * MANIFEST: updated + + * lib/XML/Parser/SAXPerl.pm: modified more towards Perl SAX + + * SAX.pod: added + +1999-01-31 Ken MacLeod + + * interface-style.pod: added + +1998-12-10 Ken MacLeod + + * lib/XML/Parser/SAXPerl.pm: added + +1998-12-08 Ken MacLeod + + * MANIFEST: added + diff --git a/Changes b/Changes new file mode 100644 index 0000000..93e79d8 --- /dev/null +++ b/Changes @@ -0,0 +1,113 @@ +Revision history for Perl extension libxml + +Backwards incompatible changes are marked with a `*'. 
+ +ToDo + - XML::ESISParser: include Robert Braddock's update for OpenSP, + in email 25Jul + - XML::Parser::PerlSAX doesn't pass ParseParamEnt to + XML::Parser, inspired by a request by Paul Mahoney + + - switch Data::Grove::Visitor to use UNIVERSAL::can instead of + $self->{'has'}, suggested by Mike Richardson + + - no modules are yet supporting SAX2 + - XML::Parser::PerlSAX doesn't implement ErrorHandler, it + should at least call fatal_error() if XML::Parser dies; + reported by Craig N. Caroon + +0.08 Tue Oct 21 10:54:18 CDT 2003 + - added Perl SAX 2.0 Binding + - XML::ESISParser: add -E0 to nsgmls options so that nsgmls + doesn't quit after 200 errors. Add more detail to command + character error message. Suggested by Charles Thayer + . + - fixes + - Data::Grove::Visitor: children_accept_name was not + returning any data in some cases; reported by Laurent + CAPRANI + - XML::SAX2Perl: typo in startElement; reported by Mark + A. Hershberger + - t/stream.t Test 11 fails due to 8-bit characters on Perl + 5.6, first reported by Ed Arnold + +0.07 Tue Feb 22 14:24:52 CST 2000 + - doc/index.html: libxml-perl site index + - doc/mirror.sh: creates a libxml-perl mirror site + - fixes + - all modules: release script didn't insert version numbers + in Perl modules. Reported by Enno Derksen + - doc/modules.xml: well-formedness errors. Reported by + KangChan Lee + +0.06 Wed Dec 22 15:14:39 CST 1999 + - all modules: add $VERSION. Suggested by Michael Koehne + + - XML::Parser::PerlSAX: add UseAttributeOrder option and + AttributeOrder and Defaulted properties to start_element() + handler. Suggested by Enno Derksen + - XML::Parser::PerlSAX: add start_cdata, end_cdata, and + entity_reference events + - XML::PatAct::Amsterdam: added Output and AsString options, + added support for replacing attributes + - Data::Grove: add a Data::Grove::Characters class to act as a + default grove object for containing characters. 
+ - fixes + - XML::PatAct::ToObjects: removed leftover debugging statement + - XML::ESISParser: report record end as characters if no + record_end() handler + - XML::Parser::PerlSAX: For attribute list declarations, now + correctly calls the attlist_decl() method and passes the + ElementName property, it used to call entity_decl() + passing EntityName. Reported by Enno Derksen + and Colin Muller + +0.05 Mon Aug 16 11:02:32 CDT 1999 + - Major update to PerlSAX.pod + - added an introduction + - added a ``Deviations from the Java version'' section + * re-added the `set_document_locator()' handler method + - added arguments to method synopses + - attributed most of the content to the SAX 1.0 JavaDoc + - minor typos + - XML::Handler::XMLWriter: a new PerlSAX handler for writing + readable XML (in contrast to Canonical XML) + - XML::Handler::Subs: a new PerlSAX handler base class for + calling user-defined subs + - XML::Handler::Sample: this is a template for creating + PerlSAX handlers, it is now in the Public Domain + - XML::PatAct::ToObjects: add CopyAttributes option, add + -grove-contents option + - all PatAct modules can now take parameters as either a list + of key, value pairs or a hash + - fixes + - XML::ESISParser wasn't testing handlers for what methods + they support + - XML::Parser::PerlSAX wasn't capturing XML::Parser Element + events + +0.04 Wed Aug 11 10:03:00 CDT 1999 + - README: updated with PatAct modules + - added Creating PatAct Modules and Using PatAct Modules docs + - added XML::PatAct::ActionTempl, XML::PatAct::Amsterdam, + XML::PatAct::MatchName, XML::PatAct::PatternTempl, + XML::PatAct::ToObjects + - added schema.pl and schema.xml examples + - added schema.t test + - fixes + - XML::Parser::PerlSAX and XML::ESISParser were not passing + a hash for start_document() or end_document() per spec + - t/canon_xml_writer.t, t/xp_sax.t: added CVS ID + +0.03 Wed May 26 19:49:46 CDT 1999 + - added XML::Handler::CanonXMLWriter and test + +0.02 Mon May 24 
18:02:00 CDT 1999 + - renamed package from `libxml' to `libxml-perl' + - added doc/modules.xml + - added doc/UsingPerlSAX.pod and example files + - moved PerlSAX.pod and interface-style.pod to `doc/' + - renamed Data::Grove::Tied to Data::Grove::Parent + +0.01 Fri May 7 14:59:07 CDT 1999 + - original version diff --git a/MANIFEST b/MANIFEST new file mode 100644 index 0000000..3cfe5b1 --- /dev/null +++ b/MANIFEST @@ -0,0 +1,46 @@ +ChangeLog +Changes +MANIFEST +Makefile.PL +README +libxml-perl.spec +libxml-perl-0.08.spec +doc/CreatingPatActModules.pod +doc/PerlSAX.pod +doc/UsingPatActModules.pod +doc/UsingPerlSAX.pod +doc/index.html +doc/interface-style.pod +doc/mirror.sh +doc/modules.xml +doc/sax-2.0.html +doc/sax-2.0-adv.html +lib/Data/Grove.pm +lib/Data/Grove/Parent.pm +lib/Data/Grove/Visitor.pm +lib/XML/ESISParser.pm +lib/XML/Perl2SAX.pm +lib/XML/SAX2Perl.pm +lib/XML/Handler/CanonXMLWriter.pm +lib/XML/Handler/Sample.pm +lib/XML/Handler/Subs.pm +lib/XML/Handler/XMLWriter.pm +lib/XML/Parser/PerlSAX.pm +lib/XML/PatAct/ActionTempl.pm +lib/XML/PatAct/Amsterdam.pm +lib/XML/PatAct/MatchName.pm +lib/XML/PatAct/PatternTempl.pm +lib/XML/PatAct/ToObjects.pm +examples/MyHandler.pm +examples/esis-test.pl +examples/myhandler.pl +examples/myhandler.xml +examples/perlsax-test.pl +examples/schema.pl +examples/schema.xml +t/amsterdam.t +t/canon_xml_writer.t +t/schema.t +t/stream.t +t/subs.t +t/xp_sax.t diff --git a/Makefile.PL b/Makefile.PL new file mode 100644 index 0000000..2bee5dd --- /dev/null +++ b/Makefile.PL @@ -0,0 +1,20 @@ +# +# Copyright (C) 1998 Ken MacLeod +# This library is free software; you can redistribute it and/or modify +# it under the same terms as Perl itself. +# +# $Id: Makefile.PL,v 1.3 1999/05/24 23:25:02 kmacleod Exp $ +# + +use ExtUtils::MakeMaker; + +$VERSION = '0.08'; + +# See lib/ExtUtils/MakeMaker.pm for details of how to influence +# the contents of the Makefile that is written. 
+WriteMakefile( + 'NAME' => 'libxml-perl', + 'VERSION_FROM' => 'Makefile.PL', + 'PREREQ_PM' => { 'XML::Parser' => '2.19' }, + dist => {'COMPRESS' => 'gzip', 'SUFFIX' => '.gz'}, +); diff --git a/README b/README new file mode 100644 index 0000000..5ba4fec --- /dev/null +++ b/README @@ -0,0 +1,171 @@ +$Id: README,v 1.10 2003/10/21 16:01:54 kmacleod Exp $ + + + libxml-perl + + Collection of Perl modules for working with XML. + + +INTRODUCTION + + libxml-perl is a collection of smaller Perl modules, scripts, and + documents for working with XML in Perl. libxml-perl software + works in combination with XML::Parser, PerlSAX, XML::DOM, + XML::Grove and others. + + See the file Changes for user-visible changes and ChangeLog for + detailed changes. See the `examples' directory for examples. POD + style documentation is included in all non-alpha modules and + scripts. You should also be able to use the 'perldoc' utility to + extract documentation from the module files directly. HTML + formatted docs are available at the libxml-perl home page + . + + Newer versions of this module can be found on CPAN at + . To join the + Perl-XML mailing list, send an email message to + ListManager@ActiveState.com with the following text in the body: + Subscribe Perl-XML + + View the Perl XML FAQ at + . + + Copyright (C) 1998 Ken MacLeod and others + This library is free software; you can redistribute it and/or + modify it under the same terms as Perl itself. + + +MODULES + + The following modules are marked with their release status: + + STABLE -- has been in use for a while with few or no outstanding + bugs + BETA -- interfaces are stable but there may still be bugs + ALPHA -- interfaces are changing, there may be lots of bugs, and + there may not be docs available yet + + XML::Parser::PerlSAX STABLE + XML::Parser::PerlSAX is a PerlSAX parser using XML::Parser + (which uses James Clark's Expat XML Parser). 
+ + XML::Handler::Sample STABLE + XML::Handler::Sample is a PerlSAX handler that simply prints + out the event names as they are parsed by a PerlSAX parser. + It can be used for debugging or as a template for building new + handlers. XML::Handler::Sample contains handlers for all + known parser events. + + XML::ESISParser STABLE + XML::ESISParser is a validating PerlSAX parser using James + Clark's `nsgmls' SGML/XML Parser. ESISParser supports both + XML and SGML document instances. Unless you need validation, + you should probably be using XML::Parser::PerlSAX or + XML::Parser. + + XML::ESISParser with XML::Grove obsolete the + SGML::SPGroveBuilder and SGML::Grove modules. + + XML::Handler::XMLWriter STABLE + A PerlSAX handler for writing readable XML (in contrast to + Canonical XML, for example). XMLWriter is also subclassable + and supports calling start and end methods by element-names + (subclassed from XML::Handler::Subs). XMLWriter is similar to + XML::Parser's Stream style. + + XML::Handler::Subs STABLE + A PerlSAX handler base class that calls start and end methods + by element-names. Subs is similar to XML::Parser's Subs + style. + + XML::Handler::CanonXMLWriter STABLE + A PerlSAX handler that outputs in Canonical XML + . This module is + generally only used for debugging. + + Data::Grove STABLE + Data::Grove::Parent STABLE + Data::Grove::Visitor STABLE + Data::Grove and its helpers provide a base class for deeply + nested or directed graph structures. Used by XML::Grove (and + others soon). + + XML::SAX2Perl ALPHA + XML::Perl2SAX ALPHA + SAX2Perl and Perl2SAX are SAX Parser<->DocumentHandler + filters. These modules translate parse events between the + Java/CORBA style SAX methods and PerlSAX style methods. + + XML::PatAct::MatchName ALPHA + MatchName is a pattern matching module that can be used with + PatAct action modules. MatchName uses simple element names or + element name lists to match names to actions. 
+ + XML::PatAct::ToObjects ALPHA + ToObjects is a PatAct action module. ToObjects can be used to + create application-ready Perl objects from XML instances. + + XML::PatAct::Amsterdam ALPHA + Amsterdam is a PatAct action module. Amsterdam can be used to + apply a very simple form of style-sheet to an XML instance by + using ``before'' and ``after'' strings that are output before + and after the contents of elements. + + XML::PatAct::PatternTempl BETA + XML::PatAct::ActionTempl BETA + PatternTempl and ActionTempl are template files that + pattern/action module writers can copy to create new modules. + See Creating PatAct Modules for more information. + +DOCUMENTS + + sax-2.0.html, sax-2.0-adv.html + PerlSAX 2.0 bindings. Maintained by Robin Berjon and the + XML-Perl mailing list. + + PerlSAX + This document defines a Perl binding to SAX 1.0. PerlSAX- + based parser modules implement and possibly extend the + interface described in PerlSAX. + + UsingPerlSAX + A brief introduction to PerlSAX using the XML::Parser::PerlSAX + module. + + UsingPatActModules + Describes how to use pattern/action modules to transform XML + instances. + + CreatingPatActModules + A document for module writers who are writing new pattern/ + action modules. + + modules.xml + modules.xml contains a listing of all Perl XML packages and + their public modules categorized by several topics. + + +INSTALLATION + + In order to use this package you will need Perl version 5.005 or + better. Several other modules may also be required to use some + modules in libxml-perl, including XML::Parser, XML::DOM, and + XML::Grove. These are all available in the XML module directory + on CPAN. 
+ + + + You install libxml-perl, as you would install any perl module + library, by running these commands: + + perl Makefile.PL + make + make test + make install + + If you want to install a private copy of libxml-perl in your home + directory, then you should try to produce the initial Makefile + with something like this command: + + perl Makefile.PL PREFIX=~/perl + diff --git a/doc/CreatingPatActModules.pod b/doc/CreatingPatActModules.pod new file mode 100644 index 0000000..6eac7fc --- /dev/null +++ b/doc/CreatingPatActModules.pod @@ -0,0 +1,76 @@ +=head1 Creating PatAct Modules + +This document is targeted towards the module writer creating a new +pattern or action module or readers who want to understand what is +going on inside a pattern or action module. If you are only +interested in using PatAct modules, please see ``Using PatAct +Modules.'' + +There are two types of modules involved in processing a pattern-action +list: the pattern module and the action module. Pattern modules are +created by users and passed to the `new()' method of action modules, +otherwise all pattern module methods are used only by the action +module. Action modules are PerlSAX handlers (see PerlSAX.pod in +libxml-perl). Action modules are responsible for initializing the +pattern module, receiving PerlSAX events, calling the `match()' method +in the pattern module for each element, and applying actions for +matching elements. + +The interface the user uses to call the drivers is described in +``Using PatAct Modules''. + +In general, the pattern-action modules perform their work on an +element-by-element basis, but the action modules are called with +PerlSAX events for all parse events (characters, processing +instructions, etc.). 
+ +=head1 Pattern Modules + +Pattern modules have this interface, where PATTERN is the pattern or +query implementation: + + use XML::PatAct::PATTERN; + + $matcher = XML::PatAct::PATTERN->new(Patterns => $patterns [, OPTIONS]); + $matcher->initialize($actor); + $index = $matcher->match($element, $names, $nodes); + $matcher->finalize(); + +A pattern module instance is created with the pattern list that will +be used for processing as well as any additional options a pattern +module may define. `$patterns' is the original array reference passed +in by the user to the action module, so it is made up of pairs of +PATTERN => ACTION. The pattern matcher should ignore the ACTION +items. + +`initialize()' is called before any calls to `match()'. `$actor' is +the action module that is calling the pattern module. `initialize()' +is normally called from the `start_document()' PerlSAX event. + +`match()' performs a single matching against the pattern list and +returns the index of the matching pattern or undef if no pattern +matches. `$element' is the element to match. `$names' and `$nodes' +are array references containing the names and nodes (hashes) of this +element and all parent elements up to the element where processing +started. + +`finalize()' is called at the end of processing and may be used to +release state information. `finalize()' is normally called from the +`end_document()' PerlSAX event. + +Here is a template for creating a pattern module: + +@include ../lib/XML/PatAct/PatternTempl.pm + +=head1 Action Modules + +Action modules are PerlSAX handlers (see PerlSAX.pod in libxml-perl). +Action modules are responsible for initializing the pattern module, +receiving PerlSAX events, calling the `match()' method in the pattern +module for each element, and applying actions for matching elements. +Action modules must also maintain arrays of element names and element +nodes to be passed to the `match()' method. 
+ +Here is a template for creating an action module: + +@include ../lib/XML/PatAct/ActionTempl.pm diff --git a/doc/PerlSAX.pod b/doc/PerlSAX.pod new file mode 100644 index 0000000..9e6170d --- /dev/null +++ b/doc/PerlSAX.pod @@ -0,0 +1,643 @@ +=head1 SAX for Perl + +=head2 What is SAX? + +SAX (Simple API for XML) is a common parser interface for XML +parsers. It allows application writers to write applications that use +XML parsers, but are independent of which parser is actually used. + +This document describes a version of SAX used by Perl modules. The +original version of SAX, for Java, is described at +. + +There are two basic interfaces in the Perl version of SAX, the parser +interface and the handler interface. The parser interface creates new +parser instances, initiates parsing, and provides additional +information to handlers on request. The handler interface is used to +receive parse events from the parser. + +=head2 Deviations from the Java version + +=over 4 + +=item * + +Takes parameters to `C<new()>' instead of using `set*' calls. + +=item * + +Allows a default Handler parameter to be used for all handlers. + +=item * + +No base classes are implemented. Instead, parsers dynamically check +the handlers for what methods they support. + +=item * + +The AttributeList, InputSource, and SAXException classes have been +replaced by anonymous hashes. + +=item * + +Handlers are passed a hash containing properties as an argument in +place of positional arguments. + +=item * + +`C<parse()>' returns the value returned by calling the +`C<end_document()>' handler. + +=item * + +Method names have been converted to lower-case with underscores. +Parameters are all mixed case with initial upper-case. + +=back + +=head1 Parser Interface + +SAX parsers are reusable but not re-entrant: the application may reuse +a parser object (possibly with a different input source) once the +first parse has completed successfully, but it may not invoke the +`C<parse()>' methods recursively within a parse. 
+ +Parser objects contain the following options. A new or different +handler option may be provided in the middle of a parse, and the SAX +parser must begin using the new handler immediately. The `C<Locale>' +option must not be changed in the middle of a parse. If an +application does not provide a handler for a particular set of events, +those events will be silently ignored unless otherwise stated. If an +`C<EntityResolver>' is not provided, the parser will resolve system +identifiers and open connections to entities itself. + + Handler default handler to receive events + DocumentHandler handler to receive document events + DTDHandler handler to receive DTD events + ErrorHandler handler to receive error events + EntityResolver handler to resolve entities + Locale locale to provide localisation for errors + +If no handlers are provided then all events will be silently ignored, +except for `C<fatal_error()>' which will cause a `C<die()>' to be +called after calling `C<end_document()>'. + +All handler methods are called with a single hash argument containing +the parameters for that method. `C<Parser>' methods can be called with +a hash or a list of key-value pairs containing the parameters. + +All SAX parsers must implement this basic interface: it allows +applications to provide handlers for different types of events and to +initiate a parse from a URI, a byte stream, or a character stream. + +=over 4 + +=item new( I<OPTIONS> ) + +Creates a Parser that will be used to parse XML sources. Any +parameters passed to `C<new()>' will be used for subsequent parses. +I<OPTIONS> may be a list of key, value pairs or a hash. + +=item parse( I<OPTIONS> ) + +Parse an XML document. + +The application can use this method to instruct the SAX parser to +begin parsing an XML document from any valid input source (a character +stream, a byte stream, or a URI). I<OPTIONS> may be a list of key, +value pairs or a hash. I<OPTIONS> passed to `C<parse()>' override +options given when the parser instance was created with `C<new()>'. 
+ +Applications may not invoke this method while a parse is in progress +(they should create a new Parser instead for each additional XML +document). Once a parse is complete, an application may reuse the same +Parser object, possibly with a different input source. + +`C<parse()>' returns the result of calling the handler method +`C<end_document()>'. + +A `C<Source>' parameter must have been provided to either the +`C<parse()>' or `C<new()>' methods. The `C<Source>' parameter is a +hash containing the following parameters: + +=over 4 + +=item PublicId + +The public identifier for this input source. + +The public identifier is always optional: if the application writer +includes one, it will be provided as part of the location information. + +=item SystemId + +The system identifier for this input source. + +The system identifier is optional if there is a byte stream, a +character stream, or a string, but it is still useful to provide one, +since the application can use it to resolve relative URIs and can +include it in error messages and warnings (the parser will attempt to +open a connection to the URI only if there is no byte stream or +character stream specified). + +If the application knows the character encoding of the object pointed +to by the system identifier, it can provide the encoding using the +`C<Encoding>' parameter. + +If the system ID is a URL, it must be fully resolved. + +=item String + +A scalar value containing XML text to be parsed. + +The SAX parser will ignore this if there is also a byte or character +stream, but it will use a string in preference to opening a URI +connection. + +=item ByteStream + +The byte stream (file handle) for this input source. + +The SAX parser will ignore this if there is also a character stream +specified, but it will use a byte stream in preference to opening a +URI connection itself or using `C<String>'. + +If the application knows the character encoding of the byte stream, it +should set it with the `C<Encoding>' parameter. 
+ +=item CharacterStream + +FOR FUTURE USE ONLY -- Perl does not currently support any character +streams, only use the `C<String>', `C<ByteStream>', or `C<SystemId>' +parameters. + +The character stream (file handle) for this input source. + +If there is a character stream specified, the SAX parser will ignore +any byte stream and will not attempt to open a URI connection to the +system identifier. + +=item Encoding + +The character encoding, if known. + +The encoding must be a string acceptable for an XML encoding +declaration (see section 4.3.3 of the XML 1.0 recommendation). + +This parameter has no effect when the application provides a character +stream. + +=back + +=back + +=head2 Locator + +Interface for associating a SAX event with a document location. + +If a SAX parser provides location information to the SAX application, +it does so by implementing the following methods and then calling the +`C<set_document_locator()>' handler method. The handler can use the +object to obtain the location of any other document handler event in +the XML source document. + +Note that the results returned by the object will be valid only during +the scope of each document handler method: the application will +receive unpredictable results if it attempts to use the locator at any +other time. + +SAX parsers are not required to supply a locator, but they are very +strongly encouraged to do so. + +=over 4 + +=item location() + +Return the location information for the current event. + +Returns a hash containing the following parameters: + + ColumnNumber The column number, or undef if none is available. + LineNumber The line number, or undef if none is available. + PublicId A string containing the public identifier, or undef if + none is available. + SystemId A string containing the system identifier, or undef if + none is available. 
+ +=back + +=head1 Handler Interfaces + +SAX handler methods are grouped into four interfaces: the document +handler for receiving normal document events, the DTD handler for +receiving notation and unparsed entity events, the error handler for +receiving errors and warnings, and the entity resolver for redirecting +external system identifiers. + +The application may choose to implement each interface in one package +or in separate packages, as long as the objects provided as parameters +to the parser provide the matching interface. + +Parsers may implement additional methods in each of these categories, +refer to the parser documentation for further information. + +All handlers are called with a single hash argument containing the +parameters for that handler. + +Application writers who do not want to implement the entire interface +can leave those methods undefined. Events whose handler methods are +undefined will be ignored unless otherwise stated. + +=head2 DocumentHandler + +This is the main interface that most SAX applications implement: if +the application needs to be informed of basic parsing events, it +implements this interface and provides an instance with the SAX parser +using the `C<DocumentHandler>' parameter. The parser uses the instance +to report basic document-related events like the start and end of +elements and character data. + +The order of events in this interface is very important, and mirrors +the order of information in the document itself. For example, all of +an element's content (character data, processing instructions, and/or +subelements) will appear, in order, between the `C<start_element()>' +event and the corresponding `C<end_element()>' event. + +The application can find the location of any event using the Locator +interface supplied by the Parser through the +`C<set_document_locator()>' method. + +=over 4 + +=item set_document_locator( { Locator => $locator } ) + +Receive an object for locating the origin of SAX document events. 
+ +SAX parsers are strongly encouraged (though not absolutely required) +to supply a locator: if it does so, it must supply the locator to the +application by invoking this method before invoking any of the other +methods in the DocumentHandler interface. + +The locator allows the application to determine the end position of +any document-related event, even if the parser is not reporting an +error. Typically, the application will use this information for +reporting its own errors (such as character content that does not +match an application's business rules). The information returned by +the locator is probably not sufficient for use with a search engine. + +Note that the locator will return correct information only during the +invocation of the events in this interface. The application should not +attempt to use it at any other time. + +Parameters: + + Locator An object that can return the location of any SAX document + event. + +=item start_document( { } ) + +Receive notification of the beginning of a document. + +The SAX parser will invoke this method only once, before any other +methods in this interface or in DTDHandler. + +=item end_document( { } ) + +Receive notification of the end of a document, no parameters are +passed for the end of a document. + +The SAX parser will invoke this method only once, and it will be the +last method invoked during the parse. The parser shall not invoke +this method until it has either abandoned parsing (because of an +unrecoverable error) or reached the end of input. + +The value returned by calling `C' will be the value +returned by `C'. + +=item start_element( { Name => $name, Attributes => $attributes } ) + +Receive notification of the beginning of an element. + +The Parser will invoke this method at the beginning of every element +in the XML document; there will be a corresponding `C' +event for every `C' event (even when the element is +empty). 
All of the element's content will be reported, in order, +before the corresponding `C' event. + +If the element name has a namespace prefix, the prefix will still be +attached. Note that the attribute list provided will contain only +attributes with explicit values (specified or defaulted): #IMPLIED +attributes will be omitted. + +Parameters: + + Name The element type name. + Attributes The attributes attached to the element, if any. + +=item end_element( { Name => $name } ) + +Receive notification of the end of an element. + +The SAX parser will invoke this method at the end of every element in +the XML document; there will be a corresponding `C' +event for every `C' event (even when the element is +empty). + +If the element name has a namespace prefix, the prefix will still be +attached to the name. + +Parameters: + + Name The element type name. + +=item characters( { Data => $characters } ) + +Receive notification of character data. + +The Parser will call this method to report each chunk of character +data. SAX parsers may return all contiguous character data in a +single chunk, or they may split it into several chunks; however, all +of the characters in any single event must come from the same external +entity, so that the Locator provides useful information. + +Note that some parsers will report whitespace using the +`C' method rather than this one (validating +parsers must do so). + +Parameters: + + Data The characters from the XML document. + +=item ignorable_whitespace( { Data => $whitespace } ) + +Receive notification of ignorable whitespace in element content. + +Validating Parsers must use this method to report each chunk of +ignorable whitespace (see the W3C XML 1.0 recommendation, section +2.10): non-validating parsers may also use this method if they are +capable of parsing and using content models. 
+ +SAX parsers may return all contiguous whitespace in a single chunk, or +they may split it into several chunks; however, all of the characters +in any single event must come from the same external entity, so that +the Locator provides useful information. + +The application must not attempt to read from the array outside of the +specified range. + + Data The characters from the XML document. + +=item processing_instruction ( { Target => $target, Data => $data } ) + +Receive notification of a processing instruction. + +The Parser will invoke this method once for each processing +instruction found: note that processing instructions may occur before +or after the main document element. + +A SAX parser should never report an XML declaration (XML 1.0, section +2.8) or a text declaration (XML 1.0, section 4.3.1) using this method. + +Parameters: + + Target The processing instruction target. + Data The processing instruction data, if any. + +=back + +=head2 ErrorHandler + +Basic interface for SAX error handlers. + +If a SAX application needs to implement customized error handling, it +must implement this interface and then provide an instance to the SAX +parser using the parser's `C' parameter. The parser +will then report all errors and warnings through this interface. + +The parser shall use this interface instead of throwing an exception: +it is up to the application whether to throw an exception for +different types of errors and warnings. Note, however, that there is +no requirement that the parser continue to provide useful information +after a call to `C' (in other words, a SAX driver class +could catch an exception and report a fatalError). + +All error handlers receive the following I. The +`C', `C', `C', and `C' +are provided only if the parser has that information available. + + Message The error or warning message, or undef to use the message + from the `C' parameter + PublicId The public identifier of the entity that generated the + error or warning. 
+ SystemId The system identifier of the entity that generated the + error or warning. + LineNumber The line number of the end of the text that caused the + error or warning. + ColumnNumber The column number of the end of the text that caused the + error or warning. + EvalError The error value returned from a lower level interface. + +Application writers who do not want to implement the entire interface +can leave those methods undefined. If not defined, calls to the +`C' and `C' handlers will be ignored and +processing will be terminated (going straight to `C') +after the call to `C'. + +=over 4 + +=item warning( { I } ) + +Receive notification of a warning. + +SAX parsers will use this method to report conditions that are not +errors or fatal errors as defined by the XML 1.0 recommendation. The +default behaviour is to take no action. + +The SAX parser must continue to provide normal parsing events after +invoking this method: it should still be possible for the application +to process the document through to the end. + +=item error( { I } ) + +Receive notification of a recoverable error. + +This corresponds to the definition of "error" in section 1.2 of the +W3C XML 1.0 Recommendation. For example, a validating parser would use +this callback to report the violation of a validity constraint. The +default behaviour is to take no action. + +The SAX parser must continue to provide normal parsing events after +invoking this method: it should still be possible for the application +to process the document through to the end. If the application cannot +do so, then the parser should report a fatal error even if the XML 1.0 +recommendation does not require it to do so. + +=item fatal_error( { I } ) + +Receive notification of a non-recoverable error. + +This corresponds to the definition of "fatal error" in section 1.2 of +the W3C XML 1.0 Recommendation. For example, a parser would use this +callback to report the violation of a well-formedness constraint. 
+ +The application must assume that the document is unusable after the +parser has invoked this method, and should continue (if at all) only +for the sake of collecting additional error messages: in fact, SAX +parsers are free to stop reporting any other events once this method +has been invoked. + +=back + +=head2 DTDHandler + +Receive notification of basic DTD-related events. + +If a SAX application needs information about notations and unparsed +entities, then the application implements this interface and provides +an instance to the SAX parser using the parser's `C' +parameter. The parser uses the instance to report notation and +unparsed entity declarations to the application. + +The SAX parser may report these events in any order, regardless of the +order in which the notations and unparsed entities were declared; +however, all DTD events must be reported after the document handler's +`C' event, and before the first `C' +event. + +It is up to the application to store the information for future use +(perhaps in a hash table or object tree). If the application +encounters attributes of type "NOTATION", "ENTITY", or "ENTITIES", it +can use the information that it obtained through this interface to +find the entity and/or notation corresponding with the attribute +value. + +Application writers who do not want to implement the entire interface +can leave those methods undefined. Events whose handler methods are +undefined will be ignored. + +=over 4 + +=item notation_decl( { I } ) + +Receive notification of a notation declaration event. + +It is up to the application to record the notation for later +reference, if necessary. + +If a system identifier is present, and it is a URL, the SAX parser +must resolve it fully before passing it to the application. + +I: + + Name The notation name. + PublicId The notation's public identifier, or undef if none was given. + SystemId The notation's system identifier, or undef if none was given. 
+ +=item unparsed_entity_decl( { I } ) + +Receive notification of an unparsed entity declaration event. + +Note that the notation name corresponds to a notation reported by the +`C' event. It is up to the application to record the +entity for later reference, if necessary. + +If the system identifier is a URL, the parser must resolve it fully +before passing it to the application. + +I: + + Name The unparsed entity's name. + PublicId The entity's public identifier, or undef if none was given. + SystemId The entity's system identifier (it must always have one). + NotationName The name of the associated notation. + +=back + +=head2 EntityResolver + +Basic interface for resolving entities. + +If a SAX application needs to implement customized handling for +external entities, it must implement this interface and provide an +instance with the SAX parser using the parser's `C' +parameter. + +The parser will then allow the application to intercept any external +entities (including the external DTD subset and external parameter +entities, if any) before including them. + +Many SAX applications will not need to implement this interface, but +it will be especially useful for applications that build XML documents +from databases or other specialised input sources, or for applications +that use URI types other than URLs. + +The application can also use this interface to redirect system +identifiers to local URIs or to look up replacements in a catalog +(possibly by using the public identifier). + +=over 4 + +=item resolve_entity( { PublicId => $public_id, SystemId => $system_id } ) + +Allow the application to resolve external entities. 
+ +The Parser will call this method before opening any external entity +except the top-level document entity (including the external DTD +subset, external entities referenced within the DTD, and external +entities referenced within the document element): the application may +request that the parser resolve the entity itself, that it use an +alternative URI, or that it use an entirely different input source. + +Application writers can use this method to redirect external system +identifiers to secure and/or local URIs, to look up public identifiers +in a catalogue, or to read an entity from a database or other input +source (including, for example, a dialog box). + +If the system identifier is a URL, the SAX parser must resolve it +fully before reporting it to the application. + +Parameters: + + PublicId The public identifier of the external entity being + referenced, or undef if none was supplied. + SystemId The system identifier of the external entity being + referenced. + +`C' returns undef to request that the parser open a +regular URI connection to the system identifier or returns a hash +containing the same parameters as the `C' parameter to +Parser's `C' method, summarized here: + + PublicId The public identifier of the external entity being + referenced, or undef if none was supplied. + SystemId The system identifier of the external entity being + referenced. + String String containing XML text + ByteStream An open file handle. + CharacterStream + An open file handle. + Encoding The character encoding, if known. + +See Parser's `C' method for complete details on how these +parameters interact. + +=back + +=head1 Contributors + +SAX was developed collaboratively by +the members of the XML-DEV mailing list. Please see the ``SAX History +and Contributors'' page for the people who did the real work behind +SAX. Much of the content of this document was copied from the SAX 1.0 +Java Implementation documentation. 
+ +The SAX for Python specification was helpful in creating this +specification. + + +Thanks to the following people who contributed to Perl SAX. + + Eduard (Enno) Derksen + Ken MacLeod + Eric Prud'hommeaux + Larry Wall diff --git a/doc/UsingPatActModules.pod b/doc/UsingPatActModules.pod new file mode 100644 index 0000000..475dc32 --- /dev/null +++ b/doc/UsingPatActModules.pod @@ -0,0 +1,75 @@ +=head1 Using PatAct Modules + +This document is targeted towards people who want to write scripts or +modules that use pattern and action modules. If you want to create a +new pattern or action module, please see ``Creating PatAct Modules.'' + +You would want to use pattern/action modules if you want to apply a +complex set of patterns or queries against an XML instance and perform +actions associated with those patterns or queries. To be able to use +pattern/action modules you will need a pattern-matching module that +supports the format of the pattern or query language you can use and +an action module that will perform the types of actions you need to +perform. 
+ +Available pattern-matching modules are: + + XML::PatAct:: + ::MatchName Simple element name, element hierarchy matching + +Available action modules are: + + XML::PatAct:: + ::ToObjects Convert XML instances into Perl objects + ::Amsterdam Simplistic style-sheet using before/after strings + +Using pattern/action modules involves loading the modules, creating a +pattern/action list, creating instances of the pattern and matching +modules, and then starting a parse using the matching module as a +handler: + + use XML::Parser::PerlSAX; + use XML::PatAct::MatchName; + use XML::PatAct::ToObjects; + + my $patterns = [ + 'schema' => [ qw{ -holder } ], + 'table' => [ qw{ -make Schema::Table } ], + 'name' => [ qw{ -field Name -as-string } ], + ]; + + my $matcher = XML::PatAct::MatchName->new( Patterns => $patterns ); + my $handler = XML::PatAct::ToObjects->new( Patterns => $patterns, + Matcher => $matcher); + + my $parser = XML::Parser::PerlSAX->new( Handler => $handler ); + my $schema = $parser->parse(Source => { SystemId => $ARGV[0] } ); + +The example above uses the MatchName and ToObjects pattern and action +modules. The pattern list contains pairs of patterns and actions in +the format specified by MatchName and ToObjects, other modules will +use other formats. The patterns that MatchName supports are a simple +element name or a hierarchy of element names. The actions that +ToObjects supports describe how to create Perl objects from the XML +instances. + +The $matcher object is an instance of XML::PatAct::MatchName. +$matcher is created and associated with the pattern/action list that +will be matched against. The $handler object is an instance of +XML::PatAct::ToObjects. $handler is created and associated with the +pattern/action list to be matched against as well as the pattern +matching instance $matcher. + +$handler is a PerlSAX event handler. XML::Parser::PerlSAX is used as +the source of XML events. 
Other PerlSAX event generators include +XML::Grove::PerlSAX and XML::ESISParser. $parser is created with the +$handler object as its Handler. + +The `parse()' method of $parser is called to run the handler (the +matching object) to produce the output from XML::PatAct::ToObjects, +which is a Perl object converted from XML, $schema. + +The above example is an abbreviated version. A complete example of +usage of the MatchName and ToObjects modules, including source XML, is +in the documentation for the XML::PatAct::ToObjects module. The +script and source XML are also in the examples directory. diff --git a/doc/UsingPerlSAX.pod b/doc/UsingPerlSAX.pod new file mode 100644 index 0000000..b0ac2ac --- /dev/null +++ b/doc/UsingPerlSAX.pod @@ -0,0 +1,87 @@ +=head1 Using PerlSAX + +Working with PerlSAX involves using two classes (packages), a PerlSAX +parser that generates parsing events and a class that you write that +will receive those parsing events, the ``handler''. This guide will +use the XML::Parser::PerlSAX parser that uses Clark Cooper's +XML::Parser module. + +The handler class implements the PerlSAX handler methods that you are +interested in. 
The following example, MyHandler.pm, prints a message +every time an element starts or ends: + + package MyHandler; + + sub new { + my ($type) = @_; + return bless {}, $type; + } + + sub start_element { + my ($self, $element) = @_; + + print "Start element: $element->{Name}\n"; + } + + sub end_element { + my ($self, $element) = @_; + + print "End element: $element->{Name}\n"; + } + + 1; + +To use your handler you will need to have a script, myhandler.pl, that +loads and creates your handler and the parser, and then calls the +parser to parse the XML instance and send events to your handler: + + use XML::Parser::PerlSAX; + use MyHandler; + + my $my_handler = MyHandler->new; + my $parser = XML::Parser::PerlSAX->new( Handler => $my_handler ); + + foreach my $instance (@ARGV) { + $parser->parse(Source => { SystemId => $instance }); + } + +Given this XML instance, myhandler.xml: + + + +
+ Using PerlSAX + Working with PerlSAX ... +
+ +Running myhandler.pl like this: + + perl myhandler.pl myhandler.xml + +will produce this output: + + Start element: article + Start element: title + End element: title + Start element: paragraph + End element: paragraph + End element: article + +=head2 For More Information + +PerlSAX.pod describes the PerlSAX interface. Each parser module +describes its individual capabilities. XML::Parser::PerlSAX is the +most commonly used PerlSAX implementation. + +The files described in this doc are in the `examples' directory. A +more complete implementation of the very simple handler above is in +the module XML::Handler::Sample. Other, more complex handlers are in +the XML::Handler directory as well. + +Another hands-on doc for PerlSAX is the XML-Parser-and-PerlSAX.pod. +This doc describes the difference between and the purpose of PerlSAX +with respect to XML::Parser. + +This document was inspired by and uses the code examples from David +Megginson's ``Quick Start for SAX Application Writers.'' + diff --git a/doc/index.html b/doc/index.html new file mode 100644 index 0000000..3f35885 --- /dev/null +++ b/doc/index.html @@ -0,0 +1,307 @@ + + + libxml-perl + + + + + +
libxml-perl
+Current version is @VERSION@    
+
+libxml-perl is a collection of Perl modules, +scripts, and documents for working with XML in Perl. libxml-perl +software works in combination with XML::Parser, PerlSAX, XML::DOM, +XML::Grove, and others.
+ +

Questions about how to use this library should be directed to the +comp.lang.perl.modules USENET Newsgroup. Bug reports and +suggestions for improvements can be sent to the +<perl-xml@activestate.com> mailing list. This mailing list is +also the place for general discussions and development of the +libxml-perl package.

+ +

To join the Perl-XML mailing list, send an email message to +ListManager@ActiveState.com with the following text in the body: +

+    Subscribe Perl-XML
+

+ + + + + +
+ + +Source
+libxml-perl source is available on CPAN in the XML module +directory. This link goes through the CPAN redirector so if the +site gives you any problems, just click it again and you will be +redirected to a different site.

+ +libxml-perl source is also available here.

+ +The libxml-perl-@VERSION@ README

+ + +Modules
+ +The following modules are part of libxml-perl. Below they are marked with their release status:

+ + + + + +
    STABLEhas been in use for a while with few or no outstanding bugs
    BETAinterfaces are stable but there may still be bugs
    ALPHAinterfaces are changing, there may be lots of bugs, and there may not be docs available yet
+ +XML::Parser::PerlSAX
+    BETA
+XML::Parser::PerlSAX is a PerlSAX parser using XML::Parser (which uses James Clark's Expat XML Parser).

+ +XML::Handler::XMLWriter
+    BETA
+A PerlSAX handler for writing readable XML (in contrast to Canonical +XML, for example). XMLWriter is also subclassable and supports +calling start and end methods by element-names (subclassed from +XML::Handler::Subs). XMLWriter is similar to XML::Parser's Stream +style.

+ +XML::Handler::Subs
+    BETA
+A PerlSAX handler base class that calls start and end methods by +element-names. Subs is similar to XML::Parser's Subs style.

+ +XML::Handler::CanonXMLWriter
+    BETA
+XML::Handler::CanonXMLWriter is a PerlSAX handler that outputs in Canonical +XML.

+ +XML::Handler::Sample
+    BETA
+XML::Handler::Sample is a PerlSAX handler that simply prints out the +event names as they are parsed by a PerlSAX parser. It can be used for +debugging or as a template for building new handlers. +XML::Handler::Sample contains handlers for all known parser events.

+ +XML::ESISParser
+    BETA
+XML::ESISParser is a validating PerlSAX parser using James Clark's +`nsgmls' SGML/XML Parser. ESISParser supports both XML and SGML +document instances. Unless you need validation, you should probably +be using XML::Parser::PerlSAX or XML::Parser.

+ +XML::ESISParser with XML::Grove obsolete the SGML::SPGroveBuilder and SGML::Grove modules.

+ +Data::Grove, Data::Grove::Parent, Data::Grove::Visitor
+    BETA
+Data::Grove and its helpers provide a base class for deeply nested or +directed graph structures. Used by XML::Grove (and others soon).

+ +XML::SAX2Perl, XML::Perl2SAX
+    ALPHA
+SAX2Perl and Perl2SAX are SAX Parser<->DocumentHandler filters. These +modules translate parse events between the Java/CORBA style SAX +methods and PerlSAX style methods.

+ +The following modules will very likely be renamed in the next release. + +XML::PatAct::MatchName
+    ALPHA
+MatchName is a pattern matching module that can be used with PatAct +action modules. MatchName uses simple element names or element name +lists to match names to actions.

+ +XML::PatAct::ToObjects
+    ALPHA
+ToObjects is a PatAct action module. ToObjects can be used to create +application-ready Perl objects from XML instances.

+ +XML::PatAct::Amsterdam
+    ALPHA
+Amsterdam is a PatAct action module. Amsterdam can be used to apply a +very simple form of style-sheet to an XML instance by using ``before'' +and ``after'' strings that are output before and after the contents of +elements.

+ +XML::PatAct::PatternTempl, XML::PatAct::ActionTempl
+    BETA
+PatternTempl and ActionTempl are template files that pattern/action +module writers can copy to create new modules. See Creating PatAct +Modules for more information.

+ + +Documents
+ +PerlSAX
+This document defines a Perl binding to SAX 1.0. PerlSAX-based parser +modules implement and possibly extend the interface described in +PerlSAX.

+ +Using PerlSAX
+UsingPerlSAX is a brief introduction to PerlSAX using the +XML::Parser::PerlSAX module.

+ +Using PatAct Modules
+Describes how to use pattern/action modules to transform XML +instances.

+ +Creating PatAct Modules
+A document for module writers who are writing new pattern/action +modules.

+ +modules.xml
+modules.xml contains a listing of all Perl XML packages and their +public modules categorized by several topics.

+ +
+ +News
+ + +libxml-perl-0.07
+ +February 22, 2000
+ + + + + +
-doc/index.html: libxml-perl site index
-doc/mirror.sh: creates a libxml-perl mirror site
-Fixes: + + + +
-all modules: release script didn't +insert version numbers in Perl modules. Reported by Enno Derksen
-doc/modules.xml: well-formedness errors. Reported by KangChan Lee
+
+
+ + +libxml-perl-0.06
+ +February 4, 2000
+ + + + + + + + +
-all modules: add $VERSION. Suggested +by Michael Koehne
-XML::Parser::PerlSAX: add +UseAttributeOrder option and AttributeOrder and Defaulted properties +to start_element() handler. Suggested by Enno Derksen
-XML::Parser::PerlSAX: add start_cdata, +end_cdata, and entity_reference events
-XML::PatAct::Amsterdam: added Output +and AsString options, added support for replacing attributes
-Data::Grove: add a +Data::Grove::Characters class to act as a default grove object for +containing characters.
-Fixes: + + + + +
-XML::PatAct::ToObjects: removed +leftover debugging statement
-XML::ESISParser: report record end as +characters if no record_end() handler
-XML::Parser::PerlSAX: For attribute +list declarations, now correctly calls the attlist_decl() method and +passes the ElementName property, it used to call entity_decl() passing +EntityName. Reported by Enno +Derksen and Colin +Muller
+
+
+ + +libxml-perl-0.05
+ +August 16, 1999
+ + + + + + + + +
-Major update to PerlSAX specification + + + + + + + +
-Added an introduction
-Added a ``Deviations from the Java version'' section
-Re-added the `set_document_locator()' handler method
-Added arguments to method synopses
-Attributed most of the content to the SAX 1.0 JavaDoc
-Minor typos
+
-XML::Handler::XMLWriter: a new PerlSAX handler for writing readable XML (in contrast to Canonical XML)
-XML::Handler::Subs: a new PerlSAX handler base class for calling user-defined subs
-XML::PatAct::ToObjects: add CopyAttributes option, add -grove-contents action
-All PatAct modules can now take parameters as either a list of key, value pairs or a hash
-Fixes: + + + +
-XML::ESISParser wasn't testing handlers for what methods they support
-XML::Parser::PerlSAX wasn't forwarding XML::Parser Element events
+
+
+ + +libxml-perl-0.04
+ +August 11, 1999
+ + + + + +
-Added pattern/action modules for name matching, converting to objects, and applying simple styles -- XML::PatAct::MatchName, XML::PatAct::ToObjects, and XML::PatAct::Amsterdam.
-Added ``Using PatAct Modules'' and ``Creating PatActModules'' docs.
-XML::Parser::PerlSAX and XML::ESISParser were not passing a hash for `start_document()' and `end_document()' per spec.
+
+ + +libxml-perl-0.03
+ +May 26, 1999
+ + + +
-added XML::Handler::CanonXMLWriter and test
+
+ + +libxml-perl-0.02
+ +May 24, 1999
+ + + + + + + +
-renamed package from `libxml' to `libxml-perl'
-added doc/modules.xml
-added doc/UsingPerlSAX.pod and example files
-moved PerlSAX.pod and interface-style.pod to `doc/'
-renamed Data::Grove::Tied to Data::Grove::Parent
+
+ + +
+ + +Contributors
+The following have shared their code, documents, comments, and/or suggestions for libxml-perl:

+ +Clark Cooper
+Eduard (Enno) Derksen
+Michael Koehne
+KangChan Lee
+Ken MacLeod
+Colin Muller
+Eric Prud'hommeaux
+Larry Wall
+ + + diff --git a/doc/interface-style.pod b/doc/interface-style.pod new file mode 100644 index 0000000..aca1047 --- /dev/null +++ b/doc/interface-style.pod @@ -0,0 +1,79 @@ +=head1 Priorities + +Larry Wall suggests, ``In the absence of other considerations, I'd +encourage you to provide the cleanest interface from the user's +standpoint, and let the implementer worry about the details.'' + +=head1 Naming + +B + +All method names use lower-case, `C<_>' separated names. + +B + +All method names match their Java counterparts. + +All options, parameters, and property names use mixed case names, with +an initial upper case character. This eliminates a certain amount of +potential confusion with reserved words, which, for the most part, are +lower case. + +The following words are abbreviated in method names and parameters: + + Declaration decl Decl + Reference ref Ref + Identifier id Id + +=head1 Object Instantiation and Options + +For creating new parser or handler objects, the `new' methods accept a +list of key-value pairs (C<=E>) or a hash containing the options. +The key names are derived from the SAX positional parameter names +(`C' and `C' in Parser's `C') or the name of +option setting methods (`C', `C', +`C', `C', and `C' in Parser). + +Callers may get and set options directly in the object, for example: + + $parser = SAX::Parser->new( Source => { ByteStream => $fh }, + DocumentHandler => $doc_handler ); + + $parser->{Locale} = 'el_GR.ISO-8859-7'; + +There are no set/get methods in the Perl SAX API. + +=head1 Handler Calls + +Handler calls all take hashes instead of positional parameters. Key +names are derived from SAX positional parameter names. This allows +parsers and filters to provide additional parameters if they can or +the user requests it. + +=head1 Extending Handler Interfaces + +Developers of event-generators can extend the handler interface as +they need to. 
Event-generators that use an extended interface should +accept generator options or use `C' to test whether a handler can +support their extended interface. + +For example, a C that wants to receive internal +entity events instead of having them resolved and passed in to the +`C' method would define a `C' method +and/or set a parser option to pass or not pass internal entity events. + +=head1 Helper Classes + +Perl SAX avoids helper classes (like SAXException and InputSource) +where those classes only hold information and have no behavior. In +those cases, simple hashes are used instead. + +B if these should be implemented anyway for +easier portability. + +=head1 Contributors + + Eduard (Enno) Derksen + Ken MacLeod + Eric Prud'hommeaux + Larry Wall diff --git a/doc/mirror.sh b/doc/mirror.sh new file mode 100644 index 0000000..124a489 --- /dev/null +++ b/doc/mirror.sh @@ -0,0 +1,69 @@ +#! /bin/sh +# +# NAME +# mirror -- update web page with a libxml-perl release +# +# SYNOPSIS +usage="mirror RELEASE DESTDIR" +# +# DESCRIPTION +# `mirror' creates a web mirror using a libxml-perl release tar +# file. +# +# `mirror' pulls files from the tar file to create the web page. +# `mirror' searches HTML files for the string @VERSION@ and +# replaces it with RELEASE. `mirror' searches for all *.pm and +# *.pod files and converts them to HTML. It also copies a few +# hardcoded files. +# +# `mirror' installs the web pages in DESTDIR. +# +# CAUTION: `mirror' removes the contents of DESTDIR before +# copying files to it. 
+# +# AUTHOR +# Ken MacLeod +# +# $Id: mirror.sh,v 1.2 2000/02/22 21:02:56 kmacleod Exp $ +# + +PWD_CMD="/bin/pwd" +SED="sed" +TR="/usr/bin/tr" + +if [ $# != 2 ]; then + echo "usage: $usage" + exit 1 +fi + +RELEASE="$1" +DESTDIR="$2" + +set -e +set -x + +rm -rf $DESTDIR +mkdir -p $DESTDIR + +cp libxml-perl-${RELEASE}.tar.gz $DESTDIR + +cd $DESTDIR + +tar xzvf libxml-perl-${RELEASE}.tar.gz + +for ii in libxml-perl-${RELEASE}/doc/*.html; do + $SED <$ii >`basename $ii` \ + -e "s/@VERSION@/$RELEASE/g" +done +for ii in `cd libxml-perl-${RELEASE}/doc; echo *.pod`; do + pod2html libxml-perl-${RELEASE}/doc/$ii >`basename $ii .pod`.html +done +for ii in `cd libxml-perl-${RELEASE}/lib; echo */*.pm */*/*.pm`; do + dstfile=`echo $ii | sed -e 's|/|::|g'` + pod2html libxml-perl-${RELEASE}/lib/$ii >`basename $dstfile .pm`.html +done + +mv libxml-perl-${RELEASE}/README libxml-perl-${RELEASE}.readme +mv libxml-perl-${RELEASE}/doc/modules.xml . + +rm -rf libxml-perl-${RELEASE} pod2html-dircache pod2html-itemcache diff --git a/doc/modules.xml b/doc/modules.xml new file mode 100644 index 0000000..d2e5026 --- /dev/null +++ b/doc/modules.xml @@ -0,0 +1,298 @@ + + + + + Apache-MimeXML + + Apache-MimeXML + XML + + + + + CGI-Formalware + + CGI::Formalware + XML + XML::Parser + + + + + CGI-XML + + CGI::XML + XML::Parser + XML Conversion + + + + + CGI-XMLForm + + CGI::XMLForm + XML::Parser + XML Conversion + + + + + DBIx-XML_RDB + + DBIx::XML_RDB + XML Conversion + + + + + libxml-perl + + Data::Grove + XML + + + Data::Grove::Visitor + Data::Grove + + + XML::ESISParser + PerlSAX + + + XML::Handler::CanonXMLWriter + PerlSAX + XML Conversion + + + XML::Handler::Sample + PerlSAX + XML Conversion + + + XML::Handler::Subs + PerlSAX + XML Conversion + + + XML::Handler::XMLWriter + PerlSAX + XML Conversion + + + XML::Parser::PerlSAX + PerlSAX + XML::Parser + + + XML::PatAct::ActionTempl + PatAct + PatAct Action + + + XML::PatAct::Amsterdam + PerlSAX + XML Conversion + PatAct + PatAct Action + + + 
XML::PatAct::MatchName + PatAct + PatAct Pattern + + + XML::PatAct::PatternTempl + PatAct + PatAct Pattern + + + XML::PatAct::ToObjects + PatAct + PatAct Action + XML Conversion + + + XML::Perl2SAX + PerlSAX + + + XML::SAX2Perl + PerlSAX + + + + + Frontier-RPC + + Frontier::RPC2 + XML::Parser + XML Conversion + + + Frontier::Client + + + Frontier::Daemon + + + + + XML-DOM + + XML::DOM + XML::Parser + XML Objects + XML Query + XML Conversion + + + XML::DOM::UTF8 + XML::DOM + + + + + XML-Dumper + + XML::Dumper + XML Conversion + + + + + XML-Edifact + + XML::Edifact + XML Conversion + + + + + XML-Encoding + + XML::Encoding + XML::Parser + + + + + XML-Generator + + XML::Generator + XML Writer + + + + + XML-Grove + + XML::Grove + XML Objects + Data::Grove + + + XML::Grove::AsCanonXML + XML::Grove + XML Writer + + + XML::Grove::AsString + XML::Grove + XML Conversion + + + XML::Grove::Builder + XML::Grove + PerlSAX + + + XML::Grove::IDs + XML::Grove + XML Query + + + XML::Grove::Path + XML::Grove + XML Query + + + XML::Grove::PerlSAX + XML::Grove + PerlSAX + + + XML::Grove::Sub + XML::Grove + + + XML::Grove::Subst + XML::Grove + + + + + XML-Parser + + XML::Parser + XML + + + XML::ParserDebug + XML::Parser + XML Conversion + + + XML::Parser::Objects + XML::Parser + XML Objects + + + XML::Parser::Stream + XML::Parser + XML Conversion + + + XML::Parser::Subs + XML::Parser + + + XML::Parser::Tree + XML::Parser + XML Objects + + + + + XML-QL + + XML::QL + XML::Parser + XML Query + + + + + XML-Registry + + XML::Registry + XML + + + + + XML-Writer + + XML::Writer + XML Writer + + + + + XML-XQL + + XML::XQL + XML::DOM + XML Query + + + diff --git a/doc/sax-2.0-adv.html b/doc/sax-2.0-adv.html new file mode 100644 index 0000000..b8fd491 --- /dev/null +++ b/doc/sax-2.0-adv.html @@ -0,0 +1,1005 @@ + + + + Advanced Features of the Perl SAX 2.0 Binding + + + + +

Advanced SAX

+ +

The classes, methods, and features described below are +not commonly used in most applications and can be ignored by most +users. If however you find that you are not getting the granularity +you expect from Basic SAX, this would be the place to look for more. +Advanced SAX isn't advanced in the sense that it is harder, or requires +better programming skills. It is simply more complete, and has been +separated to keep Basic SAX simple in terms of the number of events +one would have to deal with. +

+ +

SAX Parsers

+ +

SAX supports several classes of event handlers: content handlers, +declaration handlers, DTD handlers, error handlers, entity resolvers, +and other extensions. For each class of events, a separate handler +can be used to handle those events. If a handler is not defined for a +class of events, then the default handler, Handler, is used. +Each of these handlers is described in the sections below. +Applications may change an event handler in the middle of the parse +and the SAX parser will begin using the new handler immediately.

+ +

SAX's basic interface defines methods for parsing system +identifiers (URIs), open files, and strings. Behind the scenes, +though, SAX uses a Source hash that contains that +information, plus encoding, system and public identifiers if +available. These are described below under the Source +option.

+ +

SAX parsers accept all features as options to the parse() +methods and on the parser's constructor. Features are described in +the next section.

+ +

+

parse(options)
+
+Parses the XML instance identified by the Source option. +options can be a list of option, value pairs or a hash. +parse() returns the result of calling the +end_document() handler.

+ +

+

ContentHandler
+
+Object to receive document content events. The +ContentHandler, with additional events defined below, is the +class of events described in Basic +SAX Handler. If the application does not register a content handler +or content event handlers on the default handler, content events +reported by the SAX parser will be silently ignored.

+ +

+

DTDHandler
+
+Object to receive basic DTD events. If the application does not +register a DTD handler or DTD event handlers on the default handler, +DTD events reported by the SAX parser will be silently +ignored.

+ +

+

EntityResolver
+
+Object to resolve external entities. If the application does not +register an entity resolver or entity events on the default handler, +the SAX parser will perform its own default resolution.

+ +

+

ErrorHandler
+
+Object to receive error-message events. If the application does not +register an error handler or error event handlers on the default +handler, all error events reported by the SAX parser will be silently +ignored; however, normal processing may not continue. It is highly +recommended that all SAX applications implement an error handler to +avoid unexpected bugs.

+ +

+

Source
+
+A hash containing information about the XML instance to be parsed. +See Input Sources below. Note that +Source cannot be changed during the parse.

+ +

+

+
Features
+
+ A hash containing Feature information, as described below. + Features can be set at runtime but not directly on the Features + hash (at least, not reliably: you can do it, but the results + might not be what you expect, because the parser gets no + chance to look at what you've set and therefore cannot react properly + to errors or to Features that it doesn't support). You should use + the set_feature() method instead. +
+
+

+ + +

Features

+ +

Features are as defined in SAX2: Features +and Properties, but not of course limited to those. You may add +your own Features. Also, Java has an artificial distinction between +Features and Properties which is unnecessary. In Perl, both have been +merged under the same name. +

+ +

Features can be passed as options when creating a parser or calling +a parse() method. They may also be set using the +set_feature() method. +

+ +
+    $parser = AnySAXParser->new(
+                                Features => {
+                                             'http://xml.org/sax/features/namespaces' => 0,
+                                             },
+                                );
+    $parser->parse(
+                   Features => {
+                               'http://xml.org/sax/features/namespaces' => 0,
+                               },
+                   );
+    $parser->set_feature('http://xml.org/sax/properties/xml-string', 1);
+    $string = $parser->get_feature('http://xml.org/sax/properties/xml-string');
+
+ +

+ When performing namespace processing, Perl SAX parsers always provide + both the raw tag name in Name and the namespace names in + NamespaceURI, LocalName, and Prefix. + Therefore, the + "http://xml.org/sax/features/namespace-prefixes" Feature is + ignored. +

+ +

+ Also, Features are things that are supposed to be turned + on, and thus should normally be off by default, especially if + the parser doesn't support turning them off. Due to backwards + compatibility problems, the one exception to this rule is the + "http://xml.org/sax/features/namespaces" Feature which is on by + default and which a number of parsers may not be able to turn off. Thus, + a parser claiming to support this Feature (and all SAX2 parsers must + support it) may in fact only support turning it on. This is only a minor + problem as turning it off basically amounts to returning to SAX1, which + can be accomplished by a filter (eg XML::Filter::SAX2toSAX1). +

+ +

+ In addition to the Features described in the SAX spec + itself, a number of new ones may be defined for Perl. An example of + this would be http://xmlns.perl.org/sax/node-factory which + when supported by the parser would be settable to a NodeFactory object + that would be in charge of creating SAX nodes different from those that + are normally received by event handlers. See + http://xmlns.perl.org/ (currently + in alpha state) for details on how to register Features. +

+ +

+ The following methods are used to get and set features: +

+ +

+

get_feature(name)
+
+Look up the value of a feature. + +

The feature name is any fully-qualified URI. It is possible for a +SAX parser to recognize a feature name but to be unable to return its +value; this is especially true in the case of an adapter for a SAX1 +Parser, which has no way of knowing whether the underlying parser is +validating, for example.

+ +

Some feature values may be available only in specific contexts, +such as before, during, or after a parse.

+ +get_feature() returns the value of the feature, which is usually +either a boolean or an object, and will throw +XML::SAX::Exception::NotRecognized when the SAX parser does not +recognize the feature name and XML::SAX::Exception::NotSupported +when the SAX parser recognizes the feature name but cannot determine its +value at this time.

+ +

+

set_feature(name, +value)
+
+Set the state of a feature. + +

The feature name is any fully-qualified URI. It is possible for a +SAX parser to recognize a feature name but to be unable to set its +value; this is especially true in the case of an adapter for a SAX1 +Parser, which has no way of affecting whether the underlying parser is +validating, for example.

+ +

Some feature values may be immutable or mutable only in specific +contexts, such as before, during, or after a parse.

+ +set_feature() will throw XML::SAX::Exception::NotRecognized +when the SAX parser does not recognize the feature name and +XML::SAX::Exception::NotSupported when the SAX parser recognizes the +feature name but cannot set the requested value. + +

+ This method is also the standard mechanism for setting extended handlers, + such as "http://xml.org/sax/handlers/DeclHandler". +

+

+ + +

+

+
get_features()
+
+ Look up all Features that this parser claims to support. +

+ This method returns a hash of Features which the parser + claims to support. The value of the hash is currently + unspecified though it may be used later. This method is meant + to be inherited so that Features supported by the base parser + class (XML::SAX::Base) are declared to be supported by + subclasses. +

+

+ Calling this method is probably only moderately useful to end + users. It is mostly meant for use by XML::SAX, so that it can + query parsers for Feature support and return an appropriate + parser depending on the Features that are required. +

+
+
+

+ + + +

Input Sources

+ +

Input sources may be provided to parser objects or are returned by +entity resolvers. An input source is a hash with these +properties:

+ +
+
PublicId
+
The public identifier of this input source. + +

The public identifier is always optional: if the application writer +includes one, it will be provided as part of the location +information.

+ +
SystemId
+
The system identifier (URI) of this input source. + +

The system identifier is optional if there is a byte stream or a +character stream, but it is still useful to provide one, since the +application can use it to resolve relative URIs and can include it in +error messages and warnings (the parser will attempt to open a +connection to the URI only if there is no byte stream or character +stream specified).

+ +If the application knows the character encoding of the object +pointed to by the system identifier, it can register the encoding +using the Encoding property.
+ +
ByteStream
+
The byte stream for this input source. + +

The SAX parser will ignore this if there is also a character stream +specified, but it will use a byte stream in preference to opening a +URI connection itself.

+ +If the application knows the character encoding of the byte stream, it +should set the Encoding property.
+ +
CharacterStream
+
The character stream for this input source. + +

If there is a character stream specified, the SAX parser will +ignore any byte stream and will not attempt to open a URI connection +to the system identifier.

+ +

Note: A CharacterStream is a filehandle that does not need any encoding +translation done on it. This is implemented as a regular filehandle +and only works under Perl 5.7.2 or higher using PerlIO. To get a single +character, or number of characters from it, use the perl core read() +function. To get a single byte from it (or number of bytes), you can +use sysread(). The encoding of the stream should be in the Encoding +entry for the Source.

+ +
+ +
Encoding
+
The character encoding, if known. + +

The encoding must be a string acceptable for an XML encoding +declaration (see section 4.3.3 of the XML 1.0 recommendation).

+ +This property has no effect when the application provides a character +stream.
+
+ +

SAX Handlers

+ +

SAX supports several classes of event handlers: content handlers, +declaration handlers, DTD handlers, error handlers, entity resolvers, +and other extensions. This section defines each of these classes of +events.

+ +

Content Events

+ +

This is the main interface that most SAX applications implement: if +the application needs to be informed of basic parsing events, it +implements this interface and registers an instance with the SAX +parser using the ContentHandler property. The parser uses +the instance to report basic document-related events like the start +and end of elements and character data.

+ +

The order of events in this interface is very important, and +mirrors the order of information in the document itself. For example, +all of an element's content (character data, processing instructions, +and/or subelements) will appear, in order, between the +start_element event and the corresponding +end_element event.

+ + +

+

set_document_locator(locator)
+
+Receive an object for locating the origin of SAX document events. + +

SAX parsers are strongly encouraged (though not absolutely +required) to supply a locator: if a parser does so, it must supply the +locator to the application by invoking this method before invoking any +of the other methods in the ContentHandler interface.

+ +

The locator allows the application to determine the end position of +any document-related event, even if the parser is not reporting an +error. Typically, the application will use this information for +reporting its own errors (such as character content that does not +match an application's business rules). The information provided by +the locator is probably not sufficient for use with a search +engine.

+ +

Note that the locator will provide correct information only during +the invocation of the events in this interface. The application should +not attempt to use it at any other time.

+ +

The locator is a hash with these properties:

+ +
+ + + + + + + + + +
ColumnNumberThe column number of the end of the text where the exception +occurred.
LineNumberThe line number of the end of the text where the exception +occurred.
PublicIdThe public identifier of the entity where the exception +occurred.
SystemIdThe system identifier of the entity where the exception +occurred.
+
+

+ +

+

start_prefix_mapping(mapping)
+
+Begin the scope of a prefix-URI Namespace mapping. + +

The information from this event is not necessary for normal +Namespace processing: the SAX XML reader will automatically replace +prefixes for element and attribute names when the +"http://xml.org/sax/features/namespaces" feature is true (the +default).

+ +

There are cases, however, when applications need to use prefixes in +character data or in attribute values, where they cannot safely be +expanded automatically; the start/end_prefix_mapping event supplies the +information to the application to expand prefixes in those contexts +itself, if necessary.

+ +

Note that start/end_prefix_mapping() events are +not guaranteed to be properly nested relative to each other: all +start_prefix_mapping() events will occur before the +corresponding start_element() event, and all +end_prefix_mapping events will occur after the corresponding +end_element() event, but their order is not +guaranteed. +

+ +

mapping is a hash with these properties:

+ +
+ + + + + +
PrefixThe Namespace prefix being declared.
NamespaceURIThe Namespace URI the prefix is mapped to.
+
+

+ +

+

end_prefix_mapping(mapping)
+
+End the scope of a prefix-URI mapping. + +

See start_prefix_mapping() for details. This event will +always occur after the corresponding end_element event, but +the order of end_prefix_mapping events is not otherwise +guaranteed.

+ +

mapping is a hash with this property:

+ +
+ + + +
PrefixThe Namespace prefix that was being mapped.
+
+

+ +

+

processing_instruction(pi)
+
+Receive notification of a processing instruction. + +

The Parser will invoke this method once for each processing +instruction found: note that processing instructions may occur before +or after the main document element.

+ +

A SAX parser should never report an XML declaration (XML 1.0, +section 2.8) or a text declaration (XML 1.0, section 4.3.1) using this +method.

+ +

pi is a hash with these properties:

+ +
+ + + + + +
TargetThe processing instruction target.
DataThe processing instruction data, or undef if none was +supplied.
+
+

+ +

+

skipped_entity(entity)
+
+Receive notification of a skipped entity. + +

The Parser will invoke this method once for each entity skipped. +Non-validating processors may skip entities if they have not seen the +declarations (because, for example, the entity was declared in an +external DTD subset). All processors may skip external entities, +depending on the values of the +"http://xml.org/sax/features/external-general-entities" and the +"http://xml.org/sax/features/external-parameter-entities" +Features.

+ +

entity is a hash with these properties:

+ +
+ + + +
NameThe name of the skipped entity. If it is a parameter +entity, the name will begin with '%'.
+
+

+ +

Declaration Events

+ +

This is an optional extension handler for SAX2 to provide +information about DTD declarations in an XML document. XML readers are +not required to support this handler.

+ +

Note that data-related DTD declarations (unparsed entities and +notations) are already reported through the DTDHandler interface.

+ +

If you are using the declaration handler together with a lexical +handler, all of the events will occur between the start_dtd +and the end_dtd events.

+ +

To set a separate DeclHandler for an XML reader, set the +"http://xml.org/sax/handlers/DeclHandler" Feature with the +object to receive declaration events. If the reader does not support +declaration events, it will throw a XML::SAX::Exception::NotRecognized +or a XML::SAX::Exception::NotSupported when you attempt to register +the handler. Declaration event handlers on the default handler are +automatically recognized and used.

+ + +

+

element_decl(element)
+
+Report an element type declaration. + +

The content model will consist of the string "EMPTY", the string +"ANY", or a parenthesised group, optionally followed by an occurrence +indicator. The model will be normalized so that all whitespace is +removed, and will include the enclosing parentheses.

+ +

element is a hash with these properties:

+ +
+ + + + + +
NameThe element type name.
ModelThe content model as a normalized string.
+
+

+ +

+

attribute_decl(attribute)
+
+Report an attribute type declaration. + +

Only the effective (first) declaration for an attribute will be +reported. The type will be one of the strings "CDATA", +"ID", "IDREF", "IDREFS", +"NMTOKEN", "NMTOKENS", "ENTITY", +"ENTITIES", or "NOTATION", or a parenthesized token +group with the separator "|" and all whitespace removed.

+ +

attribute is a hash with these properties:

+ +
+ + + + + + + + + + + +
eNameThe name of the associated element.
aNameThe name of the attribute.
TypeA string representing the attribute type.
ValueDefaultA string representing the attribute default ("#IMPLIED", +"#REQUIRED", or "#FIXED") or undef if none of these +applies.
ValueA string representing the attribute's default value, or undef if +there is none.
+
+

+ +

+

internal_entity_decl(entity)
+
+Report an internal entity declaration. + +

Only the effective (first) declaration for each entity will be +reported.

+ +

entity is a hash with these properties:

+ +
+ + + + + +
NameThe name of the entity. If it is a parameter entity, the name will +begin with '%'.
ValueThe replacement text of the entity.
+
+

+ +

+

external_entity_decl(entity)
+
+Report a parsed external entity declaration. + +

Only the effective (first) declaration for each entity will be +reported.

+ +

entity is a hash with these properties:

+ +
+ + + + + + + +
NameThe name of the entity. If it is a parameter entity, the name will +begin with '%'.
PublicIdThe public identifier of the entity, or undef if none was +declared.
SystemIdThe system identifier of the entity.
+
+

+ +

DTD Events

+ +

If a SAX application needs information about notations and unparsed +entities, then the application implements this interface. The parser +uses the instance to report notation and unparsed entity declarations +to the application.

+ +

The SAX parser may report these events in any order, regardless of +the order in which the notations and unparsed entities were declared; +however, all DTD events must be reported after the document handler's +start_document() event, and before the first +start_element() event.

+ +

It is up to the application to store the information for future use +(perhaps in a hash table or object tree). If the application +encounters attributes of type "NOTATION", "ENTITY", +or "ENTITIES", it can use the information that it obtained +through this interface to find the entity and/or notation +corresponding with the attribute value.

+ +

+

notation_decl(notation)
+
+Receive notification of a notation declaration event. + +

It is up to the application to record the notation for later +reference, if necessary.

+ +

If a system identifier is present, and it is a URL, the SAX parser +must resolve it fully before passing it to the application.

+ +

notation is a hash with these properties:

+ +
+ + + + + + + +
NameThe notation name.
PublicIdThe public identifier of the notation, or undef if none was +declared.
SystemIdThe system identifier of the notation, or undef if none was +declared.
+
+

+ +

+

unparsed_entity_decl(entity)
+
+Receive notification of an unparsed entity declaration event. + +

Note that the notation name corresponds to a notation reported by +the notation_decl() event. It is up to the application to +record the entity for later reference, if necessary.

+ +

If the system identifier is a URL, the parser must resolve it fully +before passing it to the application.

+ +

entity is a hash with these properties:

+ +
+ + + + + + + + + +
NameThe unparsed entity's name.
PublicIdThe public identifier of the entity, or undef if none was +declared.
SystemIdThe system identifier of the entity.
NotationThe name of the associated notation.
+
+

+ +

Entity Resolver

+ +

If a SAX application needs to implement customized handling for +external entities, it must implement this interface.

+ +

The parser will then allow the application to intercept any +external entities (including the external DTD subset and external +parameter entities, if any) before including them.

+ +

+ Many SAX applications will not need to implement this interface, + but it will be especially useful for applications that build XML + documents from databases or other specialised input sources, or for + applications that use URI types that are either not URLs, or that + have schemes unknown to the parser. +

+ +

+

resolve_entity(entity)
+
+Allow the application to resolve external entities. + +

The Parser will call this method before opening any external entity +except the top-level document entity (including the external DTD +subset, external entities referenced within the DTD, and external +entities referenced within the document element): the application may +request that the parser resolve the entity itself, that it use an +alternative URI, or that it use an entirely different input +source.

+ +

Application writers can use this method to redirect external system +identifiers to secure and/or local URIs, to look up public identifiers +in a catalogue, or to read an entity from a database or other input +source (including, for example, a dialog box).

+ +

If the system identifier is a URL, the SAX parser must resolve it +fully before reporting it to the application.

+ +

entity is a hash with these properties:

+ +
+ + + + + +
PublicIdThe public identifier of the entity being referenced, or +undef if none was declared.
SystemIdThe system identifier of the entity being referenced.
+
+

+ +

Error Events

+ +

If a SAX application needs to implement customized error handling, +it must implement this interface. The parser will then report all +errors and warnings through this interface.

+ +

The parser shall use this interface to report errors instead of or in +addition to throwing an exception: for errors and warnings the recommended +approach is to let the application throw its own exceptions and to not +throw them in the parser. For fatal errors however, it is not uncommon that +the parser will throw an exception after having reported the error as it +renders any continuation of parsing impossible. +

+ +

All error handlers receive a hash, exception, with the +properties defined in Exceptions.

+ +

+

warning(exception)
+
+Receive notification of a warning. + +

SAX parsers will use this method to report conditions that are not +errors or fatal errors as defined by the XML 1.0 recommendation. The +default behaviour is to take no action.

+ +The SAX parser must continue to provide normal parsing events after +invoking this method: it should still be possible for the application +to process the document through to the end.

+ +

+

error(exception)
+
+Receive notification of a recoverable error. + +

This corresponds to the definition of "error" in section 1.2 of the +W3C XML 1.0 Recommendation. For example, a validating parser would use +this callback to report the violation of a validity constraint. The +default behaviour is to take no action.

+ +The SAX parser must continue to provide normal parsing events after +invoking this method: it should still be possible for the application +to process the document through to the end. If the application cannot +do so, then the parser should report a fatal error even if the XML 1.0 +recommendation does not require it to do so.

+ +

+

fatal_error(exception)
+
+Receive notification of a non-recoverable error. + +

This corresponds to the definition of "fatal error" in section 1.2 +of the W3C XML 1.0 Recommendation. For example, a parser would use +this callback to report the violation of a well-formedness +constraint.

+The application must assume that the document is unusable after the +parser has invoked this method, and should continue (if at all) only +for the sake of collecting additional error messages: in fact, SAX +parsers are free to stop reporting any other events once this method +has been invoked.

+ +

Lexical Events

+ +

This is an optional extension handler for SAX2 to provide lexical +information about an XML document, such as comments and CDATA section +boundaries; XML readers are not required to support this handler.

+ +

The events in the lexical handler apply to the entire document, not +just to the document element, and all lexical handler events must +appear between the content handler's start_document() and +end_document() events.

+ +

To set the LexicalHandler for an XML reader, set the Feature +"http://xml.org/sax/handlers/LexicalHandler" on the parser to +the object to receive lexical events. If the reader does not support +lexical events, it will throw a XML::SAX::Exception::NotRecognized or +a XML::SAX::Exception::NotSupported when you attempt to register the +handler.

+ +

+

start_dtd(dtd)
+
+Report the start of DTD declarations, if any. + +

Any declarations are assumed to be in the internal subset unless +otherwise indicated by a start_entity event.

+ +

Note that the start/end_dtd() events will appear +within the start/end_document() events from Content +Handler and before the first start_element() event.

+ +

dtd is a hash with these properties:

+ +
+ + + + + + + +
NameThe document type name.
PublicIdThe declared public identifier for the external DTD subset, or +undef if none was declared.
SystemIdThe declared system identifier for the external DTD subset, or +undef if none was declared.
+
+

+ +

+

end_dtd(dtd)
+
+Report the end of DTD declarations. + +

No properties are defined for this event (dtd is +empty).

+ +

+

start_entity(entity)
+
+Report the beginning of an entity in content. + +

NOTE: entity references in attribute values -- and the start +and end of the document entity -- are never reported.

+ +

The start and end of the external DTD subset are reported using the +pseudo-name "[dtd]". All other events must be properly nested within +start/end entity events.

+ +

Note that skipped entities will be reported through the +skipped_entity() event, which is part of the ContentHandler +interface.

+ +

entity is a hash with these properties:

+ +
+ + + +
NameThe name of the entity. If it is a parameter entity, the +name will begin with '%'.
+
+

+ +

+

end_entity(entity)
+
+Report the end of an entity. + +

entity is a hash with these properties:

+ +
+ + + +
NameThe name of the entity that is ending.
+
+

+ +

+

start_cdata(cdata)
+
+Report the start of a CDATA section. + +

The contents of the CDATA section will be reported through the +regular characters event.

+ +

No properties are defined for this event (cdata is +empty).

+ +

+

end_cdata(cdata)
+
+Report the end of a CDATA section. + +

No properties are defined for this event (cdata is +empty).

+ +

+

comment(comment)
+
+Report an XML comment anywhere in the document. + +

This callback will be used for comments inside or outside the +document element, including comments in the external DTD subset (if +read).

+ +

comment is a hash with these properties:

+ +
+ + + +
DataThe comment characters.
+
+

+ +

SAX Filters

+ +

An XML filter is like an XML event generator, except that it +obtains its events from another XML event generator rather than a +primary source like an XML document or database. Filters can modify a +stream of events as they pass on to the final application.

+ +

+

Parent
+
+The parent reader. + +

This Feature allows the application to link the filter to a parent +event generator (which may be another filter).

+ +

+ See the XML::SAX::Base module for more on filters. It is meant to be + used as a base class for filters and drivers, and makes them much + easier to implement. +

+ +

Java Compatibility

+ +The Perl SAX 2.0 binding differs from the Java binding in these ways: + +
    + +
  • Takes parameters to new(), to parse(), and to be +set directly in the object, instead of requiring set/get calls (see +below).
  • + +
  • Allows a default Handler parameter to be used for all +handlers.
  • + +
  • + No base classes are enforced. Instead, parsers dynamically + check the handlers for what methods they support. Note however that + using XML::SAX::Base as your base class for Drivers and Filters will + make your code a lot simpler, less error prone, and probably much more + correct with regard to this spec. Only reimplement that functionality + if you really need to. +
  • + +
  • The Attribute, InputSource, and SAXException (XML::SAX::Exception) +classes are only described as hashes (see below).
  • + +
  • Handlers are passed a hash (Node) containing properties as an +argument instead of positional arguments.
  • + +
  • parse() methods return the value returned by calling the +end_document() handler.
  • + +
  • + Method names have been converted to lower-case with underscores. + Parameters are all mixed case with initial upper-case. +
  • +
+ +

+ If compatibility is a problem for you, consider writing a Filter that + converts from this style to the one you want. It is likely that such + a Filter will be available from CPAN in the not too distant future. +

+ + + + + diff --git a/doc/sax-2.0.html b/doc/sax-2.0.html new file mode 100644 index 0000000..4f3c284 --- /dev/null +++ b/doc/sax-2.0.html @@ -0,0 +1,365 @@ + + + + Perl SAX 2.0 Binding + + + +

Perl SAX 2.0 Binding

+ +

SAX (Simple API for XML) is a common parser interface for XML +parsers. It allows application writers to write applications that use +XML parsers, but are independent of which parser is actually used.

+ +

This document describes the version of SAX used by Perl modules. +The original version of SAX 2.0, for Java, is described at http://sax.sourceforge.net/.

+ +

There are two basic interfaces in the Perl version of SAX, the +parser interface and the handler interface. The parser interface +creates new parser instances, starts parsing, and provides additional +information to handlers on request. The handler interface is used to +receive parse events from the parser. This pattern is also commonly +called "Producer and Consumer" or "Generator and Sink". Note that the +parser doesn't have to be an XML parser; all it needs to do is provide +a stream of events to the handler as if it were parsing XML. The +actual data from which the events are generated can be anything: a Perl +object, a CSV file, a database table... +

+ +

SAX is typically used like this: + +

+    my $handler = MyHandler->new();
+    my $parser = AnySAXParser->new( Handler => $handler );
+    $parser->parse($uri);
+

+ +

Handlers are typically written like this: + +

+    package MyHandler;
+
+    sub new {
+        my $type = shift;
+        return bless {}, $type;
+    }
+
+    sub start_element {
+        my ($self, $element) = @_;
+
+        print "Starting element $element->{Name}\n";
+    }
+
+    sub end_element {
+        my ($self, $element) = @_;
+
+        print "Ending element $element->{Name}\n";
+    }
+
+    sub characters {
+        my ($self, $characters) = @_;
+
+        print "characters: $characters->{Data}\n";
+    }
+
+    1;
+

+ +

Basic SAX Parser

+ +

These methods and options are the most commonly used with SAX +parsers and event generators.

+ +

Applications may not invoke a parse() method again while a +parse is in progress (they should create a new SAX parser instead for +each nested XML document). Once a parse is complete, an application +may reuse the same parser object, possibly with a different input +source.

+ +

During the parse, the parser will provide information about the XML +document through the registered event handlers. Note that an event whose handler +hasn't been registered (i.e., one that doesn't have a corresponding method in +the handler's class) is simply not delivered. This allows one to receive only +the events one is interested in. +

+ +

+

parse(uri [, options])
+
+Parses the XML instance identified by uri (a system +identifier). options can be a list of option, value pairs +or a hash. Options include Handler, features and properties, +and advanced SAX parser options. parse() returns the result +of calling the end_document() handler. The options supported +by parse() may vary slightly if what is being "parsed" isn't +XML. +

+ +

+

parse_file(stream [, options])
+
+Parses the XML instance in the already opened stream, an +IO::Handle or similar. options are the same as for parse(). parse_file() returns the result +of calling the end_document() handler.

+ +

+

parse_string(string [, options])
+
+Parses the XML instance in string. options are +the same as for parse(). +parse_string() returns the result of calling the +end_document() handler.

+ +

+

Handler
+
+The default handler object to receive all events from the parser. +Applications may change Handler in the middle of the parse +and the SAX parser will begin using the new handler +immediately. The Advanced SAX document +lists a number of more specialized handlers that can be used should you +wish to dispatch different types of events to different objects. +

+ +

Basic SAX Handler

+ +

These methods are the most commonly used by SAX handlers.

+ +

+

start_document(document)
+
+Receive notification of the beginning of a document. + +

The SAX parser will invoke this method only once, before any other +methods (except for set_document_locator() in advanced SAX +handlers).

+ +No properties are defined for this event (document is +empty).

+ +

+

end_document(document)
+
+Receive notification of the end of a document. + +

The SAX parser will invoke this method only once, and it will be +the last method invoked during the parse. The parser shall not invoke +this method until it has either abandoned parsing (because of an +unrecoverable error) or reached the end of input.

+ +

No properties are defined for this event (document is +empty).

+ +The return value of end_document() is returned by the +parser's parse() methods.

+ +

+

start_element(element)
+
+Receive notification of the start of an element. + +

The Parser will invoke this method at the beginning of every +element in the XML document; there will be a corresponding +end_element() event for every start_element() event (even when the +element is empty). All of the element's content will be reported, in +order, before the corresponding end_element() event.

+ +element is a hash with these properties: + +
+ + + + + +
NameThe element type name (including prefix).
AttributesThe attributes attached to the element, if any.
+
+ +If namespace processing is turned on (which is the default), these +properties are also available: + +
+ + + + + + + +
NamespaceURIThe namespace of this element.
PrefixThe namespace prefix used on this element.
LocalNameThe local name of this element.
+
+ +Attributes is a hash keyed by JClark namespace notation. That +is, the keys are of the form "{NamespaceURI}LocalName". If the attribute +has no NamespaceURI, then it is simply "{}LocalName". Each attribute is +a hash with these properties: + +
+ + + + + + + + + + + +
NameThe attribute name (including prefix).
ValueThe normalized value of the attribute.
NamespaceURIThe namespace of this attribute.
PrefixThe namespace prefix used on this attribute.
LocalNameThe local name of this attribute.
+
+
+
+

+ +

+

end_element(element)
+
+Receive notification of the end of an element. + +

The SAX parser will invoke this method at the end of every element +in the XML document; there will be a corresponding start_element() event for every end_element() event (even when the element is +empty).

+ +element is a hash with these properties: + +
+ + + +
NameThe element type name (including prefix).
+
+ +If namespace processing is turned on (which is the default), these +properties are also available: + +
+ + + + + + + +
NamespaceURIThe namespace of this element.
PrefixThe namespace prefix used on this element.
LocalNameThe local name of this element.
+
+

+ +

+

characters(characters)
+
+Receive notification of character data. + +

The Parser will call this method to report each chunk of character +data. SAX parsers may return all contiguous character data in a +single chunk, or they may split it into several chunks (however, all +of the characters in any single event must come from the same external +entity so that the Locator provides useful information).

+ +

characters is a hash with this property:

+ +
+ + + +
DataThe characters from the XML document.
+
+

+ +

+

ignorable_whitespace(characters)
+
+Receive notification of ignorable whitespace in element content. + +

Validating Parsers must use this method to report each chunk of +ignorable whitespace (see the W3C XML 1.0 recommendation, section +2.10); non-validating parsers may also use this method if they are +capable of parsing and using content models.

+ +

SAX parsers may return all contiguous whitespace in a single chunk, +or they may split it into several chunks; however, all of the +characters in any single event must come from the same external +entity, so that the Locator provides useful information.

+ +

characters is a hash with this property:

+ +
+ + + +
DataThe whitespace characters from the XML document.
+
+

+ +

Exceptions

+ +

+ Conformant XML parsers are required to abort processing when + well-formedness or validation errors occur. In Perl, SAX parsers use + die() to signal these errors. To catch these errors and prevent + them from killing your program, use eval{}: + +

+    eval { $parser->parse($uri) };
+    if ($@) {
+        # handle error
+    }
+
+

+ +

+Exceptions can also be thrown when setting features or properties +on the SAX parser (see advanced SAX below).

+ +

+ Exception values ($@) in SAX are hashes blessed into the + package that defines their type, and have the following properties: +

+ +
+ + + + + +
MessageA detail message for this exception.
ExceptionThe embedded exception, or undef if there is none.
+
+ +If the exception is raised due to parse errors, these +properties are also available: + +
+ + + + + + + + + +
ColumnNumberThe column number of the end of the text where the exception +occurred.
LineNumberThe line number of the end of the text where the exception +occurred.
PublicIdThe public identifier of the entity where the exception +occurred.
SystemIdThe system identifier of the entity where the exception +occurred.
+
+ + +


+

Advanced SAX

+ + + + diff --git a/examples/MyHandler.pm b/examples/MyHandler.pm new file mode 100644 index 0000000..56b9904 --- /dev/null +++ b/examples/MyHandler.pm @@ -0,0 +1,22 @@ +# This is the example module in doc/UsingPerlSAX.pod + +package MyHandler; + +sub new { + my ($type) = @_; + return bless {}, $type; +} + +sub start_element { + my ($self, $element) = @_; + + print "Start element: $element->{Name}\n"; +} + +sub end_element { + my ($self, $element) = @_; + + print "End element: $element->{Name}\n"; +} + +1; diff --git a/examples/esis-test.pl b/examples/esis-test.pl new file mode 100644 index 0000000..6d09a31 --- /dev/null +++ b/examples/esis-test.pl @@ -0,0 +1,19 @@ +use XML::ESISParser; +use XML::Handler::Sample; + +if ($ARGV[0] eq '--sgml') { + push (@additional_args, IsSGML => 1); + shift @ARGV; +} + +if ($#ARGV != 0) { + die "usage: esis-test FILE\n"; +} +$file = shift @ARGV; + +$my_handler = XML::Handler::Sample->new; + +XML::ESISParser->new->parse(Source => { SystemId => $file }, + Handler => $my_handler, + @additional_args); + diff --git a/examples/myhandler.pl b/examples/myhandler.pl new file mode 100644 index 0000000..6c9c3d1 --- /dev/null +++ b/examples/myhandler.pl @@ -0,0 +1,11 @@ +# This is the example script in doc/UsingPerlSAX.pod + +use XML::Parser::PerlSAX; +use MyHandler; + +my $my_handler = MyHandler->new; +my $parser = XML::Parser::PerlSAX->new( Handler => $my_handler ); + +foreach my $instance (@ARGV) { + $parser->parse(Source => { SystemId => $instance }); +} diff --git a/examples/myhandler.xml b/examples/myhandler.xml new file mode 100644 index 0000000..7c0319a --- /dev/null +++ b/examples/myhandler.xml @@ -0,0 +1,6 @@ + + +
+Using PerlSAX +Working with PerlSAX ... +
diff --git a/examples/perlsax-test.pl b/examples/perlsax-test.pl new file mode 100644 index 0000000..d57068c --- /dev/null +++ b/examples/perlsax-test.pl @@ -0,0 +1,13 @@ +use XML::Parser::PerlSAX; +use XML::Handler::Sample; + +if ($#ARGV != 0) { + die "usage: esis-test FILE\n"; +} +$file = shift @ARGV; + +$my_handler = XML::Handler::Sample->new; + +XML::Parser::PerlSAX->new->parse(Source => { SystemId => $file }, + Handler => $my_handler); + diff --git a/examples/schema.pl b/examples/schema.pl new file mode 100644 index 0000000..dbc5c8a --- /dev/null +++ b/examples/schema.pl @@ -0,0 +1,36 @@ +# This template file is in the Public Domain. +# You may do anything you want with this file. +# +# $Id: schema.pl,v 1.1 1999/08/10 21:43:50 kmacleod Exp $ +# + +# This is the example script in the XML::PatAct::ToObjects module doc, +# it also uses XML::PatAct::MatchName and is an example of using PatAct +# modules. + +use XML::Parser::PerlSAX; +use XML::PatAct::MatchName; +use XML::PatAct::ToObjects; + +my $patterns = + [ + 'schema' => [ qw{ -holder } ], + 'table' => [ qw{ -make Schema::Table } ], + 'name' => [ qw{ -field Name -as-string } ], + 'summary' => [ qw{ -field Summary -as-string } ], + 'description' => [ qw{ -field Description -grove } ], + 'column' => [ qw{ -make Schema::Column -push-field Columns } ], + 'unique' => [ qw{ -field Unique -value 1 } ], + 'non-null' => [ qw{ -field NonNull -value 1 } ], + 'default' => [ qw{ -field Default -as-string } ], + ]; + +my $matcher = XML::PatAct::MatchName->new( Patterns => $patterns ); +my $handler = XML::PatAct::ToObjects->new( Patterns => $patterns, + Matcher => $matcher); + +my $parser = XML::Parser::PerlSAX->new( Handler => $handler ); +$schema = $parser->parse(Source => { SystemId => $ARGV[0] } ); + +require 'dumpvar.pl'; +dumpvar('main', 'schema'); diff --git a/examples/schema.xml b/examples/schema.xml new file mode 100644 index 0000000..ffbbfaf --- /dev/null +++ b/examples/schema.xml @@ -0,0 +1,16 @@ + + + MyTable + A 
short summary + A long description that may + contain a subset of HTML + + MyColumn1 + A short summary + A long description + + + 42 + +
+
diff --git a/lib/Data/Grove.pm b/lib/Data/Grove.pm new file mode 100644 index 0000000..a198987 --- /dev/null +++ b/lib/Data/Grove.pm @@ -0,0 +1,120 @@ +# +# Copyright (C) 1999 Ken MacLeod +# Data::Grove is free software; you can redistribute it and/or +# modify it under the same terms as Perl itself. +# +# $Id: Grove.pm,v 1.6 1999/12/22 21:15:00 kmacleod Exp $ +# + +### +### For a similar package, see also: +### +### Graph::Element -- elements for a directed graph +### Neil Bowers (NIELB) +### + +package Data::Grove; + +use vars qw{ $VERSION }; + +# will be substituted by make-rel script +$VERSION = "0.08"; + +sub new { + my $type = shift; + my $self = ($#_ == 0) ? { %{ (shift) } } : { @_ }; + + if (defined $self->{Raw}) { + # clone the raw object + $self = { %{ $self->{Raw} } }; + } + + return bless $self, $type; +} + +package Data::Grove::Characters; +use vars qw{ @ISA $type_name }; +@ISA = qw{Data::Grove}; +$type_name = 'characters'; + +1; + +__END__ + +=head1 NAME + +Data::Grove -- support for deeply nested structures + +=head1 SYNOPSIS + + use Data::Grove; + + $object = MyPackage->new; + + package MyPackage; + @ISA = qw{Data::Grove}; + +=head1 DESCRIPTION + +C provides support for deeply nested tree or graph +structures. C is intended primarily for Perl module +authors writing modules with many types or classes of objects that +need to be manipulated and extended in a consistent and flexible way. + +C is best used by creating a core set of ``data'' classes +and then incrementally adding functionality to the core data classes +by using ``extension'' modules. One reason for this design is so that +the data classes can be swapped out and the extension modules can work +with new data sources. For example, these other data sources could be +disk-based, network-based or built on top of a relational database. + +Two extension modules that come with C are +C and C. 
+C<Data::Grove::Parent> adds a `C<Parent>' property to grove objects +and implements a `C<root>' method to grove objects to return the root +node of the tree from anywhere in the tree and a `C<rootpath>' method +to return a list of nodes between the root node and ``this'' node. +C<Data::Grove::Visitor> adds callback methods `C<accept>' and +`C<accept_name>' that call your handler or receiver module back by +object type name or the object's name. + +C<Data::Grove> objects do not contain parent references, Perl garbage +collection will delete them when no longer referenced and +sub-structures can be shared among several structures. +C<Data::Grove::Parent> is used to create temporary objects with parent +pointers. + +Properties of data classes are accessed directly using Perl's hash +functions (i.e. `C<$object-E<gt>{Property}>'). Extension modules may +also define properties that they support or use, for example +Data::Grove::Parent adds `C<Parent>' and `C<Raw>' properties and +Visitor depends on `C<Name>' and `C<Content>' properties. + +See the module C<XML::Grove> for an example implementation of +C<Data::Grove>. + +=head1 METHODS + +=over 4 + +=item new( PROPERTIES ) + +Return a new object blessed into the SubClass, with the given +properties. PROPERTIES may either be a list of key/value pairs, a +single hash containing key/value pairs, or an existing C<Data::Grove> +object. If an existing C<Data::Grove> is passed to `C<new>', a +shallow copy of that object will be returned. A shallow copy means +that you are returned a new object, but all of the objects underneath +still refer to the original objects. + +=back + +=head1 AUTHOR + +Ken MacLeod, ken@bitsko.slc.ut.us + +=head1 SEE ALSO + +perl(1) + +=cut diff --git a/lib/Data/Grove/Parent.pm b/lib/Data/Grove/Parent.pm new file mode 100644 index 0000000..a68c5ed --- /dev/null +++ b/lib/Data/Grove/Parent.pm @@ -0,0 +1,384 @@ +# +# Copyright (C) 1998,1999 Ken MacLeod +# Data::Grove::Parent is free software; you can redistribute it and/or +# modify it under the same terms as Perl itself. 
+# +# $Id: Parent.pm,v 1.2 1999/12/22 21:15:00 kmacleod Exp $ +# + +### +### WARNING +### +### +### This code has a bug in it that renders it useless. In the FETCH +### routines, the new object created should have a reference to the +### the tied object that has $self as the underlying value. As of +### this version, I don't know of a way to get to the tied object. +### + +# Search for places marked `VALIDATE' to see where validation hooks +# may be added in the future. + +use strict; + +#-------------------------------------------------------------------------- +# Data::Grove::Parent +#-------------------------------------------------------------------------- + +package Data::Grove::Parent; + +use UNIVERSAL; +use Carp; + +use vars qw{ $VERSION }; + +# will be substituted by make-rel script +$VERSION = "0.08"; + +sub new { + my $type = shift; + my $raw = shift; + my $parent = shift; + + if (UNIVERSAL::isa($raw, 'Data::Grove::Parent')) { + return $raw; + } + + my @properties = ( Raw => $raw ); + + if (defined $parent) { + push @properties, Parent => $parent; + } + + my $dummy = bless {}, ref($raw); + tie %$dummy, $type, @properties; + return $dummy; +} + +sub TIEHASH { + my $type = shift; + + return bless { @_ }, $type; +} + +sub STORE { + my $self = shift; + my $key = shift; + my $value = shift; + + if (exists $self->{$key}) { + $self->{$key} = $value; + } else { + # VALIDATE + if (UNIVERSAL::isa($value, 'Data::Grove::Parent')) { + $value = $value->{Raw}; + } elsif (UNIVERSAL::isa($value, 'Data::Grove::ParentList')) { + $value = $value->[0]; + } + $self->{Raw}{$key} = $value; + } +} + +sub FETCH { + my $self = shift; + my $key = shift; + + if (exists $self->{$key}) { + return $self->{$key}; + } else { + my $value = $self->{Raw}{$key}; + if (ref($value) eq 'ARRAY') { + $value = Data::Grove::ParentList->new($value, $self); + } + return $value; + } +} + +sub FIRSTKEY { + my $self = shift; + my $raw = $self->{Raw}; + + $self->{'__each_in_raw'} = 1; + my $a = scalar 
keys %$raw;
    each %$raw;
}

# Tied-hash NEXTKEY: while the `__each_in_raw' flag is set, keys come
# from the raw object; once `each' on the raw hash is exhausted the
# flag is dropped and iteration continues over the wrapper's own hash.
sub NEXTKEY {
    my $self = shift;
    my $raw = $self->{Raw};

    if ($self->{'__each_in_raw'}) {
        my ($key, $value);
        if (($key, $value) = each %$raw) {
            return $key;
        }
        delete $self->{'__each_in_raw'};

        # `keys' resets the `each' iterator of the wrapper hash so the
        # second phase starts from its first key
        keys %$self;
    }

    return each %$self;
}

# Tied-hash EXISTS: true when the key is present in either the raw
# object or the wrapper itself.
sub EXISTS {
    my $self = shift;
    my $key = shift;

    return (exists $self->{Raw}{$key})
        || (exists $self->{$key});
}


# Tied-hash DELETE: keys held by the wrapper are deleted from the
# wrapper (except `Parent' and `Raw', which croak); every other key is
# deleted from the raw object.
sub DELETE {
    my $self = shift;
    my $key = shift;

    if (exists $self->{$key}) {
        croak "can't delete \`Parent' or \`Raw' properties\n"
            if ($key eq 'Parent' || $key eq 'Raw');
        delete $self->{$key};
    } else {
        delete $self->{'Raw'}{$key};
    }
}

# Tied-hash CLEAR: empties the raw object; the wrapper's own keys stay.
sub CLEAR {
    my $self = shift;

    %{ $self->{Raw} } = ();
}

#--------------------------------------------------------------------------
# Data::Grove::ParentList
#--------------------------------------------------------------------------

package Data::Grove::ParentList;

use UNIVERSAL;

# Returns an array reference tied to this class.  The tie object is
# `[ $raw, $parent ]': slot 0 is the underlying array, slot 1 the
# parent handed to every element that FETCH wraps.  A value that is
# already a Data::Grove::ParentList is returned unchanged.
sub new {
    my $type = shift;
    my $raw = shift;
    my $parent = shift;

    if (UNIVERSAL::isa($raw, 'Data::Grove::ParentList')) {
        return $raw;
    }

    my $dummy = [];
    tie @$dummy, $type, $raw, $parent;
    return $dummy;
}

sub TIEARRAY {
    my $type = shift;

    return bless [ @_ ], $type;
}

sub FETCHSIZE {
    scalar @{$_[0][0]};
}

sub STORESIZE {
    $#{$_[0][0]} = $_[1]-1;
}

# Returns what should actually be stored in the raw array for $value:
# the `Raw' object of a Data::Grove::Parent, the raw array of a
# Data::Grove::ParentList, or $value itself.  It works on a copy, so
# the caller's variable is never modified.
sub _unwrap {
    my $value = shift;

    # VALIDATE
    if (UNIVERSAL::isa($value, 'Data::Grove::Parent')) {
        return $value->{Raw};
    } elsif (UNIVERSAL::isa($value, 'Data::Grove::ParentList')) {
        return $value->[0];
    }

    return $value;
}

sub STORE {
    my $self = shift;
    my $index = shift;
    my $value = shift;

    $self->[0][$index] = _unwrap($value);
}

# Elements are wrapped on the way out: references become a
# Data::Grove::Parent around the element, plain scalars one around a
# fresh `{ Data => $value }' hash.  Undefined slots come back as undef.
sub FETCH {
    my $self = shift;
    my $index = shift;

    my $value = $self->[0][$index];
    if (defined $value) {
        if (ref($value)) {
            return Data::Grove::Parent->new($value, $self->[1]);
        } else {
            return Data::Grove::Parent->new({ Data => $value }, $self->[1]);
        }
    }

    return $value;
}

sub CLEAR {
    @{$_[0][0]} = ();
}

sub POP {
    pop(@{$_[0][0]});
}

# PUSH, UNSHIFT and SPLICE unwrap into a new list.  Assigning through
# the elements of @_ would overwrite the caller's own variables,
# because @_ aliases them.
sub PUSH {
    my $o = shift;

    push(@{$o->[0]}, map { _unwrap($_) } @_);
}

sub SHIFT {
    shift(@{$_[0][0]});
}

sub UNSHIFT {
    my $o = shift;

    unshift(@{$o->[0]}, map { _unwrap($_) } @_);
}

sub SPLICE
{
    my $ob  = shift;
    my $sz  = $ob->FETCHSIZE;
    my $off = @_ ? shift : 0;
    $off   += $sz if $off < 0;
    my $len = @_ ? shift : $sz-$off;

    return splice(@{$ob->[0]},$off,$len, map { _unwrap($_) } @_);
}

#--------------------------------------------------------------------------
# Data::Grove
#--------------------------------------------------------------------------

package Data::Grove;

# Follows `Parent' links upward and returns the first object that has
# no defined `Parent'.
sub root {
    my $self = shift;

    return $self
        if !defined $self->{Parent};

    return $self->{Parent}->root(@_);
}

# Returns the list of objects from the root down to, and including,
# this object.
sub rootpath {
    my $self = shift;

    if (defined $self->{Parent}) {
        return ($self->{Parent}->rootpath, $self);
    } else {
        return ($self);
    }
}

# Wraps this object in a Data::Grove::Parent with the given parent.
sub add_magic {
    my $self = shift;
    my $parent = shift;

    return Data::Grove::Parent->new($self, $parent);
}

1;

__END__

=head1 NAME

Data::Grove::Parent - provide parent properties to Data::Grove objects

=head1 SYNOPSIS

 use Data::Grove::Parent;

 $root = $object->root;
 $rootpath = $object->rootpath;
 $tied = $object->add_magic([ $parent ]);

 $node = Data::Grove::Parent->new($hash [, $parent]);
 $node_list =
Data::Grove::ParentList->new($array [, $parent]); + +=head1 DESCRIPTION + +Data::Grove::Parent is an extension to Data::Grove that adds +`C<Parent>' and `C<Raw>' properties to Data::Grove objects and methods +for returning the root node of a grove, a list of nodes between and +including the root node and the current node, and a method that +creates parented nodes. + +Data::Grove::Parent works by creating a Perl ``tied'' object that +contains a parent reference (`C<Parent>') and a reference to the +original Data::Grove object (`C<Raw>'). Tying-magic is used so that +every time you reference the Data::Grove::Parent object it actually +references the underlying raw object. + +When you retrieve a list or a property of the Raw object, +Data::Grove::Parent automatically adds magic to the returned list or +node. This means you only call `C<add_magic()>' once to create the first +Data::Grove::Parent object and then use the grove objects like you +normally would. + +The most obvious use of this is so you don't have to call a +`C<delete()>' method when you want to release a grove or part of a +grove; since Data::Grove and Data::Grove::Parent objects have no +cyclic references, Perl can garbage collect them normally. + +A secondary use is to allow you to reuse grove or property set +fragments in multiple trees. WARNING: Data::Grove currently does not +protect you from creating your B<own> cyclic references! This could +lead to infinite loops if you don't take care to avoid them. + +=head1 METHODS + +=over 4 + +=item $object->root() + +=item $object->rootpath() + +`C<root()>' returns the root node if `C<$object>' is a +`C<Data::Grove::Parent>' object. `C<rootpath()>' returns an array of +all the nodes between and including the root node and `C<$object>'. + +=item $tied = $object->add_magic([ $parent ]) + +`C<add_magic()>' returns a C<Data::Grove::Parent> object with +`C<$object>' as its `C<Raw>' object. If `C<$parent>' is given, that +becomes the tied object's parent object.

=back

=head1 AUTHOR

Ken MacLeod, ken@bitsko.slc.ut.us

=head1 SEE ALSO

perl(1), Data::Grove(3)

=cut
diff --git a/lib/Data/Grove/Visitor.pm b/lib/Data/Grove/Visitor.pm
new file mode 100644
index 0000000..6cf2e9f
--- /dev/null
+++ b/lib/Data/Grove/Visitor.pm
@@ -0,0 +1,212 @@
#
# Copyright (C) 1998,1999 Ken MacLeod
# Data::Grove::Visitor is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
# $Id: Visitor.pm,v 1.6 2000/03/20 23:06:45 kmacleod Exp $
#

use strict;
use 5.005;

package Data::Grove::Visitor;

use vars qw{ $VERSION };

# will be substituted by make-rel script
$VERSION = "0.08";

# The following methods extend Data::Grove
package Data::Grove;

# Dispatches to `$visitor->visit_TYPE($self, ...)', where TYPE is the
# value of the package variable `$type_name' in the object's own class.
# Returns the visitor's result, or an empty list when the class has no
# `$type_name' or the visitor has no matching method.
sub accept {
    my $self = shift;
    my $visitor = shift;

    my $package = ref($self);
    my $type_name;
    {
        # read $<package>::type_name by name instead of compiling a
        # string eval on every call
        no strict 'refs';
        $type_name = ${"${package}::type_name"};
    }
    if (!defined $type_name) {
        return ();              # no action
    }

    my $method_name = 'visit_' . $type_name;
    if ($visitor->can($method_name)) {
        return $visitor->$method_name ($self, @_);
    } else {
        return ();              # no action
    }
}

# Like `accept', but objects with a `Name' are first offered to
# `$visitor->visit_name_NAME' (non-word characters in NAME become `_').
# Objects without a `Name', and visitors without that method, fall back
# to `accept'.
sub accept_name {
    my $self = shift;

    if (!defined $self->{Name}) {
        return $self->accept (@_);
    }

    my $visitor = shift;

    my $name = $self->{Name};
    $name =~ s/\W/_/g;
    my $name_method = "visit_name_$name";

    # The lookup is done per call and never stored on the node: a
    # result remembered for one visitor is wrong for the next visitor
    # class used on the same grove.
    if ($visitor->can($name_method)) {
        return $visitor->$name_method ($self, @_);
    }

    return $self->accept ($visitor, @_);
}

# Visits the value of attribute $attr.  An array value has each of its
# items accepted; any other value is handed to
# `$visitor->visit_characters' wrapped in an XML::Grove::Characters
# object.  Returns an empty list when the object has no `Attributes' or
# the visitor has no `visit_characters'.
sub attr_accept {
    my $self = shift; my $attr = shift; my $visitor = shift;

    if (!defined $self->{Attributes}) {
        return ();              # no action
    }

    my $attrs = $self->{Attributes}{$attr};
    if (ref($attrs) eq 'ARRAY') {
        return $self->_children_accept ($attrs, $visitor, @_);
    } else {
        # looked up per call for the same reason as in accept_name
        if (!$visitor->can('visit_characters')) {
            return ();          # no action
        }
        # FIXME should be some other generic than XML::Grove::Characters
        return $visitor->visit_characters (XML::Grove::Characters->new(Data => $attrs), @_);
    }
}

# Calls `accept' on every item of `Contents'; empty list if there is no
# `Contents'.
sub children_accept {
    my $self = shift;

    if (defined $self->{Contents}) {
        return $self->_children_accept ($self->{Contents}, @_);
    } else {
        return ();              # no action
    }
}

# Calls `accept_name' on every item of `Contents'; empty list if there
# is no `Contents'.
sub children_accept_name {
    my $self = shift;

    if (defined $self->{Contents}) {
        return $self->_children_accept_name ($self->{Contents}, @_);
    } else {
        return ();              # no action
    }
}

# Accepts each item of $array in order and returns the concatenated
# results.
sub _children_accept {
    my $self = shift; my $array = shift; my $visitor = shift;

    my @return;
    my $ii;
    for ($ii = 0; $ii <= $#$array; $ii ++) {
        push @return, $array->[$ii]->accept ($visitor, @_);
    }

    return @return;
}

# Same as _children_accept, using `accept_name' on each item.
sub _children_accept_name {
    my $self = shift; my $array = shift; my $visitor = shift;

    my @return;
    my $ii;
    for ($ii = 0; $ii <= $#$array; $ii ++) {
        push @return, $array->[$ii]->accept_name ($visitor, @_);
    }

    return @return;
}

1;

__END__

=head1 NAME

Data::Grove::Visitor - add visitor/callback methods to Data::Grove objects

=head1 SYNOPSIS

 use Data::Grove::Visitor;

 @results = $object->accept ($visitor, ...);
 @results = $object->accept_name ($visitor, ...);
 @results = $object->children_accept ($visitor, ...);
 @results = $object->children_accept_name ($visitor, ...);

=head1 DESCRIPTION

Data::Grove::Visitor adds visitor methods (callbacks) to Data::Grove
objects.  A ``visitor'' is a class (a package) you write that has
methods (subs) corresponding to the objects in the classes being
visited.
You use the visitor methods by creating an instance of your +visitor class, and then calling `C<accept($my_visitor)>' on the +top-most object you want to visit; that object will in turn call your +visitor back with `C<visit_I<OBJECT>>', where I<OBJECT> is the type of +object. + +There are several forms of `C<accept>'. Simply calling `C<accept>' +calls your package back using the object type of the object you are +visiting. Calling `C<accept_name>' on an element object calls you +back with `C<visit_name_I<NAME>>' where I<NAME> is the tag name of the +element; on all other objects it's as if you called `C<accept>'. + +All of the forms of `C<accept>' return a concatenated list of the +result of all `C<visit>' methods. + +`C<children_accept>' calls `C<accept>' on each of the children of the +element. This is generally used in element callbacks to recurse down +into the element's children; you don't need to get the element's +contents and call `C<accept>' on each item. `C<children_accept_name>' +does the same but calling `C<accept_name>' on each of the children. +`C<attr_accept>' calls `C<accept>' on each of the objects in the named +attribute. + +Refer to the documentation of the classes you are visiting +(XML::Grove, etc.) for the type names (`C<element>', `C<document>', +etc.) of the objects it implements. + +=head1 RESERVED NAMES + +The hash keys `C<Contents>' and `C<Name>' are used to indicate objects +with children (for `C<children_accept>') and named objects (for +`C<accept_name>'). + +=head1 NOTES + +These are random ideas that haven't been implemented yet: + +=over 4 + +=item * + +Several objects fall into subclasses, or you may want to be able to +subclass a visited object and still be able to tell the difference. +In SGML::Grove I had used the package name in the callback +(`C<visit_SGML_Element>') instead of a generic name +(`C<visit_element>'). The idea here would be to try calling +`C<visit_I<PACKAGE>>' with the most specific class first, then try +superclasses, and lastly to try the generic.

=back

=head1 AUTHOR

Ken MacLeod, ken@bitsko.slc.ut.us

=head1 SEE ALSO

perl(1), Data::Grove

Extensible Markup Language (XML)

=cut
diff --git a/lib/XML/ESISParser.pm b/lib/XML/ESISParser.pm
new file mode 100644
index 0000000..f65b3aa
--- /dev/null
+++ b/lib/XML/ESISParser.pm
@@ -0,0 +1,739 @@
#
# Copyright (C) 1999 Ken MacLeod
# See the file COPYING for distribution terms.
#
# $Id: ESISParser.pm,v 1.9 2000/03/02 20:18:09 kmacleod Exp $
#

use strict;

use IO::File;
use UNIVERSAL;

package XML::ESISParser;

use vars qw{ $VERSION $NSGMLS_sgml $NSGMLS_FLAGS_sgml $NSGMLS_ENV_sgml
             $NSGMLS_xml $NSGMLS_FLAGS_xml $NSGMLS_ENV_xml
             $XML_DECL };

# will be substituted by make-rel script
$VERSION = "0.08";

$NSGMLS_sgml = 'nsgmls';
$NSGMLS_FLAGS_sgml = '-oentity -oempty -onotation-sysid -oincluded -oline -E0';
$NSGMLS_ENV_sgml = '';

$NSGMLS_xml = 'nsgmls';
$XML_DECL = '/usr/lib/sgml/declaration/xml.decl';
$NSGMLS_FLAGS_xml = '-oentity -oempty -onotation-sysid -oline -oincluded -wxml -E0 ';
$NSGMLS_ENV_xml = 'SP_CHARSET_FIXED=YES SP_ENCODING=XML';

# Creates a parser whose default options are given either as key/value
# pairs or as one hash reference (which is copied).
sub new {
    my $type = shift;
    my $self = (scalar (@_) == 1 && ref($_[0])) ? { %{ $_[0] } } : { @_ };

    return bless $self, $type;
}

# Parses one document and returns whatever `parse_fh' returned.
# Accepts a single string (treated as `Source => { String => ... }'),
# a single hash reference, or key/value pairs; these override the
# instance defaults for this call only.  Dies if the instance is
# already parsing, if no source is given, or if the parse fails.
sub parse {
    my $self = shift;

    die "XML::ESISParser: parser instance ($self) already parsing\n"
        if (defined $self->{ParseOptions});

    # If there's one arg and it has no ref, it's a string
    my $args;
    if (scalar (@_) == 1 && !ref($_[0])) {
        $args = { Source => { String => shift } };
    } else {
        $args = (scalar (@_) == 1) ? shift : { @_ };
    }

    my $parse_options = { %$self, %$args };
    $self->{ParseOptions} = $parse_options;

    # Run the parse under eval so a failure releases the "already
    # parsing" guard; otherwise one bad document would make every later
    # parse() on this instance die.
    my $result;
    my $ok = eval { $result = $self->_parse_source ($parse_options); 1 };
    if (!$ok) {
        my $error = $@ || "XML::ESISParser::parse: unknown error\n";
        delete $self->{ParseOptions};
        die $error;
    }

    # clean up parser instance
    # NOTE(review): this also removes handlers that were given to new();
    # left as is because callers may rely on it -- confirm intent.
    delete $self->{ParseOptions};
    delete $self->{DocumentHandler};
    delete $self->{DTDHandler};
    delete $self->{ErrorHandler};

    return $result;
}

# Does the work of `parse' once the options are merged: checks that a
# source exists, fills in default handlers, builds the nsgmls command
# line and reads the ESIS stream from the first source found, in the
# order ESISStream, ByteStream, String, SystemId.
sub _parse_source {
    my ($self, $parse_options) = @_;

    # ensure that we have at least one source
    if (!defined $parse_options->{Source}
        || !(defined $parse_options->{Source}{String}
             || defined $parse_options->{Source}{ByteStream}
             || defined $parse_options->{Source}{SystemId}
             || defined $parse_options->{Source}{ESISStream})) {
        die "XML::ESISParser: no source defined for parse\n";
    }

    # assign default Handler to any undefined handlers
    if (defined $parse_options->{Handler}) {
        $parse_options->{DocumentHandler} = $parse_options->{Handler}
            if (!defined $parse_options->{DocumentHandler});
        $parse_options->{DTDHandler} = $parse_options->{Handler}
            if (!defined $parse_options->{DTDHandler});
        $parse_options->{ErrorHandler} = $parse_options->{Handler}
            if (!defined $parse_options->{ErrorHandler});
    }

    # create the NSGMLS command
    my ($nsgmls_command, $nsgmls, $nsgmls_flags);
    if (defined $parse_options->{NSGMLSCommand}) {
        $nsgmls_command = $parse_options->{NSGMLSCommand};
        # only used in error messages below
        $nsgmls = $nsgmls_command;
    } elsif (defined $parse_options->{IsSGML}
             && $parse_options->{IsSGML}) {
        my $declaration = (defined $parse_options->{Declaration})
            ? " " . $parse_options->{Declaration} : "";
        $nsgmls = $parse_options->{NSGMLS} = $NSGMLS_sgml;
        $nsgmls_flags = $parse_options->{NSGMLS_FLAGS} = $NSGMLS_FLAGS_sgml;
        $nsgmls_command = $parse_options->{NSGMLS_COMMAND} = "$nsgmls $nsgmls_flags $declaration";
    } else {
        my $declaration = (defined $parse_options->{Declaration})
            ? $parse_options->{Declaration} : $XML_DECL;
        $nsgmls = $parse_options->{NSGMLS} = $NSGMLS_xml;
        $nsgmls_flags = $parse_options->{NSGMLS_FLAGS} = $NSGMLS_FLAGS_xml;
        $nsgmls_command = $parse_options->{NSGMLS_COMMAND} = "$NSGMLS_ENV_xml $nsgmls $nsgmls_flags $declaration";
    }

    my $source = $parse_options->{Source};
    my $result;
    if (defined $source->{ESISStream}) {
        # read ESIS stream directly
        my $system_id = (defined $source->{SystemId})
            ? "\`$source->{SystemId}'" : 'ESIS Stream';
        eval { $result = $self->parse_fh ($source->{ESISStream}) };
        my $retval = $@;

        if ($retval) {
            die "XML::ESISParser::parse: unable to parse \`$system_id'\n$retval";
        }
    } elsif (defined $source->{ByteStream}) {
        # call nsgmls using file handle
        # FIXME special case stdin?

        # For ByteStreams (Perl file handles) we create a sub-process
        # that we feed the XML/SGML document and we get back the ESIS
        # stream
        my $retval;
        my $system_id = (defined $source->{SystemId})
            ? "\`$source->{SystemId}'" : 'Byte Stream';
        my ($pid) = open (ESIS, "-|");

        # an undefined pid means the fork failed; without this check
        # the parent would compare undef == 0 and act as the child
        die "XML::ESISParser::parse: can't fork to run \`$nsgmls' on \`$system_id': $!\n"
            if (!defined $pid);

        if ($pid == 0) {
            # 20% speed increase if grep swipes implieds (only 8% if
            # we do it in `parse_fh').  XXX use a C routine or patch SP
            open (SGML, "| $nsgmls_command 2>&1 | egrep -v '^A.* IMPLIED\$'")
                or die "XML::ESISParser::parse: can't run \`$nsgmls' on \`$system_id'\n";

            # copy the document from the byte stream into nsgmls
            my $byte_stream = $source->{ByteStream};
            my $chunk;
            while (defined ($chunk = $byte_stream->getline ())) {
                print SGML $chunk;
            }

            close (SGML)
                or die "XML::ESISParser::parse: can't run \`$nsgmls' on \`$system_id'\n";

            exit 0;
        } else {
            eval { $result = $self->parse_fh (*ESIS) };
            $retval = $@;
            wait;               # clean up that process
        }
        close (ESIS);

        $source->{ByteStream}->close ();

        if ($retval) {
            die "XML::ESISParser::parse: unable to parse \`$system_id'\n$retval";
        }
    } elsif (defined $source->{String}) {
        # call nsgmls with a literal string
        # NOTE(review): not implemented -- a String source produces no
        # events and an undefined result.
    } elsif (defined $source->{SystemId}) {
        # if SystemId is a file, call nsgmls with file name
        # otherwise, open stream on SystemId and do ByteStream

        # FIXME this only handles file SystemIds right now
        # 20% speed increase if grep swipes implieds (only 8% if
        # we do it in `parse').  XXX use a C routine or patch SP
        my $system_id = $source->{SystemId};

        # the name goes inside single quotes on a shell command line,
        # so embedded single quotes have to be escaped
        (my $quoted_id = $system_id) =~ s/'/'\\''/g;
        my ($fh) = IO::File->new
            ("$nsgmls_command '$quoted_id' 2>&1 | egrep -v '^A.* IMPLIED\$' |");
        die "XML::ESISParser::parse: can't run \`$nsgmls' on \`$system_id'\n"
            if (!defined $fh);

        eval { $result = $self->parse_fh ($fh) };
        my $retval = $@;

        close ($fh);

        if ($retval) {
            die "XML::ESISParser::parse: unable to parse \`$system_id'\n$retval";
        }
    }

    return $result;
}

#
# Parse the `ESIS' information coming from `file'
#
# Reads the stream line by line; the first character of each line is
# the command and the rest its data.  Events go to the document, DTD
# and error handlers found in the parse options, and only for methods
# the handler implements.  Returns the result of the document handler's
# `end_document', or an empty list if it has none.
#

sub parse_fh {
    my ($self, $file) = @_;
    my (@attributes, @properties, $files);

    my $options = $self->{ParseOptions} || {};

    # A missing handler is replaced by an object of an empty class:
    # `can()' is then false for every event, so events are skipped
    # instead of dying on a method call on undef.
    my $null_h = bless {}, 'XML::ESISParser::NullHandler';
    my $doc_h = defined $options->{DocumentHandler}
        ? $options->{DocumentHandler} : $null_h;
    my $dtd_h = defined $options->{DTDHandler}
        ? $options->{DTDHandler} : $null_h;
    my $err_h = defined $options->{ErrorHandler}
        ? $options->{ErrorHandler} : $null_h;

    # we cache these most commonly used `can()' calls
    my $can_start_element = $doc_h->can('start_element');
    my $can_end_element = $doc_h->can('end_element');
    my $can_characters = $doc_h->can('characters');
    my $can_record_end = $doc_h->can('record_end');

    # Lines beginning with "<program>:" are nsgmls diagnostics (stderr
    # is merged into the stream).  `NSGMLS' is unset when the caller
    # gave a complete NSGMLSCommand, hence the default.
    my $nsgmls_name = defined $options->{NSGMLS} ? $options->{NSGMLS} : 'nsgmls';
    my $nsgmls_error = qr/^\Q$nsgmls_name\E:/;

    # Reports one run of decoded data: as an SData entity reference
    # when inside `\|' markers, as characters otherwise.  Empty runs
    # are dropped.
    my $emit_data = sub {
        my ($text, $is_sdata) = @_;

        return if ($text eq '');
        if ($is_sdata) {
            $doc_h->internal_entity_ref({ Name => $self->{'internal_entities_by_value'}{$text} })
                if ($doc_h->can('internal_entity_ref'));
        } else {
            $doc_h->characters({ Data => $text })
                if ($can_characters);
        }
    };

    my $line = 0;
    $doc_h->start_document( { } )
        if ($doc_h->can('start_document'));

    # 30% speed improvement by breaking the encapsulation
    my ($is_filehandle) = (ref ($file) eq "FileHandle"
                           || ref ($file) eq "IO::File");
    local $_;
    while (defined ($_ = ($is_filehandle ? <$file> : $file->getline()))) {
        # chomp, not chop: an unterminated last line must keep its
        # final character
        chomp;

        if (/^A/) {                     # attribute
            # Note: the output of `nsgmls' is `grep -v'ed to get rid of
            # IMPLIED attributes, if we do it here we only get an 8%
            # speed boost

            my ($name, $type, $value) = split (/\s/, $', 3);

            push (@attributes, $name => $value);

            next;
        }

        if (/^\(/) {                    # start element
            # break the encapsulation for an 8% boost
            if ($#attributes >= 0) {
                push (@properties, Attributes => { @attributes });
            }
            $doc_h->start_element ({ Name => $', @properties })
                if ($can_start_element);

            @properties = (); @attributes = ();
            next;
        }

        if (/^\)/) {                    # end element
            $doc_h->end_element ({ Name => $' })
                if ($can_end_element);

            next;
        }

        if (/^L/) {                     # line number
            $line = $';

            next;
        }

        if (/^-/) {                     # data (including sdata entities)
            # This section is derived from David Megginson's SGMLSpm
            my $sdata_flag = 0;
            my $out = '';
            my $data = $';

            while ($data =~ /\\(\\|n|\||[0-7]{1,3})/) {
                my $escape = $1;
                $out .= $`;
                $data = $';

                if ($escape eq '|') {
                    # beginning or end of SDATA
                    $emit_data->($out, $sdata_flag);
                    $out = '';
                    $sdata_flag = !$sdata_flag;

                } elsif ($escape eq 'n') {
                    # record end
                    $emit_data->($out, $sdata_flag);
                    $out = '';
                    if ($can_record_end) {
                        $doc_h->record_end( { } );
                    } else {
                        $doc_h->characters({ Data => "\n" })
                            if ($can_characters);
                    }
                } elsif ($escape eq '\\') {
                    $out .= '\\';
                } else {
                    $out .= chr(oct($escape));
                }
            }
            $out .= $data;
            $emit_data->($out, $sdata_flag);

            next;
        }

        if (/^s/) {                     # sysid
            push (@properties, SystemId => $');

            next;
        }

        if (/^p/) {                     # pubid
            push (@properties, PublicId => $');

            next;
        }

        if (/^f/) {                     # file
            if (!defined $files) {
                $files = $';
            } elsif (!ref $files) {
                $files = [ $files, $' ];
            } else {
                push (@$files, $');
            }

            next;
        }

        if (/^E/) {                     # external entity definition
            my ($entity_data) = $';
            $entity_data =~ /^(\S+) (\S+) (\S+)$/
                or die "XML::ESISParser::parse_fh: bad external entity event data: $entity_data\n";
            my ($name,$type,$notation) = ($1,$2,$3);
            if (defined $files) {
                push (@properties, GeneratedId => $files);
            }
            $dtd_h->external_entity_decl ({ Name => $name, Type => $type,
                                            Notation => $notation, @properties })
                if ($dtd_h->can('external_entity_decl'));

            @properties = (); undef $files;
            next;
        }

        if (/^I/) {                     # internal entity definition
            my ($name, $type, $value) = split (/\s/, $', 3);
            $self->{'internal_entities_by_value'}{$value} = $name;
            $dtd_h->internal_entity_decl ({ Name => $name, Type => $type,
                                            Value => $value })
                if ($dtd_h->can('internal_entity_decl'));

            next;
        }

        if (/^&/) {                     # external entity reference
            my ($name) = $';
            $doc_h->external_entity_ref({ Name => $name })
                if ($doc_h->can('external_entity_ref'));

            next;
        }

        if (/^\?/) {                    # processing instruction (PI)
            my ($data) = $';
            if ($options->{IsSGML}) {
                $doc_h->processing_instruction({ Data => $data })
                    if ($doc_h->can('processing_instruction'));
            } else {
                my ($target, $pi_data) = split (/\s+/, $data, 2);
                $doc_h->processing_instruction({ Target => $target, Data => $pi_data })
                    if ($doc_h->can('processing_instruction'));
            }

            next;
        }

        if (/^N/) {                     # notation definition
            my ($name) = $';
            if (defined $files) {
                push (@properties, GeneratedId => $files);
            }
            $dtd_h->notation_decl ({ Name => $name, @properties })
                if ($dtd_h->can('notation_decl'));

            @properties = (); undef $files;
            next;
        }

        if (/^S/) {                     # subdoc definition
            my ($name) = $';
            if (defined $files) {
                push (@properties, GeneratedId => $files);
            }
            $dtd_h->subdoc_entity_decl ({ Name => $name, @properties })
                if ($dtd_h->can('subdoc_entity_decl'));

            @properties = (); undef $files;
            next;
        }

        if (/^T/) {                     # external SGML text entity definition
            my ($name) = $';
            if (defined $files) {
                push (@properties, GeneratedId => $files);
            }
            $dtd_h->external_sgml_entity_decl ({ Name => $name, @properties })
                if ($dtd_h->can('external_sgml_entity_decl'));

            @properties = (); undef $files;
            next;
        }

        if (/^D/) {                     # data attribute
            # FIXME
            my $message = "XML::ESISParser: can't handle data attributes yet\n";
            if ($err_h->can('error')) {
                $err_h->error ({ Message => $message });
            } else {
                die "$message";
            }

            next;
        }

        # nsgmls writes link attributes with the command character `a';
        # testing for `D' a second time could never match
        if (/^a/) {                     # link attribute
            # FIXME
            my $message = "XML::ESISParser: can't handle link attributes yet\n";
            if ($err_h->can('error')) {
                $err_h->error ({ Message => $message });
            } else {
                die "$message";
            }

            next;
        }

        if (/^\{/) {                    # subdoc start
            my ($name) = $';
            $doc_h->start_subdoc ({ Name => $name })
                if ($doc_h->can('start_subdoc'));

            next;
        }

        if (/^\}/) {                    # subdoc end
            my ($name) = $';
            $doc_h->end_subdoc ({ Name => $name })
                if ($doc_h->can('end_subdoc'));

            next;
        }

        if (/^#/) {                     # appinfo
            my ($text) = $';
            $doc_h->appinfo ({ Text => $text })
                if ($doc_h->can('appinfo'));

            next;
        }

        if (/^i/) {                     # next element is an included subelement
            push (@properties, IncludedSubelement => 1);

            next;
        }

        if (/^e/) {                     # next element is declared empty
            push (@properties, Empty => 1);

            next;
        }

        if (/^C/) {                     # conforming
            $doc_h->conforming({})
                if ($doc_h->can('conforming'));

            next;
        }

        if ($_ =~ $nsgmls_error) {      # `nsgmls' error
            my $message = $_;
            if ($err_h->can('error')) {
                $err_h->error ({ Message => $message });
            } else {
                die "$message\n";
            }

            next;
        }

        my ($op) = substr ($_, 0, 1);
        my $message = "XML::ESISParser::parse_fh: ESIS command character \`$op' not recognized when reading line \`$_' around line $line ($.)";
        if ($err_h->can('error')) {
            $err_h->error ({ Message => $message });
        } else {
            die "$message";
        }
    }

    if ($doc_h->can('end_document')) {
        return $doc_h->end_document({});
    } else {
        return ();
    }
}

1;

__END__

=head1 NAME

XML::ESISParser - Perl SAX parser using nsgmls

=head1 SYNOPSIS

 use XML::ESISParser;

 $parser = XML::ESISParser->new( [OPTIONS] );
 $result = $parser->parse( [OPTIONS] );

 $result = $parser->parse($string);

=head1 DESCRIPTION

C<XML::ESISParser> is a Perl SAX parser using the `nsgmls' command of
James Clark's SGML Parser (SP), a validating XML and SGML parser.
This man page summarizes the specific options, handlers, and
properties supported by C<XML::ESISParser>; please refer to the Perl
SAX standard in `C<PerlSAX.pod>' for general usage information.

C<XML::ESISParser> defaults to parsing XML and has an option for
parsing SGML.

`C<nsgmls>' source, and binaries for some platforms, is available from
<http://www.jclark.com/>.  `C<nsgmls>' is included in both the SP and
Jade packages.

=head1 METHODS

=over 4

=item new

Creates a new parser object.  Default options for parsing, described
below, are passed as key-value pairs or as a single hash.  Options may
be changed directly in the parser object unless stated otherwise.
Options passed to `C<parse()>' override the default options in the
parser object for the duration of the parse.
+ +=back + +=head1 OPTIONS + +The following options are supported by C<XML::ESISParser>: + + Handler default handler to receive events + DocumentHandler handler to receive document events + DTDHandler handler to receive DTD events + ErrorHandler handler to receive error events + Source hash containing the input source for parsing + IsSGML the document to be parsed is in SGML + +If no handlers are provided then all events will be silently ignored. + +If a single string argument is passed to the `C<parse()>' method, it +is treated as if a `C<Source>' option was given with a `C<String>' +parameter. + +The `C<Source>' hash may contain the following parameters: + + ByteStream The raw byte stream (file handle) containing the + document. + String A string containing the document. + SystemId The system identifier (URI) of the document. + +If more than one of `C<ByteStream>', `C<String>', or `C<SystemId>' is given, +then preference is given first to `C<ByteStream>', then `C<String>', +then `C<SystemId>'. + +=head1 HANDLERS + +The following handlers and properties are supported by +C<XML::ESISParser>: + +=head2 DocumentHandler methods + +=over 4 + +=item start_document + +Receive notification of the beginning of a document. + +No properties defined. + +=item end_document + +Receive notification of the end of a document. + +No properties defined. + +=item start_element + +Receive notification of the beginning of an element. + + Name The element type name. + Attributes A hash containing the attributes attached to the + element, if any. + IncludedSubelement This element is an included subelement. + Empty This element is declared empty. + +The `C<Attributes>' hash contains only string values. The `C<Empty>' +flag is not set for an element that merely has no content; it is set +only if the DTD declares it empty. + +BETA: Attribute values currently do not expand SData entities into +entity objects; they are still in the system data notation used by +nsgmls (inside `|').
A future version of XML::ESISParser will also +convert other types of attributes into their respective objects; +currently just their notation or entity names are given. + +=item end_element + +Receive notification of the end of an element. + + Name The element type name. + +=item characters + +Receive notification of character data. + + Data The characters from the document. + +=item record_end + +Receive notification of a record end sequence. XML applications +should convert this to a new-line. + +=item processing_instruction + +Receive notification of a processing instruction. + + Target The processing instruction target in XML. + Data The processing instruction data, if any. + +=item internal_entity_ref + +Receive notification of a system data (SData) internal entity +reference. + + Name The name of the internal entity reference. + +=item external_entity_ref + +Receive notification of an external entity reference. + + Name The name of the external entity reference. + +=item start_subdoc + +Receive notification of the start of a sub document. + + Name The name of the external entity reference. + +=item end_subdoc + +Receive notification of the end of a sub document. + + Name The name of the external entity reference. + +=item conforming + +Receive notification that the document just parsed conforms to its +document type declaration (DTD). + +No properties defined. + +=back + +=head2 DTDHandler methods + +=over 4 + +=item external_entity_decl + +Receive notification of an external entity declaration. + + Name The entity's entity name. + Type The entity's type (CDATA, NDATA, etc.) + SystemId The entity's system identifier. + PublicId The entity's public identifier, if any. + GeneratedId Generated system identifiers, if any. + +=item internal_entity_decl + +Receive notification of an internal entity declaration. + + Name The entity's entity name. + Type The entity's type (CDATA, NDATA, etc.) + Value The entity's character value.
+ +=item notation_decl + +Receive notification of a notation declaration. + + Name The notation's name. + SystemId The notation's system identifier. + PublicId The notation's public identifier, if any. + GeneratedId Generated system identifiers, if any. + +=item subdoc_entity_decl + +Receive notification of a subdocument entity declaration. + + Name The entity's entity name. + SystemId The entity's system identifier. + PublicId The entity's public identifier, if any. + GeneratedId Generated system identifiers, if any. + +=item external_sgml_entity_decl + +Receive notification of an external SGML-entity declaration. + + Name The entity's entity name. + SystemId The entity's system identifier. + PublicId The entity's public identifier, if any. + GeneratedId Generated system identifiers, if any. + +=back + +=head1 AUTHOR + +Ken MacLeod, ken@bitsko.slc.ut.us + +=head1 SEE ALSO + +perl(1), PerlSAX.pod(3) + + Extensible Markup Language (XML) + SAX 1.0: The Simple API for XML + SGML Parser (SP) + +=cut diff --git a/lib/XML/Handler/CanonXMLWriter.pm b/lib/XML/Handler/CanonXMLWriter.pm new file mode 100644 index 0000000..27ef03f --- /dev/null +++ b/lib/XML/Handler/CanonXMLWriter.pm @@ -0,0 +1,180 @@ +# +# Copyright (C) 1998, 1999 Ken MacLeod +# XML::Handler::CanonXMLWriter is free software; you can redistribute +# it and/or modify it under the same terms as Perl itself. 
#
# $Id: CanonXMLWriter.pm,v 1.2 1999/12/22 21:15:00 kmacleod Exp $
#

use strict;

package XML::Handler::CanonXMLWriter;
use vars qw{ $VERSION %char_entities };

# will be substituted by make-rel script
$VERSION = "0.08";

# Replacement text for every character that `_escape' rewrites: tab,
# line feed and carriage return become numeric character references,
# the markup characters become the predefined entities.
%char_entities = (
    "\x09" => '&#9;',
    "\x0a" => '&#10;',
    "\x0d" => '&#13;',
    '&' => '&amp;',
    '<' => '&lt;',
    '>' => '&gt;',
    '"' => '&quot;',
);

# Creates a writer; the key/value arguments become the object's options
# (`IOHandle' and `PrintComments' are the ones read by this class).
sub new {
    my ($class, %args) = @_;

    my $self = \%args;
    return bless $self, $class;
}

# Starts a fresh output buffer.
sub start_document {
    my $self = shift; my $document = shift;

    $self->{'_text_array'} = [];
}

# Returns the accumulated document as one string, or an empty list when
# output went to `IOHandle' instead.
sub end_document {
    my $self = shift; my $document = shift;

    if (defined $self->{IOHandle}) {
        return ();
    } else {
        my $text = join ('', @{$self->{'_text_array'}});
        undef $self->{'_text_array'};
        return $text;
    }
}

# Writes the start tag with its attributes sorted by name and their
# values escaped.
sub start_element {
    my $self = shift; my $element = shift;

    $self->_print('<' . $element->{Name});
    my $key;
    my $attrs = $element->{Attributes};
    foreach $key (sort keys %$attrs) {
        $self->_print(" $key=\"" . $self->_escape($attrs->{$key}) . '"');
    }
    $self->_print('>');
}

# Writes the end tag.
sub end_element {
    my $self = shift; my $element = shift;

    $self->_print('</' . $element->{Name} . '>');
}

# Writes escaped character data.
sub characters {
    my $self = shift; my $characters = shift;

    $self->_print($self->_escape($characters->{Data}));
}

# Whitespace is written exactly like character data.
sub ignorable_whitespace {
    my $self = shift; my $characters = shift;

    $self->_print($self->_escape($characters->{Data}));
}

# Writes `<?target data?>'; the data is not escaped.
sub processing_instruction {
    my $self = shift; my $pi = shift;

    $self->_print('<?' . $pi->{Target} . ' ' . $pi->{Data} . '?>');
}

sub entity {
    # entities don't occur in text
    return ();
}

# Writes the comment only when the `PrintComments' option is true.
sub comment {
    my $self = shift; my $comment = shift;

    if ($self->{PrintComments}) {
        $self->_print('<!--' . $comment->{Data} . '-->');
    } else {
        return ();
    }
}

# Sends $string to `IOHandle' if one is set, otherwise appends it to
# the in-memory buffer.
sub _print {
    my $self = shift; my $string = shift;

    if (defined $self->{IOHandle}) {
        $self->{IOHandle}->print($string);
        return ();
    } else {
        push @{$self->{'_text_array'}}, $string;
    }
}

# Returns $string with every character listed in %char_entities
# replaced by its reference.
sub _escape {
    my $self = shift; my $string = shift;

    $string =~ s/([\x09\x0a\x0d&<>"])/$char_entities{$1}/ge;
    return $string;
}

1;

__END__

=head1 NAME

XML::Handler::CanonXMLWriter - output XML in canonical XML format

=head1 SYNOPSIS

 use XML::Handler::CanonXMLWriter;

 $writer = XML::Handler::CanonXMLWriter->new( [OPTIONS] );
 $parser->parse(Handler => $writer);

=head1 DESCRIPTION

C<XML::Handler::CanonXMLWriter> is a PerlSAX handler that will return
a string or write a stream of canonical XML for an XML instance and its
content.

C<XML::Handler::CanonXMLWriter> objects hold the options used for
writing the XML objects.  Options can be supplied when the object
is created,

    $writer = XML::Handler::CanonXMLWriter->new(PrintComments => 1);

or modified at any time before calling the parser's `C<parse()>' method:

    $writer->{PrintComments} = 0;

=head1 OPTIONS

=over 4

=item IOHandle

IOHandle contains a handle for writing the canonical XML to.  If an
IOHandle is not provided, the canonical XML string will be returned
from `C<parse()>'.

=item PrintComments

By default comments are not written to the output.  Setting
PrintComments to a true value will include comments in the output.

=back

=head1 AUTHOR

Ken MacLeod, ken@bitsko.slc.ut.us

=head1 SEE ALSO

perl(1), PerlSAX

James Clark's Canonical XML definition


=cut
diff --git a/lib/XML/Handler/Sample.pm b/lib/XML/Handler/Sample.pm
new file mode 100644
index 0000000..a73d358
--- /dev/null
+++ b/lib/XML/Handler/Sample.pm
@@ -0,0 +1,101 @@
# This template file is in the Public Domain.
# You may do anything you want with this file.
#
# $Id: Sample.pm,v 1.4 1999/08/16 16:04:03 kmacleod Exp $
#

package XML::Handler::Sample;

use strict;
use vars qw{ $AUTOLOAD };

# Creates a handler from either one hash reference (blessed as is) or a
# list of key/value pairs.
sub new {
    my $type = shift;
    my $self = ( $#_ == 0 ) ? shift : { @_ };

    return bless $self, $type;
}

# Every handler below prints the name of the event it received.

# Basic PerlSAX
sub start_document            { print "start_document\n"; }
sub end_document              { print "end_document\n"; }
sub start_element             { print "start_element\n"; }
sub end_element               { print "end_element\n"; }
sub characters                { print "characters\n"; }
sub processing_instruction    { print "processing_instruction\n"; }
sub ignorable_whitespace      { print "ignorable_whitespace\n"; }

# Additional expat callbacks in XML::Parser::PerlSAX
sub comment                   { print "comment\n"; }
sub notation_decl             { print "notation_decl\n"; }
sub unparsed_entity_decl      { print "unparsed_entity_decl\n"; }
sub entity_decl               { print "entity_decl\n"; }
sub element_decl              { print "element_decl\n"; }
sub doctype_decl              { print "doctype_decl\n"; }
sub xml_decl                  { print "xml_decl\n"; }

# Additional SP/nsgmls callbacks in XML::ESISParser
sub start_subdoc              { print "start_subdoc\n"; }
sub end_subdoc                { print "end_subdoc\n"; }
sub appinfo                   { print "appinfo\n"; }
sub internal_entity_ref       { print "internal_entity_ref\n"; }
sub external_entity_ref       { print "external_entity_ref\n"; }
sub record_end                { print "record_end\n"; }
sub internal_entity_decl      { print "internal_entity_decl\n"; }
sub external_entity_decl      { print "external_entity_decl\n"; }
sub external_sgml_entity_decl { print "external_sgml_entity_decl\n"; }
sub subdoc_entity_decl        { print "subdoc_entity_decl\n"; }
sub notation                  { print "notation\n"; }
sub error                     { print "error\n"; }
sub conforming                { print "conforming\n"; }

# Others
#
# Any event without a handler above ends up here and is reported as
# unrecognized; object destruction is ignored.
sub AUTOLOAD {
    my $self = shift;

    my $method = $AUTOLOAD;
    $method =~ s/.*:://;
    return if $method eq 'DESTROY';

    print "UNRECOGNIZED $method\n";
}

1;

__END__

=head1 NAME

XML::Handler::Sample - a trivial PerlSAX handler

=head1 SYNOPSIS

use XML::Parser::PerlSAX;
+ use XML::Handler::Sample;
+
+ $my_handler = XML::Handler::Sample->new;
+
+ XML::Parser::PerlSAX->new->parse(Source => { SystemId => 'REC-xml-19980210.xml' },
+                                  Handler => $my_handler);
+
+=head1 DESCRIPTION
+
+C<XML::Handler::Sample> is a trivial PerlSAX handler that prints out
+the name of each event it receives.  The source for
+C<XML::Handler::Sample> lists all the currently known PerlSAX
+handler methods.
+
+C<XML::Handler::Sample> is intended for Perl module authors who wish
+to look at example PerlSAX handler modules.  C<XML::Handler::Sample>
+can be used as a template for writing your own PerlSAX handler
+modules.  C<XML::Handler::Sample> is in the Public Domain and can be
+used for any purpose without restriction.
+
+=head1 AUTHOR
+
+Ken MacLeod, ken@bitsko.slc.ut.us
+
+=head1 SEE ALSO
+
+perl(1), PerlSAX.pod(3)
+
+=cut
diff --git a/lib/XML/Handler/Subs.pm b/lib/XML/Handler/Subs.pm
new file mode 100644
index 0000000..acbc222
--- /dev/null
+++ b/lib/XML/Handler/Subs.pm
@@ -0,0 +1,177 @@
+#
+# Copyright (C) 1999 Ken MacLeod
+# XML::Handler::Subs is free software; you can redistribute it and/or
+# modify it under the same terms as Perl itself.
+#
+# $Id: Subs.pm,v 1.2 1999/12/22 21:15:00 kmacleod Exp $
+#
+
+use strict;
+
+package XML::Handler::Subs;
+
+use UNIVERSAL;
+
+use vars qw{ $VERSION };
+
+# will be substituted by make-rel script
+$VERSION = "0.08";
+
+# new( \%options ) or new( key => value, ... )
+#
+# The options are copied into a fresh hash, so a hash reference passed
+# by the caller is never blessed or modified.
+sub new {
+    my $type = shift;
+    my $self = (@_ == 1) ? { %{ $_[0] } } : { @_ };
+
+    return bless $self, $type;
+}
+
+# Start a document with empty element-name and element-node stacks.
+sub start_document {
+    my ($self, $document) = @_;
+
+    $self->{Names} = [];
+    $self->{Nodes} = [];
+}
+
+# Drop the stacks; returns an empty list.
+sub end_document {
+    my ($self, $document) = @_;
+
+    delete $self->{Names};
+    delete $self->{Nodes};
+
+    return();
+}
+
+# Push the element on both stacks, then call the `s_NAME' method if the
+# object has one.  NAME is the element name with every character other
+# than [a-zA-Z0-9_] replaced by `_'.  Returns 1 if a method was called,
+# 0 otherwise.
+sub start_element {
+    my ($self, $element) = @_;
+
+    push @{$self->{Names}}, $element->{Name};
+    push @{$self->{Nodes}}, $element;
+
+    my $el_name = "s_" . $element->{Name};
+    $el_name =~ s/[^a-zA-Z0-9_]/_/g;
+    if ($self->can($el_name)) {
+        $self->$el_name($element);
+        return 1;
+    }
+
+    return 0;
+}
+
+# Call the `e_NAME' method if the object has one (same name mapping as
+# start_element), then pop the element off both stacks.  The pop comes
+# last so the method still sees the element on the stacks.  Returns 1
+# if a method was called, 0 otherwise.
+sub end_element {
+    my ($self, $element) = @_;
+
+    my $called_sub = 0;
+    my $el_name = "e_" . $element->{Name};
+    $el_name =~ s/[^a-zA-Z0-9_]/_/g;
+    if ($self->can($el_name)) {
+        $self->$el_name($element);
+        $called_sub = 1;
+    }
+
+    pop @{$self->{Names}};
+    pop @{$self->{Nodes}};
+
+    return $called_sub;
+}
+
+# True if $name is the name of the innermost open element.  With no
+# open element (or an undefined $name) the answer is false; previously
+# this compared against undef, which reported true for an undefined
+# $name.
+sub in_element {
+    my ($self, $name) = @_;
+
+    my $names = $self->{Names};
+    return 0 unless defined $name && $names && @$names;
+
+    return ($names->[-1] eq $name);
+}
+
+# Number of currently open elements named $name.
+sub within_element {
+    my ($self, $name) = @_;
+
+    return scalar grep { $_ eq $name } @{ $self->{Names} || [] };
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+XML::Handler::Subs - a PerlSAX handler base class for calling user-defined subs
+
+=head1 SYNOPSIS
+
+ use XML::Handler::Subs;
+
+ package MyHandlers;
+ use vars qw{ @ISA };
+
+ sub s_NAME { my ($self, $element) = @_ };
+ sub e_NAME { my ($self, $element) = @_ };
+
+ $self->{Names};    # an array of names
+ $self->{Nodes};    # an array of $element nodes
+
+ $handler = MyHandlers->new();
+ $self->in_element($name);
+ $self->within_element($name);
+
+=head1 DESCRIPTION
+
+C<XML::Handler::Subs> is a base class for PerlSAX handlers.
+C<XML::Handler::Subs> is subclassed to implement complete behavior and
+to add element-specific handling.
+
+Each time an element starts, a method by that name prefixed with `s_'
+is called with the element to be processed.  Each time an element
+ends, a method with that name prefixed with `e_' is called.  Any
+special characters in the element name are replaced by underscores.
+
+Subclassing XML::Handler::Subs in this way is similar to
+XML::Parser's Subs style.
+
+XML::Handler::Subs maintains a stack of element names,
+`C<< $self->{Names} >>', and a stack of element nodes, `C<< $self->{Nodes} >>'
+that can be used by subclasses.
The current element is pushed on the
+stacks before calling an element-name start method and popped off the
+stacks after calling the element-name end method.  The
+`C<in_element()>' and `C<within_element()>' calls use these stacks.
+
+If the subclass implements `C<start_document()>', `C<end_document()>',
+`C<start_element()>', and `C<end_element()>', be sure to use
+`C<SUPER::>' to call the superclass methods also.  See perlobj(1)
+for details on SUPER::.  `C<start_element()>' and
+`C<end_element()>' return 1 if an element-name method is
+called; they return 0 if no method was called.
+
+XML::Handler::Subs does not implement any other PerlSAX handlers.
+
+XML::Handler::Subs supports the following methods:
+
+=over 4
+
+=item new( I<OPTIONS> )
+
+A basic `C<new()>' method.  `C<new()>' takes a list of key, value
+pairs or a hash and creates and returns a hash with those options; the
+hash is blessed into the subclass.
+
+=item in_element($name)
+
+Returns true if `C<$name>' is equal to the name of the innermost
+currently opened element.
+
+=item within_element($name)
+
+Returns the number of times `C<$name>' appears in Names.
+
+=back
+
+=head1 AUTHOR
+
+Ken MacLeod, ken@bitsko.slc.ut.us
+
+=head1 SEE ALSO
+
+perl(1), PerlSAX.pod(3)
+
+=cut
diff --git a/lib/XML/Handler/XMLWriter.pm b/lib/XML/Handler/XMLWriter.pm
new file mode 100644
index 0000000..6b03ea3
--- /dev/null
+++ b/lib/XML/Handler/XMLWriter.pm
@@ -0,0 +1,313 @@
+#
+# Copyright (C) 1999 Ken MacLeod
+# Portions derived from code in XML::Writer by David Megginson
+# XML::Handler::XMLWriter is free software; you can redistribute it and/or
+# modify it under the same terms as Perl itself.
+#
+# $Id: XMLWriter.pm,v 1.2 1999/12/22 21:15:00 kmacleod Exp $
+#
+
+use strict;
+
+package XML::Handler::XMLWriter;
+use XML::Handler::Subs;
+
+use vars qw{ $VERSION @ISA $escapes };
+
+# will be substituted by make-rel script
+$VERSION = "0.08";
+
+@ISA = qw{ XML::Handler::Subs };
+
+# Markup characters and the entity references written in their place.
+# (The table had degraded to mapping each character to itself, which
+# wrote markup characters in content and attribute values unescaped.)
+$escapes = { '&' => '&amp;',
+             '<' => '&lt;',
+             '>' => '&gt;',
+             '"' => '&quot;'
+           };
+
+# Choose where output goes for this document and write the XML
+# declaration.
+#
+#   Output defined  - write to that handle
+#   AsString true   - collect the text and return it from end_document()
+#   neither         - write to standard output
+sub start_document {
+    my ($self, $document) = @_;
+
+    $self->SUPER::start_document($document);
+
+    # Output_ is the per-document handle; it is removed again in
+    # end_document() so a temporary standard-output handle does not
+    # outlive the document.
+    if (defined($self->{Output})) {
+        $self->{Output_} = $self->{Output};
+    } elsif (!$self->{AsString}) {
+        require IO::File;
+        $self->{Output_} = IO::File->new(">-")
+            or die "XML::Handler::XMLWriter: can't open standard output: $!\n";
+    }
+
+    if ($self->{AsString}) {
+        $self->{Strings} = [];
+    }
+
+    $self->print("<?xml version=\"1.0\"?>\n");
+
+    # FIXME support Doctype declarations
+}
+
+# Terminate the output with a newline and release per-document state.
+# Returns the collected document text if AsString is true, undef
+# otherwise.  AsString is tested for truth here and in print(), the
+# same way start_document() tests it.
+sub end_document {
+    my ($self, $document) = @_;
+
+    if (defined($self->{Output_})) {
+        $self->{Output_}->print("\n");
+        delete $self->{Output_};
+    }
+
+    my $string = undef;
+    if ($self->{AsString}) {
+        push @{$self->{Strings}}, "\n";
+        $string = join('', @{$self->{Strings}});
+        delete $self->{Strings};
+    }
+
+    $self->SUPER::end_document($document);
+
+    return($string);
+}
+
+# Write the start tag unless the subclass handled the element itself
+# through an `s_NAME' method (see XML::Handler::Subs).
+sub start_element {
+    my ($self, $element) = @_;
+
+    if ($self->SUPER::start_element($element) == 0) {
+        $self->print_start_element($element);
+    }
+}
+
+# Write a start tag for $element.  Attributes are written sorted by
+# name with `&', `<', `>' and `"' escaped in their values.  With the
+# Newlines option a newline is placed just before the closing `>'.
+sub print_start_element {
+    my ($self, $element) = @_;
+
+    my $output = "<$element->{Name}";
+    if (defined($element->{Attributes})) {
+        foreach my $name (sort keys %{$element->{Attributes}}) {
+            my $esc_value = $element->{Attributes}{$name};
+            $esc_value =~ s/([&<>"])/$escapes->{$1}/g;
+            $output .= " $name=\"$esc_value\"";
+        }
+    }
+
+    if ($self->{Newlines}) {
+        $output .= "\n";
+    }
+
+    $output .= ">";
+
+    $self->print($output);
+}
+
+# Write the end tag unless the subclass handled the element itself
+# through an `e_NAME' method.
+sub end_element {
+    my ($self, $element) = @_;
+
+    if ($self->SUPER::end_element($element) == 0) {
+        $self->print_end_element($element);
+    }
+}
+
+# Write an end tag for $element, with a newline before the closing `>'
+# when the Newlines option is set.
+sub print_end_element {
+    my ($self, $element) = @_;
+
+    my $output = "</$element->{Name}"
+        . ($self->{Newlines} ? "\n" : "") . ">";
+
+    $self->print($output);
+}
+
+# Write character data with `&', `<' and `>' escaped.
+sub characters {
+    my ($self, $characters) = @_;
+
+    my $output = $characters->{Data};
+
+    $output =~ s/([&<>])/$escapes->{$1}/g;
+
+    $self->print($output);
+}
+
+# Write a processing instruction.  Outside of any element (empty name
+# stack) the XML form is followed by a newline.  With IsSGML only the
+# data is written, in SGML `<?data>' form.
+sub processing_instruction {
+    my ($self, $pi) = @_;
+
+    my $nl = ($#{$self->{Names}} == -1) ? "\n" : "";
+
+    my $output;
+    if ($self->{IsSGML}) {
+        $output = "<?$pi->{Data}>\n";
+    } else {
+        if ($pi->{Data}) {
+            $output = "<?$pi->{Target} $pi->{Data}?>$nl";
+        } else {
+            $output = "<?$pi->{Target}?>$nl";
+        }
+    }
+
+    $self->print($output);
+}
+
+# Ignorable whitespace is written through unchanged.
+sub ignorable_whitespace {
+    my ($self, $whitespace) = @_;
+
+    $self->print($whitespace->{Data});
+}
+
+# Write a comment, followed by a newline when outside of any element.
+sub comment {
+    my ($self, $comment) = @_;
+
+    my $nl = ($#{$self->{Names}} == -1) ? "\n" : "";
+
+    my $output = "<!-- $comment->{Data} -->$nl";
+
+    $self->print($output);
+}
+
+# Send $output to the current output handle and/or append it to the
+# string being collected.
+sub print {
+    my ($self, $output) = @_;
+
+    $self->{Output_}->print($output)
+        if (defined($self->{Output_}));
+
+    push(@{$self->{Strings}}, $output)
+        if ($self->{AsString});
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+XML::Handler::XMLWriter - a PerlSAX handler for writing readable XML
+
+=head1 SYNOPSIS
+
+ use XML::Parser::PerlSAX;
+ use XML::Handler::XMLWriter;
+
+ $my_handler = XML::Handler::XMLWriter->new( I<OPTIONS> );
+
+ XML::Parser::PerlSAX->new->parse(Source => { SystemId => 'REC-xml-19980210.xml' },
+                                  Handler => $my_handler);
+
+=head1 DESCRIPTION
+
+C<XML::Handler::XMLWriter> is a PerlSAX handler for writing readable
+XML (in contrast to Canonical XML, for example).
+XML::Handler::XMLWriter can be used with a parser to reformat XML,
+with XML::DOM or XML::Grove to write out XML, or with other PerlSAX
+modules that generate events.
+
+C<XML::Handler::XMLWriter> is intended to be used with PerlSAX event
+generators and does not perform any checking itself (for example,
+matching start and end element events).
If you want to generate XML +directly from your Perl code, use the XML::Writer module. XML::Writer +has an easy to use interface and performs many checks to make sure +that the XML you generate is well-formed. + +C is a subclass of C. +C can be further subclassed to alter it's +behavior or to add element-specific handling. In the subclass, each +time an element starts, a method by that name prefixed with `s_' is +called with the element to be processed. Each time an element ends, a +method with that name prefixed with `e_' is called. Any special +characters in the element name are replaced by underscores. If there +isn't a start or end method for an element, the default action is to +write the start or end tag. Start and end methods can use the +`C' and `C' methods to +print start or end tags. Subclasses can call the `C' method +to write additional output. + +Subclassing XML::Handler::XMLWriter in this way is similar to +XML::Parser's Stream style. + +XML::Handler::Subs maintains a stack of element names, +`C<$self->{Names}', and a stack of element nodes, `C<$self->{Nodes}>' +that can be used by subclasses. The current element is pushed on the +stacks before calling an element-name start method and popped off the +stacks after calling the element-name end method. + +See XML::Handler::Subs for additional methods. + +In addition to the standard PerlSAX handler methods (see PerlSAX for +descriptions), XML::Handler::XMLWriter supports the following methods: + +=over 4 + +=item new( I ) + +Creates and returns a new instance of XML::Handler::XMLWriter with the +given I. Options may be changed at any time by modifying +them directly in the hash returned. I can be a list of key, +value pairs or a hash. The following I are supported: + +=over 4 + +=item Output + +An IO::Handle or one of it's subclasses (such as IO::File), if this +parameter is not present and the AsString option is not used, the +module will write to standard output. 
+
+=item AsString
+
+Return the generated XML as a string from the `C<parse()>' method of
+the PerlSAX event generator.
+
+=item Newlines
+
+A true or false value; if this parameter is present and its value is
+true, then the module will insert an extra newline before the closing
+delimiter of start, end, and empty tags to guarantee that the document
+does not end up as a single, long line.  If the parameter is not
+present, the module will not insert the newlines.
+
+=item IsSGML
+
+A true or false value; if this parameter is present and its value is
+true, then the module will generate SGML rather than XML.
+
+=back
+
+=item print_start_element($element)
+
+Print a start tag for `C<$element>'.  This is the default action for
+the PerlSAX `C<start_element()>' handler, but subclasses may use this
+if they define a start method for an element.
+
+=item print_end_element($element)
+
+Prints an end tag for `C<$element>'.  This is the default action for
+the PerlSAX `C<end_element()>' handler, but subclasses may use this
+if they define an end method for an element.
+
+=item print($output)
+
+Write `C<$output>' to Output and/or append it to the string to be
+returned.  Subclasses may use this to write additional output.
+
+=back
+
+=head1 TODO
+
+=over 4
+
+=item *
+
+An Elements option that provides finer control over newlines than the
+Newlines option, where you can choose before and after newline for
+element start and end tags.  Inspired by the Python XMLWriter.
+
+=item *
+
+Support Doctype and XML declarations.
+
+=back
+
+=head1 AUTHOR
+
+Ken MacLeod, ken@bitsko.slc.ut.us
+This module is partially derived from XML::Writer by David Megginson.
+
+=head1 SEE ALSO
+
+perl(1), PerlSAX.pod(3)
+
+=cut
diff --git a/lib/XML/Parser/PerlSAX.pm b/lib/XML/Parser/PerlSAX.pm
new file mode 100644
index 0000000..131a84d
--- /dev/null
+++ b/lib/XML/Parser/PerlSAX.pm
@@ -0,0 +1,796 @@
+#
+# Copyright (C) 1999 Ken MacLeod
+# XML::Parser::PerlSAX is free software; you can redistribute it and/or
+# modify it under the same terms as Perl itself.
+#
+# $Id: PerlSAX.pm,v 1.7 1999/12/22 21:15:00 kmacleod Exp $
+#
+
+use strict;
+
+package XML::Parser::PerlSAX;
+
+use XML::Parser;
+use UNIVERSAL;
+use vars qw{ $VERSION $name_re };
+
+# will be substituted by make-rel script
+$VERSION = "0.08";
+
+# FIXME I doubt this is a correct Perl RE for productions [4] and
+# [5] in the XML 1.0 specification, especially considering Unicode chars
+$name_re = '[A-Za-z_:][A-Za-z0-9._:-]*';
+
+# new( \%options ) or new( key => value, ... )
+#
+# A single argument is blessed as-is; any other argument list is
+# collected into a new hash.  The options become the defaults for
+# every parse().
+sub new {
+    my $type = shift;
+    my $self = (@_ == 1) ? shift : { @_ };
+
+    return bless $self, $type;
+}
+
+# parse( $string ), parse( \%options ) or parse( key => value, ... )
+#
+# Parses one document and returns whatever the `end_document' handler
+# returned.  Options given here override the defaults stored in the
+# parser object for the duration of this parse.  Dies if the instance
+# is already parsing, if no source is given, or if the parse fails.
+# In every case the per-parse state is removed again, so the instance
+# can be reused after an error.
+sub parse {
+    my $self = shift;
+
+    die "XML::Parser::PerlSAX: parser instance ($self) already parsing\n"
+        if (defined $self->{ParseOptions});
+
+    # If there's one arg and it has no ref, it's a string
+    my $args;
+    if (scalar (@_) == 1 && !ref($_[0])) {
+        $args = { Source => { String => shift } };
+    } else {
+        $args = (scalar (@_) == 1) ? shift : { @_ };
+    }
+
+    my $parse_options = { %$self, %$args };
+
+    # ensure that we have at least one source; this is checked before
+    # the instance is marked busy so a bad call does not lock it
+    my $source = $parse_options->{Source};
+    if (!defined $source
+        || !(defined $source->{String}
+             || defined $source->{ByteStream}
+             || defined $source->{SystemId})) {
+        die "XML::Parser::PerlSAX: no source defined for parse\n";
+    }
+
+    # assign default Handler to any undefined handlers
+    if (defined $parse_options->{Handler}) {
+        foreach my $role (qw{ DocumentHandler DTDHandler EntityResolver }) {
+            $parse_options->{$role} = $parse_options->{Handler}
+                if (!defined $parse_options->{$role});
+        }
+    }
+
+    # The handler slots in $self are overwritten for the callbacks
+    # below; remember defaults given to new() so they can be put back.
+    my @cached_keys = qw{ DocumentHandler DTDHandler EntityResolver };
+    my %saved = map { exists $self->{$_} ? ($_ => $self->{$_}) : () }
+        @cached_keys;
+
+    $self->{ParseOptions} = $parse_options;
+
+    my $result;
+    my $ok = eval {
+        my @handlers = $self->_expat_handlers($parse_options);
+
+        my @xml_parser_options = ( Handlers => { @handlers } );
+        push (@xml_parser_options, NoExpand => 1)
+            if ($self->{UseEntRefs});
+        push (@xml_parser_options,
+              ProtocolEncoding => $source->{Encoding})
+            if (defined $source->{Encoding});
+
+        my $parser = XML::Parser->new(@xml_parser_options);
+
+        # preference order: ByteStream, then String, then SystemId
+        if (defined $source->{ByteStream}) {
+            $result = $parser->parse($source->{ByteStream});
+        } elsif (defined $source->{String}) {
+            $result = $parser->parse($source->{String});
+        } elsif (defined $source->{SystemId}) {
+            $result = $parser->parsefile($source->{SystemId});
+        }
+
+        1;
+    };
+    # copy the error at once, the cleanup below may run code that
+    # resets $@
+    my $error = $@;
+
+    # clean up parser instance
+    delete @{$self}{qw{ ParseOptions Expat UseEntRefs EntRefs }};
+    foreach my $key (@cached_keys) {
+        delete $self->{$key};
+        $self->{$key} = $saved{$key}
+            if (exists $saved{$key});
+    }
+
+    die ($error || "XML::Parser::PerlSAX: parse failed\n")
+        if (!$ok);
+
+    return $result;
+}
+
+# Build the XML::Parser `Handlers' list for one parse.  An Expat event
+# is only requested when the responsible PerlSAX handler object
+# implements the matching method.  The handler objects are cached in
+# $self for use by the callbacks.
+sub _expat_handlers {
+    my ($self, $parse_options) = @_;
+
+    # Init is always installed so that `location()' has an Expat
+    # object to ask, whatever the document handler implements.
+    my @handlers = ( Init => sub { $self->_handle_init(@_) } );
+
+    if (defined $parse_options->{DocumentHandler}) {
+        # cache DocumentHandler in self for callbacks
+        my $doc_h = $self->{DocumentHandler}
+            = $parse_options->{DocumentHandler};
+
+        my @doc_events = (
+            [ Final      => 'end_document',           '_handle_final'      ],
+            [ Start      => 'start_element',          '_handle_start'      ],
+            [ End        => 'end_element',            '_handle_end'        ],
+            [ Char       => 'characters',             '_handle_char'       ],
+            [ Proc       => 'processing_instruction', '_handle_proc'       ],
+            [ Comment    => 'comment',                '_handle_comment'    ],
+            [ CdataStart => 'start_cdata',            '_handle_cdatastart' ],
+            [ CdataEnd   => 'end_cdata',              '_handle_cdataend'   ],
+        );
+        foreach my $event (@doc_events) {
+            my ($expat_event, $sax_method, $callback) = @$event;
+            push (@handlers, $expat_event => sub { $self->$callback(@_) } )
+                if (UNIVERSAL::can($doc_h, $sax_method));
+        }
+
+        if (UNIVERSAL::can($doc_h, 'entity_reference')) {
+            push (@handlers, Default => sub { $self->_handle_default(@_) } );
+            $self->{UseEntRefs} = 1;
+        }
+    }
+
+    if (defined $parse_options->{DTDHandler}) {
+        # cache DTDHandler in self for callbacks
+        my $dtd_h = $self->{DTDHandler} = $parse_options->{DTDHandler};
+
+        my @dtd_events = (
+            [ Notation => 'notation_decl',        '_handle_notation' ],
+            [ Unparsed => 'unparsed_entity_decl', '_handle_unparsed' ],
+            [ Element  => 'element_decl',         '_handle_element'  ],
+            [ Attlist  => 'attlist_decl',         '_handle_attlist'  ],
+            [ Doctype  => 'doctype_decl',         '_handle_doctype'  ],
+            [ XMLDecl  => 'xml_decl',             '_handle_xmldecl'  ],
+        );
+        foreach my $event (@dtd_events) {
+            my ($expat_event, $sax_method, $callback) = @$event;
+            push (@handlers, $expat_event => sub { $self->$callback(@_) } )
+                if (UNIVERSAL::can($dtd_h, $sax_method));
+        }
+
+        # entity declarations are also recorded for `entity_reference'
+        push (@handlers, Entity => sub { $self->_handle_entity(@_) } )
+            if ($self->{UseEntRefs}
+                || UNIVERSAL::can($dtd_h, 'entity_decl'));
+    }
+
+    if (defined $parse_options->{EntityResolver}) {
+        # cache EntityResolver in self for callbacks
+        my $er = $self->{EntityResolver} = $parse_options->{EntityResolver};
+
+        push (@handlers, ExternEnt => sub { $self->_handle_extern_ent(@_) } )
+            if (UNIVERSAL::can($er, 'resolve_entity'));
+    }
+
+    return @handlers;
+}
+
+# Returns the current parse position as a hash reference.  Only
+# available while a parse is running.  PublicId and SystemId are those
+# of the Source of the current parse, whether it was given to new() or
+# to parse().
+sub location {
+    my $self = shift;
+
+    my $expat = $self->{Expat};
+    die "XML::Parser::PerlSAX: location() is only available while parsing\n"
+        if (!defined $expat);
+
+    my @properties = ( ColumnNumber => $expat->current_column,
+                       LineNumber => $expat->current_line,
+                       BytePosition => $expat->current_byte,
+                       Base => $expat->base );
+
+    # FIXME these locations change while parsing external entities
+    my $source = $self->{ParseOptions}{Source} || { };
+    push (@properties, PublicId => $source->{PublicId})
+        if (defined $source->{PublicId});
+    push (@properties, SystemId => $source->{SystemId})
+        if (defined $source->{SystemId});
+
+    return { @properties };
+}
+
+###
+### DocumentHandler methods
+###
+
+# Keep the Expat object for `location()', then hand the document
+# handler a locator and the `start_document' event where implemented.
+sub _handle_init {
+    my ($self, $expat) = @_;
+
+    $self->{Expat} = $expat;
+
+    my $doc_h = $self->{DocumentHandler};
+    return if (!defined $doc_h);
+
+    if ($doc_h->can('set_document_locator')) {
+        $doc_h->set_document_locator( { Locator => $self } );
+    }
+    if (UNIVERSAL::can($doc_h, 'start_document')) {
+        $doc_h->start_document( { } );
+    }
+}
+
+# The value returned by `end_document' becomes the result of parse().
+sub _handle_final {
+    my $self = shift;
+
+    delete $self->{UseEntRefs};
+    delete $self->{EntRefs};
+    return $self->{DocumentHandler}->end_document( { } );
+}
+
+# The arguments after the element name are attribute name, value pairs.
+sub _handle_start {
+    my ($self, $expat, $element, @attributes) = @_;
+
+    my @properties;
+    if ($self->{ParseOptions}{UseAttributeOrder}) {
+        # Capture order and defined() status for attributes
+        my $order = [];
+        for (my $ii = 0; $ii < $#attributes; $ii += 2) {
+            push @$order, $attributes[$ii];
+        }
+
+        push @properties, 'AttributeOrder', $order;
+
+        # Divide by two because XML::Parser counts both attribute name
+        # and value within its index
+        push @properties, 'Defaulted', ($expat->specified_attr() / 2);
+    }
+
+    $self->{DocumentHandler}->start_element( { Name => $element,
+                                               Attributes => { @attributes },
+                                               @properties } );
+}
+
+sub _handle_end {
+    my ($self, $expat, $element) = @_;
+
+    $self->{DocumentHandler}->end_element( { Name => $element } );
+}
+
+sub _handle_char {
+    my ($self, $expat, $string) = @_;
+
+    $self->{DocumentHandler}->characters( { Data => $string } );
+}
+
+sub _handle_proc {
+    my ($self, $expat, $target, $data) = @_;
+
+    $self->{DocumentHandler}->processing_instruction( { Target => $target,
+                                                        Data => $data } );
+}
+
+sub _handle_comment {
+    my ($self, $expat, $data) = @_;
+
+    $self->{DocumentHandler}->comment( { Data => $data } );
+}
+
+sub _handle_cdatastart {
+    my ($self, $expat) = @_;
+
+    $self->{DocumentHandler}->start_cdata( { } );
+}
+
+sub _handle_cdataend {
+    my ($self, $expat) = @_;
+
+    $self->{DocumentHandler}->end_cdata( { } );
+}
+
+# Default receives all characters that aren't handled by some other
+# handler, this means a lot of stuff goes through here.  All we're
+# looking for are `&NAME;' entity reference sequences
+sub _handle_default {
+    my ($self, $expat, $string) = @_;
+
+    if ($string =~ /^&($name_re);$/) {
+        # pass on the recorded declaration if there is one, otherwise
+        # just the name
+        my $ent_ref = $self->{EntRefs}{$1};
+        if (!defined $ent_ref) {
+            $ent_ref = { Name => $1 };
+        }
+        $self->{DocumentHandler}->entity_reference($ent_ref);
+    }
+}
+
+###
+### DTDHandler methods
+###
+
+# In the DTD callbacks optional values are only included in the event
+# hash when they are defined.
+
+sub _handle_notation {
+    my ($self, $expat, $notation, $base, $sysid, $pubid) = @_;
+    my @properties = (Name => $notation);
+
+    push (@properties, Base => $base)
+        if (defined $base);
+    push (@properties, SystemId => $sysid)
+        if (defined $sysid);
+    push (@properties, PublicId => $pubid)
+        if (defined $pubid);
+
+    $self->{DTDHandler}->notation_decl( { @properties } );
+}
+
+sub _handle_unparsed {
+    my ($self, $expat, $entity, $base, $sysid, $pubid) = @_;
+    my @properties = (Name => $entity, SystemId => $sysid);
+
+    push (@properties, Base => $base)
+        if (defined $base);
+    push (@properties, PublicId => $pubid)
+        if (defined $pubid);
+
+    $self->{DTDHandler}->unparsed_entity_decl( { @properties } );
+}
+
+# Entity declarations are recorded for `entity_reference' events when
+# those are in use, and passed to the DTD handler if it wants them.
+sub _handle_entity {
+    my ($self, $expat, $name, $val, $sysid, $pubid, $ndata) = @_;
+    my @properties = (Name => $name);
+
+    push (@properties, Value => $val)
+        if (defined $val);
+    push (@properties, PublicId => $pubid)
+        if (defined $pubid);
+    push (@properties, SystemId => $sysid)
+        if (defined $sysid);
+    push (@properties, Notation => $ndata)
+        if (defined $ndata);
+
+    my $properties = { @properties };
+    if ($self->{UseEntRefs}) {
+        $self->{EntRefs}{$name} = $properties;
+    }
+    if ($self->{DTDHandler}->can('entity_decl')) {
+        $self->{DTDHandler}->entity_decl( $properties );
+    }
+}
+
+sub _handle_element {
+    my ($self, $expat, $name, $model) = @_;
+
+    $self->{DTDHandler}->element_decl( { Name => $name,
+                                         Model => $model } );
+}
+
+sub _handle_attlist {
+    my ($self, $expat, $elname, $attname, $type, $default, $fixed) = @_;
+
+    $self->{DTDHandler}->attlist_decl( { ElementName => $elname,
+                                         AttributeName => $attname,
+                                         Type => $type,
+                                         Default => $default,
+                                         Fixed => $fixed } );
+}
+
+sub _handle_doctype {
+    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;
+    my @properties = (Name => $name);
+
+    push (@properties, SystemId => $sysid)
+        if (defined $sysid);
+    push (@properties, PublicId => $pubid)
+        if (defined $pubid);
+    push (@properties, Internal => $internal)
+        if (defined $internal);
+
+    $self->{DTDHandler}->doctype_decl( { @properties } );
+}
+
+sub _handle_xmldecl {
+    my ($self, $expat, $version, $encoding, $standalone) = @_;
+    my @properties = (Version => $version);
+
+    push (@properties, Encoding => $encoding)
+        if (defined $encoding);
+    push (@properties, Standalone => $standalone)
+        if (defined $standalone);
+
+    $self->{DTDHandler}->xml_decl( { @properties } );
+}
+
+###
+### EntityResolver methods
+###
+
+# Ask the resolver for the external entity.  A hash result supplies
+# the replacement source as ByteStream or String; any other result is
+# passed on to XML::Parser as undef.
+sub _handle_extern_ent {
+    my ($self, $expat, $base, $sysid, $pubid) = @_;
+    my @properties = (SystemId => $sysid);
+
+    push (@properties, Base => $base)
+        if (defined $base);
+    push (@properties, PublicId => $pubid)
+        if (defined $pubid);
+
+    my $result = $self->{EntityResolver}->resolve_entity( { @properties } );
+
+    if (UNIVERSAL::isa($result, 'HASH')) {
+        if ($result->{ByteStream}) {
+            return $result->{ByteStream};
+        } elsif ($result->{String}) {
+            return $result->{String};
+        } elsif ($result->{SystemId}) {
+            # FIXME must be able to resolve SystemIds, XML::Parser's
+            # default can :-(
+            die "PerlSAX: automatic opening of SystemIds from \`resolve_entity' not implemented, contact the author\n";
+        } else {
+            # FIXME
+            die "PerlSAX: invalid source returned from \`resolve_entity'\n";
+        }
+    }
+
+    # an explicit undef: the caller expects exactly one value
+    return undef;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+XML::Parser::PerlSAX - Perl SAX parser using XML::Parser
+
+=head1 SYNOPSIS
+
+ use XML::Parser::PerlSAX;
+
+ $parser = XML::Parser::PerlSAX->new( [OPTIONS] );
+ $result = $parser->parse( [OPTIONS] );
+
+ $result = $parser->parse($string);
+
+=head1 DESCRIPTION
+
+C<XML::Parser::PerlSAX> is a PerlSAX parser using the XML::Parser
+module.  This man page summarizes the specific options, handlers, and
+properties supported by C<XML::Parser::PerlSAX>; please refer to the
+PerlSAX standard in `C<PerlSAX.pod>' for general usage information.
+
+=head1 METHODS
+
+=over 4
+
+=item new
+
+Creates a new parser object.  Default options for parsing, described
+below, are passed as key-value pairs or as a single hash.  Options may
+be changed directly in the parser object unless stated otherwise.
+Options passed to `C<parse()>' override the default options in the
+parser object for the duration of the parse.
+
+=item parse
+
+Parses a document.  Options, described below, are passed as key-value
+pairs or as a single hash.  Options passed to `C<parse()>' override
+default options in the parser object.
+
+=item location
+
+Returns the location as a hash:
+
+  ColumnNumber    The column number of the parse.
+  LineNumber      The line number of the parse.
+  BytePosition    The current byte position of the parse.
+  PublicId        A string containing the public identifier, or undef
+                  if none is available.
+  SystemId        A string containing the system identifier, or undef
+                  if none is available.
+  Base            The current value of the base for resolving relative
+                  URIs.
+
+ALPHA WARNING: The `C<SystemId>' and `C<PublicId>' properties returned
+are the system and public identifiers of the document passed to
+`C<parse()>', not the identifiers of the currently parsing external
+entity.  The column, line, and byte positions I<are> of the current
+entity being parsed.
+ +=head1 OPTIONS + +The following options are supported by C: + + Handler default handler to receive events + DocumentHandler handler to receive document events + DTDHandler handler to receive DTD events + ErrorHandler handler to receive error events + EntityResolver handler to resolve entities + Locale locale to provide localisation for errors + Source hash containing the input source for parsing + UseAttributeOrder set to true to provide AttributeOrder and Defaulted + properties in `start_element()' + +If no handlers are provided then all events will be silently ignored, +except for `C' which will cause a `C' to be +called after calling `C'. + +If a single string argument is passed to the `C' method, it +is treated as if a `C' option was given with a `C' +parameter. + +The `C' hash may contain the following parameters: + + ByteStream The raw byte stream (file handle) containing the + document. + String A string containing the document. + SystemId The system identifier (URI) of the document. + PublicId The public identifier. + Encoding A string describing the character encoding. + +If more than one of `C', `C', or `C', +then preference is given first to `C', then `C', +then `C'. + +=head1 HANDLERS + +The following handlers and properties are supported by +C: + +=head2 DocumentHandler methods + +=over 4 + +=item start_document + +Receive notification of the beginning of a document. + +No properties defined. + +=item end_document + +Receive notification of the end of a document. + +No properties defined. + +=item start_element + +Receive notification of the beginning of an element. + + Name The element type name. + Attributes A hash containing the attributes attached to the + element, if any. + +The `C' hash contains only string values. + +If the `C' parser option is true, the following +properties are also passed to `C': + + AttributeOrder An array of attribute names in the order they were + specified, followed by the defaulted attribute + names. 
+ Defaulted The index number of the first defaulted attribute in + `AttributeOrder. If this index is equal to the + length of `AttributeOrder', there were no defaulted + values. + +Note to C users: `C' will be half the value of +C's `C' function because only +attribute names are provided, not their values. + + +=item end_element + +Receive notification of the end of an element. + + Name The element type name. + +=item characters + +Receive notification of character data. + + Data The characters from the XML document. + +=item processing_instruction + +Receive notification of a processing instruction. + + Target The processing instruction target. + Data The processing instruction data, if any. + +=item comment + +Receive notification of a comment. + + Data The comment data, if any. + +=item start_cdata + +Receive notification of the start of a CDATA section. + +No properties defined. + +=item end_cdata + +Receive notification of the end of a CDATA section. + +No properties defined. + +=item entity_reference + +Receive notification of an internal entity reference. If this handler +is defined, internal entities will not be expanded and not passed to +the `C' handler. If this handler is not defined, +internal entities will be expanded if possible and passed to the +`C' handler. + + Name The entity reference name + Value The entity reference value + +=back + +=head2 DTDHandler methods + +=over 4 + +=item notation_decl + +Receive notification of a notation declaration event. + + Name The notation name. + PublicId The notation's public identifier, if any. + SystemId The notation's system identifier, if any. + Base The base for resolving a relative URI, if any. + +=item unparsed_entity_decl + +Receive notification of an unparsed entity declaration event. + + Name The unparsed entity's name. + SystemId The entity's system identifier. + PublicId The entity's public identifier, if any. + Base The base for resolving a relative URI, if any. 
+ +=item entity_decl + +Receive notification of an entity declaration event. + + Name The entity name. + Value The entity value, if any. + PublicId The notation's public identifier, if any. + SystemId The notation's system identifier, if any. + Notation The notation declared for this entity, if any. + +For internal entities, the `C' parameter will contain the value +and the `C', `C', and `C' will be +undefined. For external entities, the `C' parameter will be +undefined, the `C' parameter will have the system id, the +`C' parameter will have the public id if it was provided (it +will be undefined otherwise), the `C' parameter will contain +the notation name for unparsed entities. If this is a parameter entity +declaration, then a '%' will be prefixed to the entity name. + +Note that `C' and `C' overlap. +If both methods are implemented by a handler, then this handler will +not be called for unparsed entities. + +=item element_decl + +Receive notification of an element declaration event. + + Name The element type name. + Model The content model as a string. + +=item attlist_decl + +Receive notification of an attribute list declaration event. + +This handler is called for each attribute in an ATTLIST declaration +found in the internal subset. So an ATTLIST declaration that has +multiple attributes will generate multiple calls to this handler. + + ElementName The element type name. + AttributeName The attribute name. + Type The attribute type. + Fixed True if this is a fixed attribute. + +The default for `C' is the default value, which will either be +"#REQUIRED", "#IMPLIED" or a quoted string (i.e. the returned string +will begin and end with a quote character). + +=item doctype_decl + +Receive notification of a DOCTYPE declaration event. + + Name The document type name. + SystemId The document's system identifier. + PublicId The document's public identifier, if any. + Internal The internal subset as a string, if any. 
+ +Internal will contain all whitespace, comments, processing +instructions, and declarations seen in the internal subset. The +declarations will be there whether or not they have been processed by +another handler (except for unparsed entities processed by the +Unparsed handler). However, comments and processing instructions will +not appear if they've been processed by their respective handlers. + +=item xml_decl + +Receive notification of an XML declaration event. + + Version The version. + Encoding The encoding string, if any. + Standalone True, false, or undefined if not declared. + +=back + +=head2 EntityResolver + +=over 4 + +=item resolve_entity + +Allow the handler to resolve external entities. + + Name The notation name. + SystemId The notation's system identifier. + PublicId The notation's public identifier, if any. + Base The base for resolving a relative URI, if any. + +`C' should return undef to request that the parser +open a regular URI connection to the system identifier or a hash +describing the new input source. This hash has the same properties as +the `C' parameter to `C': + + PublicId The public identifier of the external entity being + referenced, or undef if none was supplied. + SystemId The system identifier of the external entity being + referenced. + String String containing XML text + ByteStream An open file handle. + CharacterStream + An open file handle. + Encoding The character encoding, if known. + +=back + +=head1 AUTHOR + +Ken MacLeod, ken@bitsko.slc.ut.us + +=head1 SEE ALSO + +perl(1), PerlSAX.pod(3) + + Extensible Markup Language (XML) + SAX 1.0: The Simple API for XML + +=cut diff --git a/lib/XML/PatAct/ActionTempl.pm b/lib/XML/PatAct/ActionTempl.pm new file mode 100644 index 0000000..91e1380 --- /dev/null +++ b/lib/XML/PatAct/ActionTempl.pm @@ -0,0 +1,146 @@ +# This template file is in the Public Domain. +# You may do anything you want with this file. 
+#
+# $Id: ActionTempl.pm,v 1.2 1999/08/16 16:04:03 kmacleod Exp $
+#
+
+# replace all occurrences of ACTION with the name of your module!
+
+use strict;
+
+use UNIVERSAL;
+
+package XML::PatAct::ACTION;
+
+# Create a new action handler.  Options are given either as a list of
+# key, value pairs or as a single hash reference (which is copied).
+# Dies with a usage message unless both Matcher and Patterns are
+# defined.
+sub new {
+    my $type = shift;
+    my $self = (@_ == 1) ? { %{ $_[0] } } : { @_ };
+
+    bless $self, $type;
+
+    my $usage = <<'EOF';
+usage: XML::PatAct::ACTION->new( Matcher => $matcher,
+                                 Patterns => $patterns );
+EOF
+
+    die "No Matcher specified\n$usage\n"
+        if !defined $self->{Matcher};
+    die "No Patterns specified\n$usage\n"
+        if !defined $self->{Patterns};
+
+    # perform additional initialization here
+
+    return $self;
+}
+
+# PerlSAX start_document(): set up the per-document state.
+sub start_document {
+    my ($self, $document) = @_;
+
+    # initialize the pattern module at the start of a document
+    $self->{Matcher}->initialize($self);
+
+    # create empty name and node lists for passing to `match()'
+    $self->{Names} = [ ];
+    $self->{Nodes} = [ ];
+
+    # Knowing that a source is a tree can be useful information
+    $self->{SourceIsGrove} = UNIVERSAL::isa($document, 'Data::Grove');
+}
+
+# PerlSAX end_document(): tear down the per-document state and return
+# the result of the parse (undef in this template).
+sub end_document {
+    my ($self, $document) = @_;
+
+    # notify the pattern module that we're done
+    $self->{Matcher}->finalize();
+
+    my $value;
+    # perform any finalization actions, use $value to return a result
+    # from calling `parse()'
+
+    # Release the info that is just used during event handling.  The
+    # Matcher is a constructor option rather than per-document state,
+    # so it is kept: clearing it here made the next start_document()
+    # die when calling initialize() on an undefined value.
+    $self->{Names} = $self->{Nodes} = undef;
+    $self->{SourceIsGrove} = undef;
+
+    return $value;
+}
+
+# PerlSAX start_element(): record the element on the name/node stacks
+# and ask the Matcher which pattern (if any) applies to it.
+sub start_element {
+    my ($self, $element) = @_;
+
+    push @{$self->{Names}}, $element->{Name};
+    push @{$self->{Nodes}}, $element;
+
+    my $index = $self->{Matcher}->match($element,
+                                        $self->{Names},
+                                        $self->{Nodes});
+
+    # use $index to retrieve an action for this element
+}
+
+# PerlSAX end_element(): unwind the stacks pushed by start_element().
+sub end_element {
+    my ($self, $end_element) = @_;
+
+    my $name = pop @{$self->{Names}};
+    my $element = pop @{$self->{Nodes}};
+
+    # perform any finishing steps at the end of an element
+}
+
+# PerlSAX characters(): template stub, does nothing.
+sub characters {
+    my ($self, $characters) = @_;
+
+}
+
+# PerlSAX processing_instruction(): template stub, does nothing.
+sub processing_instruction {
+    my ($self, $pi) = @_;
+
+}
+
+# PerlSAX ignorable_whitespace(): template stub, does nothing.
+sub ignorable_whitespace {
+    my ($self, $characters) = @_;
+
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+XML::PatAct::ACTION - An action module for
+
+=head1 SYNOPSIS
+
+ use XML::PatAct::ACTION;
+
+ my $patterns = [ PATTERN => ACTION,
+                  ... ];
+
+ my $handler = XML::PatAct::ACTION->new(Patterns => $patterns,
+                                        Matcher => $matcher );
+
+
+=head1 DESCRIPTION
+
+XML::PatAct::ACTION is a PerlSAX handler for applying pattern-action
+lists to XML parses or trees.  XML::PatAct::ACTION ...
+
+New XML::PatAct::ACTION instances are created by calling `new()'.
+Parameters can be passed as a list of key, value pairs or a hash.
+Patterns and Matcher options are required.  Patterns is the
+pattern-action list to apply.  Matcher is an instance of the pattern
+or query matching module.
+
+DESCRIBE THE FORMAT OF YOUR ACTIONS HERE
+
+=head1 AUTHOR
+
+This template file was written by Ken MacLeod, ken@bitsko.slc.ut.us
+
+=head1 SEE ALSO
+
+perl(1)
+
+``Using PatAct Modules'' and ``Creating PatAct Modules'' in libxml-perl.
+
+=cut
diff --git a/lib/XML/PatAct/Amsterdam.pm b/lib/XML/PatAct/Amsterdam.pm
new file mode 100644
index 0000000..d6f948f
--- /dev/null
+++ b/lib/XML/PatAct/Amsterdam.pm
@@ -0,0 +1,234 @@
+#
+# Copyright (C) 1999 Ken MacLeod
+# XML::PatAct::Amsterdam is free software; you can redistribute it and/or
+# modify it under the same terms as Perl itself.
+#
+# $Id: Amsterdam.pm,v 1.4 1999/12/22 21:15:00 kmacleod Exp $
+#
+
+use strict;
+
+use UNIVERSAL;
+
+package XML::PatAct::Amsterdam;
+
+use vars qw{ $VERSION };
+
+# will be substituted by make-rel script
+$VERSION = "0.08";
+
+# Create a new Amsterdam handler.  Options are given either as a list
+# of key, value pairs or as a single hash reference (which is copied).
+# Dies with a usage message unless both Matcher and Patterns are
+# defined.
+sub new {
+    my $type = shift;
+    my $self = (@_ == 1) ? { %{ $_[0] } } : { @_ };
+
+    bless $self, $type;
+
+    my $usage = <<'EOF';
+usage: XML::PatAct::Amsterdam->new( Matcher => $matcher,
+                                    Patterns => $patterns );
+EOF
+
+    die "No Matcher specified\n$usage\n"
+        if !defined $self->{Matcher};
+    die "No Patterns specified\n$usage\n"
+        if !defined $self->{Patterns};
+
+    # perform additional initialization here
+
+    return $self;
+}
+
+# PerlSAX start_document(): set up the per-document state and decide
+# where output goes (Output handle, standard output, and/or a string).
+sub start_document {
+    my ($self, $document) = @_;
+
+    # initialize the pattern module at the start of a document
+    $self->{Matcher}->initialize($self);
+
+    # create empty name and node lists for passing to `match()'
+    $self->{Names} = [ ];
+    $self->{Nodes} = [ ];
+
+    $self->{ActionStack} = [ ];
+
+    # create a temporary Output_ in case we're creating a standard
+    # output file that we'll delete later.
+    if (!$self->{AsString} && !defined($self->{Output})) {
+        require IO::File;
+        IO::File->import;
+        $self->{Output_} = IO::File->new(">-");
+    } elsif (defined($self->{Output})) {
+        $self->{Output_} = $self->{Output};
+    }
+
+    if ($self->{AsString}) {
+        $self->{Strings} = [];
+    }
+}
+
+# PerlSAX end_document(): release the per-document state.  Returns the
+# collected output when the AsString option is true, undef otherwise.
+sub end_document {
+    my ($self, $document) = @_;
+
+    # notify the pattern module that we're done
+    $self->{Matcher}->finalize();
+
+    delete $self->{Output_};
+
+    # AsString is tested for truth here, exactly as in
+    # start_document() and print(); testing definedness instead made
+    # `AsString => 0' dereference a Strings list that was never created.
+    my $string = undef;
+    if ($self->{AsString}) {
+        $string = join('', @{ $self->{Strings} || [] });
+    }
+    delete $self->{Strings};
+
+    # Release the info that is just used during event handling.  The
+    # Matcher is a constructor option, so it is kept; clearing it made
+    # the handler unusable for a second document.
+    $self->{Names} = $self->{Nodes} = undef;
+    $self->{ActionStack} = undef;
+
+    return($string);
+}
+
+# PerlSAX start_element(): look up the action for this element and
+# emit its Before string, if any.
+sub start_element {
+    my ($self, $element) = @_;
+
+    push @{$self->{Names}}, $element->{Name};
+    push @{$self->{Nodes}}, $element;
+
+    my $index = $self->{Matcher}->match($element,
+                                        $self->{Names},
+                                        $self->{Nodes});
+
+    # Patterns is a flat list of pattern, action pairs; the Matcher
+    # returns the index of the pair.
+    my $action;
+    if (defined $index) {
+        $action = $self->{Patterns}[$index * 2 + 1];
+    }
+
+    push @{$self->{ActionStack}}, $action;
+
+    if (defined($action) and defined($action->{Before})) {
+        $self->print($self->_expand($action->{Before}, $element));
+    }
+}
+
+# PerlSAX end_element(): emit the After string of the action that was
+# selected when the element started.
+sub end_element {
+    my ($self, $end_element) = @_;
+
+    my $name = pop @{$self->{Names}};
+    my $element = pop @{$self->{Nodes}};
+
+    my $action = pop @{$self->{ActionStack}};
+
+    if (defined($action) and defined($action->{After})) {
+        $self->print($self->_expand($action->{After}, $element));
+    }
+}
+
+# PerlSAX characters(): copy character data straight to the output.
+sub characters {
+    my ($self, $characters) = @_;
+
+    $self->print($characters->{Data});
+}
+
+# Send $output to the output handle and/or the string buffer,
+# whichever were set up by start_document().
+sub print {
+    my ($self, $output) = @_;
+
+    $self->{Output_}->print($output)
+        if (defined($self->{Output_}));
+
+    push(@{$self->{Strings}}, $output)
+        if ($self->{AsString});
+}
+
+# private: return $template with every `[name]' replaced by the value
+# of the attribute `name' of $element; `[_element]' is replaced by the
+# element's name.
+sub _expand {
+    my ($self, $template, $element) = @_;
+
+    my $atts = $element->{Attributes};
+    $template =~ s/\[([\w.:]+)\]/
+        ($1 eq '_element') ? $element->{Name} : $atts->{$1}
+        /eg;
+
+    return $template;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+XML::PatAct::Amsterdam - An action module for simplistic style-sheets
+
+=head1 SYNOPSIS
+
+ use XML::PatAct::Amsterdam;
+
+ my $patterns = [ PATTERN => { Before => 'before',
+                               After => 'after' },
+                  ... ];
+
+ my $handler = XML::PatAct::Amsterdam->new( I<OPTIONS> );
+
+
+=head1 DESCRIPTION
+
+XML::PatAct::Amsterdam is a PerlSAX handler for applying
+pattern-action lists to XML parses or trees.  XML::PatAct::Amsterdam
+applies a very simple style sheet to an instance and outputs the
+result.  Amsterdam gets its name from the Amsterdam SGML Parser (ASP)
+which inspired this module.
+
+CAUTION: Amsterdam is a very simple style module, you will run into
+its limitations quickly with even moderately complex XML instances,
+be aware of and prepared to switch to more complete style modules.
+
+New XML::PatAct::Amsterdam instances are created by calling `new()'.
+Parameters can be passed as a list of key, value pairs or a hash.
+Patterns and Matcher options are required.
The following I<options>
+are supported:
+
+=over 4
+
+=item Patterns
+
+The pattern-action list to apply.  The list is an anonymous array of
+pattern, action pairs.  Each action in the list contains either or
+both a Before and an After string to copy to the output before and
+after processing an XML element.  The Before and After strings may
+contain attribute names enclosed in square brackets (`C<[>' I<NAME>
+`C<]>'), these are replaced with the value of the attribute with that
+name.  The special I<NAME> `C<_element>' will be replaced with the
+element's name.
+
+=item Matcher
+
+An instance of the pattern or query matching module.
+
+=item Output
+
+An IO::Handle or one of its subclasses (such as IO::File), if this
+parameter is not present and the AsString option is not used, the
+module will write to standard output.
+
+=item AsString
+
+Return the generated output as a string from the `C<parse()>' method
+of the PerlSAX event generator.
+
+=back
+
+=head1 AUTHOR
+
+Ken MacLeod, ken@bitsko.slc.ut.us
+
+=head1 SEE ALSO
+
+perl(1)
+
+``Using PatAct Modules'' and ``Creating PatAct Modules'' in libxml-perl.
+
+=cut
diff --git a/lib/XML/PatAct/MatchName.pm b/lib/XML/PatAct/MatchName.pm
new file mode 100644
index 0000000..31cc558
--- /dev/null
+++ b/lib/XML/PatAct/MatchName.pm
@@ -0,0 +1,99 @@
+#
+# Copyright (C) 1999 Ken MacLeod
+# XML::PatAct::MatchName is free software; you can redistribute it and/or
+# modify it under the same terms as Perl itself.
+#
+# $Id: MatchName.pm,v 1.3 1999/12/22 21:15:00 kmacleod Exp $
+#
+
+use strict;
+
+package XML::PatAct::MatchName;
+
+use vars qw{ $VERSION };
+
+# will be substituted by make-rel script
+$VERSION = "0.08";
+
+# Create a new matcher.  Options are given either as a list of key,
+# value pairs or as a single hash reference (which is copied).  The
+# Patterns option holds the pattern-action list that match() searches.
+sub new {
+    my $type = shift;
+    my $self = (@_ == 1) ? { %{ $_[0] } } : { @_ };
+
+    return bless $self, $type;
+}
+
+# This is functionally equivalent to PerlSAX `start_document()'
+sub initialize {
+    my ($self, $driver) = @_;
+    $self->{Driver} = $driver;
+}
+
+# This is functionally equivalent to PerlSAX `end_document()'
+sub finalize {
+    my $self = shift;
+
+    $self->{Driver} = undef;
+}
+
+# This is functionally equivalent to a PerlSAX `start_element()'
+#
+# $names is the list of element names from the root down to the
+# current element.  Returns the index of the pattern-action pair whose
+# pattern matches the end of that path, or undef if none matches.
+# When several patterns match, the one naming the most path components
+# wins (as the POD promises); between equally specific patterns the
+# one listed first wins.
+sub match {
+    my ($self, $element, $names, $nodes) = @_;
+
+    my $names_path = '/' . join('/', @$names);
+    my $patterns = $self->{Patterns} || [ ];
+
+    my ($best_index, $best_depth);
+    for (my $ii = 0; $ii <= $#$patterns; $ii += 2) {
+        my $pattern = $patterns->[$ii];
+
+        # NOTE(review): the pattern is interpolated as a regular
+        # expression, so `.' in a name also matches any character;
+        # kept as is for compatibility.
+        next unless $names_path =~ m|/$pattern$|;
+
+        # a pattern with N slashes names N + 1 elements
+        my $depth = ($pattern =~ tr{/}{}) + 1;
+        if (!defined($best_depth) or $depth > $best_depth) {
+            $best_index = $ii / 2;
+            $best_depth = $depth;
+        }
+    }
+
+    return $best_index;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+XML::PatAct::MatchName - A pattern module for matching element names
+
+=head1 SYNOPSIS
+
+ use XML::PatAct::MatchName;
+
+ my $patterns = [ 'foo' => ACTION,
+                  'bar/foo' => ACTION,
+                  ... ];
+
+ my $matcher = XML::PatAct::MatchName->new( Patterns => $patterns );
+
+=head1 DESCRIPTION
+
+XML::PatAct::MatchName is a pattern module for use with PatAct drivers
+for applying pattern-action lists to XML parses or trees.
+XML::PatAct::MatchName is a simple pattern module that uses just
+element names to match on.  If multiple names are supplied separated
+by `C</>' characters, then all of the parent element names must match
+as well.
+
+The order of patterns in the list is not significant.
+XML::PatAct::MatchName will use the most specific match.  Using the
+synopsis above as an example, if you have an element `C<foo>',
+`C<bar/foo>' will match if `C<foo>' is in an element `C<bar>',
+otherwise just the pattern with `C<foo>' will match.
+
+=head1 AUTHOR
+
+Ken MacLeod, ken@bitsko.slc.ut.us
+
+=head1 SEE ALSO
+
+perl(1)
+
+``Using PatAct Modules'' and ``Creating PatAct Modules'' in libxml-perl.
+
+=cut
diff --git a/lib/XML/PatAct/PatternTempl.pm b/lib/XML/PatAct/PatternTempl.pm
new file mode 100644
index 0000000..7cc09ba
--- /dev/null
+++ b/lib/XML/PatAct/PatternTempl.pm
@@ -0,0 +1,82 @@
+# This template file is in the Public Domain.
+# You may do anything you want with this file.
+#
+# $Id: PatternTempl.pm,v 1.2 1999/08/16 16:04:03 kmacleod Exp $
+#
+
+# replace all occurrences of PATTERN with the name of your module!
+
+use strict;
+
+package XML::PatAct::PATTERN;
+
+# Constructor.  Accepts its options as key, value pairs or as one
+# hash reference; either way the object is a fresh hash.
+sub new {
+    my ($type, @options) = @_;
+
+    my $self = (@options == 1) ? { %{ $options[0] } } : { @options };
+
+    # perform any one-time initializations
+
+    return bless $self, $type;
+}
+
+# Called by the driver at the start of each XML instance; remembers
+# the driver for the duration of the parse.
+sub initialize {
+    my $self = shift;
+    my $driver = shift;
+
+    # perform initializations for each XML instance
+
+    $self->{Driver} = $driver;
+}
+
+# Called by the driver at the end of each XML instance; forgets the
+# driver again.
+sub finalize {
+    my ($self) = @_;
+
+    # clean up any state information
+
+    $self->{Driver} = undef;
+}
+
+# Called by the driver for each element.  The template matches
+# nothing and always answers undef.
+sub match {
+    my ($self, $element, $names, $nodes) = @_;
+
+    # Use the Patterns list to match a pattern
+
+    return undef;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+XML::PatAct::PATTERN - A pattern module for
+
+=head1 SYNOPSIS
+
+ use XML::PatAct::PATTERN;
+
+ my $patterns = [ PATTERN => ACTION,
+                  ... ]
+
+ my $matcher = XML::PatAct::PATTERN->new( Patterns => $patterns );
+
+=head1 DESCRIPTION
+
+XML::PatAct::PATTERN is a pattern module for use with PatAct action
+modules for applying pattern-action lists to XML parses or trees.
+XML::PatAct::PATTERN ...
+
+Parameters can be passed as a list of key, value pairs or a hash.
+
+DESCRIBE THE FORMAT OR LANGUAGE OF YOUR PATTERNS HERE
+
+=head1 AUTHOR
+
+This template file was written by Ken MacLeod, ken@bitsko.slc.ut.us
+
+=head1 SEE ALSO
+
+perl(1)
+
+``Using PatAct Modules'' and ``Creating PatAct Modules'' in libxml-perl.
+
+=cut
diff --git a/lib/XML/PatAct/ToObjects.pm b/lib/XML/PatAct/ToObjects.pm
new file mode 100644
index 0000000..8aedd21
--- /dev/null
+++ b/lib/XML/PatAct/ToObjects.pm
@@ -0,0 +1,576 @@
+#
+# Copyright (C) 1999 Ken MacLeod
+# XML::PatAct::ToObjects is free software; you can redistribute it and/or
+# modify it under the same terms as Perl itself.
+#
+# $Id: ToObjects.pm,v 1.5 1999/12/22 21:15:00 kmacleod Exp $
+#
+
+# The original XML::Grove::ToObjects actually generated and compiled a
+# sub for matching actions, possibly a performance improvement of three
+# or four times over all the comparisons made in start_element() and
+# end_element().
+
+use strict;
+
+use UNIVERSAL;
+
+package XML::PatAct::ToObjects;
+use vars qw{ $VERSION $name_re };
+
+# will be substituted by make-rel script
+$VERSION = "0.08";
+
+# FIXME I doubt this is a correct Perl RE for productions [4] and
+# [5] in the XML 1.0 specification, especially considering Unicode chars
+$name_re = '[A-Za-z_:][A-Za-z0-9._:-]*';
+
+# Create a new ToObjects handler.  Options are given either as a list
+# of key, value pairs or as a single hash reference (which is copied).
+# Dies with a usage message unless both Matcher and Patterns are
+# defined, and dies on an unknown action option.
+sub new {
+    my $type = shift;
+    my $self = (@_ == 1) ? { %{ $_[0] } } : { @_ };
+
+    bless $self, $type;
+
+    my $usage = <<'EOF';
+usage: XML::PatAct::ToObjects->new( Matcher => $matcher,
+                                    Patterns => $patterns );
+EOF
+
+    die "No Matcher specified\n$usage\n"
+        if !defined $self->{Matcher};
+    die "No Patterns specified\n$usage\n"
+        if !defined $self->{Patterns};
+
+    # Parse action items.  Exactly one entry is stored per
+    # pattern, action pair, because start_element() indexes Actions
+    # with the pair index returned by the Matcher.
+    $self->{Actions} = [ ];
+    my $patterns = $self->{Patterns};
+    for (my $ii = 1; $ii <= $#$patterns; $ii += 2) {
+        my $action = $patterns->[$ii];
+        if (ref $action) {
+            push @{$self->{Actions}}, scalar $self->_parse_action($action);
+        } else {
+            # is a code fragment; these are not implemented (see TODO),
+            # so store a placeholder that behaves like `no action'
+            push @{$self->{Actions}}, undef;
+        }
+    }
+
+    # _parse_action() flags GroveBuilder when a -grove or
+    # -grove-contents option was seen
+    if (defined $self->{GroveBuilder}) {
+        require XML::Grove::Builder;
+        XML::Grove::Builder->import;
+        $self->{GroveBuilder} = XML::Grove::Builder->new();
+    }
+
+    return $self;
+}
+
+# PerlSAX start_document(): set up the per-document state.
+sub start_document {
+    my ($self, $document) = @_;
+
+    $self->{Matcher}->initialize($self);
+    $self->{Parents} = [ { Contents => [ ] } ];
+    $self->{ActionStack} = [ ];
+    $self->{States} = [ 'normal' ];
+    $self->{Document} = $document;
+    $self->{Names} = [ ];
+    $self->{Nodes} = [ ];
+    $self->{Data} = undef;
+    $self->{SourceIsGrove} = UNIVERSAL::isa($document, 'Data::Grove');
+    if (!defined $self->{CharacterDataType}) {
+        require Data::Grove;
+        Data::Grove->import;
+        $self->{CharacterDataType} = 'Data::Grove::Characters';
+    }
+}
+
+# PerlSAX end_document(): return the list of top-level objects that
+# were created and release the per-document state.
+sub end_document {
+    my ($self, $document) = @_;
+
+    $self->{Matcher}->finalize();
+    # FIXME check to make sure no other fields were assigned to
+    my $value = $self->{Parents}[0]{Contents};
+
+    # Release the info that is just used during event handling.  The
+    # Matcher is a constructor option, so it is kept; clearing it made
+    # the handler unusable for a second document.
+    $self->{Parents} = $self->{ActionStack} = undef;
+    $self->{States} = $self->{Document} = $self->{Names} = undef;
+    $self->{Nodes} = $self->{Data} = $self->{SourceIsGrove} = undef;
+
+    return $value;
+}
+
+# PerlSAX start_element(): select the action for this element and
+# enter the state that action asks for.  The new state is pushed on
+# States and the action on ActionStack; end_element() pops both.
+sub start_element {
+    my ($self, $element) = @_;
+
+    push @{$self->{Names}}, $element->{Name};
+    push @{$self->{Nodes}}, $element;
+
+    my $index = $self->{Matcher}->match($element,
+                                        $self->{Names},
+                                        $self->{Nodes});
+
+    my $action;
+    if (defined $index) {
+        $action = $self->{Actions}[$index];
+    }
+
+    # children inherit the state of their parent by default
+    my $state = $self->{States}[-1];
+    push @{$self->{States}}, $state;
+
+    if (($state eq 'as-grove') and !$self->{SourceIsGrove}) {
+        $self->{GroveBuilder}->start_element($element);
+    }
+
+    if (($state ne 'normal') && ($state ne 'pcdata')) {
+        # We are inside content that is being discarded, collected as
+        # a string, or copied to a grove.  Nothing is set up for this
+        # element here, so end_element() must not run its action
+        # either; record `no action' instead of the matched one.
+        push @{$self->{ActionStack}}, undef;
+        return;
+    }
+
+    push @{$self->{ActionStack}}, $action;
+
+    if (defined($action) and defined($action->{PCData})) {
+        $self->{States}[-1] = 'pcdata';
+    }
+
+    if (!defined($action) or $action->{Holder}) {
+        # ignore this element but continue processing below
+        return;
+    }
+
+    # FieldValue is tested with defined(), as end_element() does, so
+    # that a false literal such as `-value 0' is handled the same way
+    # at both ends of the element.
+    if ($action->{Ignore} or defined($action->{FieldValue})) {
+        # ignore (discard) this element and its children
+        $self->{States}[-1] = 'discarding';
+        return;
+    }
+
+    if ($action->{AsString}) {
+        $self->{Data} = [ ];
+        $self->{States}[-1] = 'as-string';
+        return;
+    }
+
+    if ($action->{AsGrove}) {
+        $self->{States}[-1] = 'as-grove';
+        if (!$self->{SourceIsGrove}) {
+            $self->{GroveBuilder}->start_document( { } );
+            $self->{GroveBuilder}->start_element($element);
+        }
+        return;
+    }
+
+    if (defined $action->{Make}) {
+        my @args;
+        if (defined $element->{Attributes}) {
+            if (defined $self->{CopyAttributes}) {
+                push @args, %{$element->{Attributes}};
+            } elsif ($self->{CopyId} && defined($element->{Attributes}{ID})) {
+                # FIXME use code from XML::Grove::IDs
+                push (@args, ID => $element->{Attributes}{ID});
+            }
+        }
+
+        if (defined $action->{Args}) {
+            # The -args text is Perl code supplied in the pattern list;
+            # _parse_action() rewrote `%{name}' into references to
+            # $element, so $element and @args must keep these names.
+            # NOTE(review): string eval; never build -args from
+            # untrusted input.
+            eval 'push (@args, (' . $action->{Args} . '))';
+            if ($@) {
+                warn "$@\nwhile processing pattern/action #$index\n";
+            }
+        }
+
+        if ($action->{Make} eq 'HASH') {
+            push @{$self->{Parents}}, { @args };
+        } else {
+            # the check for an existing package is disabled, so the
+            # object is always a plain blessed hash
+            my $is_defined = 0;
+            #eval "\$is_defined = defined %{$action->{Make}" . "::}";
+            if ($is_defined) {
+                push @{$self->{Parents}}, $action->{Make}->new( @args );
+            } else {
+                push (@{$self->{Parents}},
+                      bless ({ @args }, $action->{Make}));
+            }
+        }
+
+        if ($action->{ContentsAsGrove}) {
+            $self->{States}[-1] = 'as-grove';
+            if (!$self->{SourceIsGrove}) {
+                $self->{GroveBuilder}->start_document( { } );
+            }
+        }
+
+        return;
+    }
+
+    # Place to store all the rest of gathered contents
+    push (@{$self->{Parents}}, { } );
+}
+
+# PerlSAX end_element(): compute the value this element produced and
+# store it in the parent object, either in the field named by the
+# action or in the parent's Contents list.
+sub end_element {
+    my ($self, $end_element) = @_;
+
+    my $name = pop @{$self->{Names}};
+    my $element = pop @{$self->{Nodes}};
+
+    my $action = pop @{$self->{ActionStack}};
+    my $state = pop @{$self->{States}};
+
+    if ($state eq 'as-grove' and !$self->{SourceIsGrove}) {
+        $self->{GroveBuilder}->end_element($end_element);
+    }
+
+    if (!defined($action) or $action->{Holder}) {
+        return;
+    }
+
+    if ($action->{Ignore}) {
+        return;
+    }
+
+    my $value;
+
+    if ($action->{AsString}) {
+        $value = join("", @{$self->{Data}});
+    } elsif ($action->{AsGrove}) {
+        if ($self->{SourceIsGrove}) {
+            $value = $element;
+        } else {
+            # get just the root element of the document fragment
+            $value = $self->{GroveBuilder}->end_document({ })->{Contents}[0];
+        }
+    } elsif (defined $action->{FieldValue}) {
+        # substitute `%{name}' with the value of attribute `name'
+        $value = $action->{FieldValue};
+        $value =~ s/%\{($name_re)\}/$element->{Attributes}{$1}/ge;
+    } elsif (defined $action->{Make}) {
+        $value = pop @{$self->{Parents}};
+        if ($action->{ContentsAsGrove}) {
+            if ($self->{SourceIsGrove}) {
+                $value->{Contents} = $element->{Contents};
+            } else {
+                $value->{Contents} =
+                    $self->{GroveBuilder}->end_document({ })->{Contents};
+            }
+        }
+    } else {
+        $value = pop(@{$self->{Parents}})->{Contents};
+    }
+
+    if ($action->{FieldIsArray}) {
+        push @{$self->{Parents}[-1]{$action->{Field}}}, $value;
+    } elsif (defined $action->{Field}) {
+        $self->{Parents}[-1]{$action->{Field}} = $value;
+    } else {
+        push @{$self->{Parents}[-1]{Contents}}, $value;
+    }
+}
+
+# PerlSAX characters(): route character data according to the current
+# state; in any other state the data is dropped.
+sub characters {
+    my ($self, $characters) = @_;
+
+    my $state = $self->{States}[-1];
+    if ($state eq 'as-string') {
+        push @{$self->{Data}}, $characters->{Data};
+    } elsif ($state eq 'as-grove' and !$self->{SourceIsGrove}) {
+        $self->{GroveBuilder}->characters($characters);
+    } elsif ($state eq 'pcdata') {
+        push (@{$self->{Parents}[-1]{Contents}},
+              $self->{CharacterDataType}->new(%$characters));
+    }
+}
+
+# we ignore processing instructions and ignorable whitespace by not
+# defining those functions
+
+###
+### private functions
+###
+
+# Turn one option list (an array reference such as
+# [ qw{ -field Name -as-string } ]) into an action hash.  Dies on an
+# unknown option.  The caller's list is left untouched, so the same
+# pattern list can be given to more than one handler.
+sub _parse_action {
+    my $self = shift; my $source = shift;
+
+    my @options = @$source;
+    my $action = {};
+
+    while (@options) {
+        my $option = shift @options;
+        if ($option eq '-holder') {
+            $action->{Holder} = 1;
+        } elsif ($option eq '-make') {
+            $action->{Make} = shift @options;
+        } elsif ($option eq '-args') {
+            # rewrite `%{name}' into Perl code that reads attribute
+            # `name'; start_element() evaluates the result
+            my $args = shift @options;
+            $args =~ s/%\{($name_re)\}/(\$element->{Attributes}{'$1'})/g;
+            $action->{Args} = $args;
+        } elsif ($option eq '-field') {
+            $action->{Field} = shift @options;
+        } elsif ($option eq '-push-field') {
+            $action->{Field} = shift @options;
+            $action->{FieldIsArray} = 1;
+        } elsif ($option eq '-as-string') {
+            $action->{AsString} = 1;
+        } elsif ($option eq '-value') {
+            $action->{FieldValue} = shift @options;
+        } elsif ($option eq '-grove') {
+            $self->{GroveBuilder} = 1;
+            $action->{AsGrove} = 1;
+        } elsif ($option eq '-grove-contents') {
+            $self->{GroveBuilder} = 1;
+            $action->{ContentsAsGrove} = 1;
+        } elsif ($option eq '-ignore') {
+            $action->{Ignore} = 1;
+        } elsif ($option eq '-pcdata') {
+            $action->{PCData} = 1;
+        } else {
+            die "$option: undefined option\n";
+        }
+    }
+
+    return $action;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+XML::PatAct::ToObjects - An action module for creating Perl objects
+
+=head1 SYNOPSIS
+
+ use XML::PatAct::ToObjects;
+
+ my $patterns = [ PATTERN => [ OPTIONS ],
+                  PATTERN => "PERL-CODE",
+                  ...
];
+
+ my $handler = XML::PatAct::ToObjects->new( Patterns => $patterns,
+                                            Matcher => $matcher,
+                                            CopyId => 1,
+                                            CopyAttributes => 1 );
+
+
+=head1 DESCRIPTION
+
+XML::PatAct::ToObjects is a PerlSAX handler for applying
+pattern-action lists to XML parses or trees.  XML::PatAct::ToObjects
+creates Perl objects of the types and contents of the action items you
+define.
+
+New XML::PatAct::ToObjects instances are created by calling `new()'.
+Parameters can be passed as a list of key, value pairs or a hash.
+`new()' requires the Patterns and Matcher parameters, the rest are
+optional:
+
+=over 4
+
+=item Patterns
+
+The pattern-action list to apply.
+
+=item Matcher
+
+An instance of the pattern or query matching module.
+
+=item CopyId
+
+Causes the `ID' attribute, if any, in a source XML element to be
+copied to an `ID' attribute in newly created objects.  Note that IDs
+may be lost if no pattern matches that element or an object is not
+created (C<-make>) for that element.
+
+=item CopyAttributes
+
+Causes all attributes of the element to be copied to the newly created
+objects.
+
+=back
+
+Each action can either be a list of options defined below or a string
+containing a fragment of Perl code.  If the action is a string of Perl
+code then some simple substitutions are made as described
+further below.
+
+Options that can be used in an action item containing an option-list:
+
+=over 4
+
+=item B<-holder>
+
+Ignore this element, but continue processing its children (compare to
+B<-ignore>).  C<-pcdata> may be used with this option.
+
+=item B<-ignore>
+
+Ignore (discard) this element and its children (compare to B<-holder>).
+
+=item B<-pcdata>
+
+Character data in this element should be copied to the C<Contents>
+field.
+
+=item B<-make> I<PACKAGE>
+
+Create an object blessed into I<PACKAGE>, and continue processing this
+element and its children.  I<PACKAGE> may be the type `C<HASH>' to
+simply create an anonymous hash.
+
+=item B<-args> I<ARGUMENTS>
+
+Use I<ARGUMENTS> in creating the object specified by B<-make>.  This
+is commonly used to copy element attributes into fields in the newly
+created object.  For example:
+
+  -make => 'HASH', -args => 'URL => %{href}'
+
+would copy the `C<href>' attribute in an element to the `C<URL>' field
+of the newly created hash.
+
+=item B<-field> I<FIELD>
+
+Store this element, object, or children of this element in the parent
+object's field named by I<FIELD>.
+
+=item B<-push-field> I<FIELD>
+
+Similar to B<-field>, except that I<FIELD> is an array and the
+contents are pushed onto that array.
+
+=item B<-value> I<VALUE>
+
+Use I<VALUE> as a literal value to store in I<FIELD>, otherwise
+ignoring this element and its children.  Only valid with B<-field> or
+B<-push-field>.  `C<%{I<ATTRIBUTE>}>' notation can be used to
+substitute the value of an attribute into the literal value.
+
+=item B<-as-string>
+
+Convert the contents of this element to a string (as in
+C<XML::Grove::AsString>) and store in I<FIELD>.  Only valid with
+B<-field> or B<-push-field>.
+
+=item B<-grove>
+
+Copy this element to I<FIELD> without further processing.  The element
+can then be processed later as the Perl objects are manipulated.  Only
+valid with B<-field> or B<-push-field>.  If ToObjects is used with
+PerlSAX, this will use XML::Grove::Builder to build the grove element.
+
+=item B<-grove-contents>
+
+Used with B<-make>, B<-grove-contents> creates an object but then
+takes all of the content of that element and stores it in Contents.
+
+=back
+
+If an action item is a string, that string is treated as a fragment of
+Perl code.  The following simple substitutions are performed on the
+fragment to provide easy access to the information being converted:
+
+=over 4
+
+=item B<@ELEM@>
+
+The object that caused this action to be called.  If ToObjects is used
+with PerlSAX this will be a hash with the element name and attributes,
+with XML::Grove this will be the element object, with Data::Grove it
+will be the matching object, and with XML::DOM it will be an
+XML::DOM::Element.
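+
+=back
+
+=head1 EXAMPLE
+
+The example pattern-action list below will convert the following XML
+representing a Database schema:
+
+    <schema>
+      <table>
+        <name>MyTable</name>
+        <summary>A short summary</summary>
+        <description>A long description that may
+          contain a subset of HTML</description>
+        <column>
+          <name>MyColumn1</name>
+          <summary>A short summary</summary>
+          <description>A long description</description>
+          <unique/>
+          <non-null/>
+          <default>42</default>
+        </column>
+      </table>
+    </schema>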
+ +into Perl objects looking like: + + [ + { Name => "MyTable", + Summary => "A short summary", + Description => $grove_object, + Columns => [ + { Name => "MyColumn1", + Summary => "A short summary", + Description => $grove_object, + Unique => 1, + NonNull => 1, + Default => 42 + } + ] + } + ] + +Here is a Perl script and pattern-action list that will perform the +conversion using the simple name matching pattern module +XML::PatAct::MatchName. The script accepts a Schema XML file as an +argument (C<$ARGV[0]>) to the script. This script creates a grove as +one of it's objects, so it requires the XML::Grove module. + + use XML::Parser::PerlSAX; + use XML::PatAct::MatchName; + use XML::PatAct::ToObjects; + + my $patterns = [ + 'schema' => [ qw{ -holder } ], + 'table' => [ qw{ -make Schema::Table } ], + 'name' => [ qw{ -field Name -as-string } ], + 'summary' => [ qw{ -field Summary -as-string } ], + 'description' => [ qw{ -field Description -grove } ], + 'column' => [ qw{ -make Schema::Column -push-field Columns } ], + 'unique' => [ qw{ -field Unique -value 1 } ], + 'non-null' => [ qw{ -field NonNull -value 1 } ], + 'default' => [ qw{ -field Default -as-string } ], + ]; + + my $matcher = XML::PatAct::MatchName->new( Patterns => $patterns ); + my $handler = XML::PatAct::ToObjects->new( Patterns => $patterns, + Matcher => $matcher); + + my $parser = XML::Parser::PerlSAX->new( Handler => $handler ); + my $schema = $parser->parse(Source => { SystemId => $ARGV[0] } ); + +=head1 TODO + +=over 4 + +=item * + +It'd be nice if patterns could be applied even in B<-as-string> and +B<-grove>. + +=item * + +Implement Perl code actions. + +=item * + +B<-as-xml> to write XML into the field. + +=back + + + +=head1 AUTHOR + +Ken MacLeod, ken@bitsko.slc.ut.us + +=head1 SEE ALSO + +perl(1), Data::Grove(3) + +``Using PatAct Modules'' and ``Creating PatAct Modules'' in libxml-perl. 
+
+=cut
diff --git a/lib/XML/Perl2SAX.pm b/lib/XML/Perl2SAX.pm
new file mode 100644
index 0000000..0eea6d5
--- /dev/null
+++ b/lib/XML/Perl2SAX.pm
@@ -0,0 +1,120 @@
+#
+# Copyright (C) 1998 Ken MacLeod
+# XML::Perl2SAX is free software; you can redistribute it and/or
+# modify it under the same terms as Perl itself.
+#
+# $Id: Perl2SAX.pm,v 1.3 1999/12/22 21:15:00 kmacleod Exp $
+#
+
+use strict;
+
+package XML::Perl2SAX;
+
+use vars qw{ $VERSION };
+
+# will be substituted by make-rel script
+$VERSION = "0.08";
+
+# Constructor.  A single argument is taken to be the option hash and
+# is blessed in place; otherwise the arguments are key, value pairs.
+# The DocumentHandler option is the Java/CORBA style handler that
+# receives the translated calls.
+sub new {
+    my ($type, @options) = @_;
+
+    my $self = (@options == 1) ? $options[0] : { @options };
+
+    return bless $self, $type;
+}
+
+# start_document -> setDocumentLocator (only when a true Locator
+# property is given) followed by startDocument.  The properties may be
+# one hash reference or key, value pairs.
+sub start_document {
+    my ($self, @args) = @_;
+
+    my $properties = (@args == 1) ? $args[0] : { @args };
+    my $target = $self->{DocumentHandler};
+
+    $target->setDocumentLocator($properties->{Locator})
+        if $properties->{Locator};
+
+    $target->startDocument;
+}
+
+# end_document -> endDocument
+sub end_document {
+    my ($self) = @_;
+
+    my $target = $self->{DocumentHandler};
+    $target->endDocument;
+}
+
+# start_element -> startElement(name, attributes)
+sub start_element {
+    my ($self, $properties) = @_;
+
+    # FIXME depends on how Perl SAX treats attributes
+    my $target = $self->{DocumentHandler};
+    $target->startElement($properties->{Name}, $properties->{Attributes});
+}
+
+# end_element -> endElement(name)
+sub end_element {
+    my ($self, $properties) = @_;
+
+    my $target = $self->{DocumentHandler};
+    $target->endElement($properties->{Name});
+}
+
+# characters -> characters(ch, start, length); the whole Data string
+# is passed, so start is always 0
+sub characters {
+    my ($self, $properties) = @_;
+
+    my $target = $self->{DocumentHandler};
+    $target->characters($properties->{Data},
+                        0,
+                        length($properties->{Data}));
+}
+
+# ignorable_whitespace -> ignorableWhitespace(ch, start, length)
+sub ignorable_whitespace {
+    my ($self, $properties) = @_;
+
+    my $target = $self->{DocumentHandler};
+    $target->ignorableWhitespace($properties->{Data},
+                                 0,
+                                 length($properties->{Data}));
+}
+
+# processing_instruction -> processingInstruction(target, data)
+sub processing_instruction {
+    my ($self, $properties) = @_;
+
+    my $target = $self->{DocumentHandler};
+    $target->processingInstruction($properties->{Target},
+                                   $properties->{Data});
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+XML::Perl2SAX -- translate Perl SAX methods to Java/CORBA style methods
+
+=head1 SYNOPSIS
+
+ use XML::Perl2SAX;
+
+ $perl2sax = XML::Perl2SAX->new(DocumentHandler => $java_style_handler);
+
+=head1 DESCRIPTION
+
+C<XML::Perl2SAX> is a SAX filter that translates Perl style SAX
+methods to Java/CORBA style method calls.  This module performs the
+inverse operation from C<XML::SAX2Perl>.
+
+C<Perl2SAX> is a Perl SAX document handler.  The `C<new>' method takes
+a `C<DocumentHandler>' argument that is a Java/CORBA style handler that the
+new Perl2SAX instance will call.  The SAX interfaces are defined at
+<http://www.megginson.com/SAX/>.
+
+=head1 AUTHOR
+
+Ken MacLeod <ken@bitsko.slc.ut.us>
+
+=head1 SEE ALSO
+
+perl(1), XML::SAX2Perl(3).
+
+ Extensible Markup Language (XML) <http://www.w3c.org/XML/>
+ Simple API for XML (SAX) <http://www.megginson.com/SAX/>
+
+=cut
diff --git a/lib/XML/SAX2Perl.pm b/lib/XML/SAX2Perl.pm
new file mode 100644
index 0000000..11f02c8
--- /dev/null
+++ b/lib/XML/SAX2Perl.pm
@@ -0,0 +1,261 @@
+#
+# Copyright (C) 1998 Ken MacLeod
+# XML::SAX2Perl is free software; you can redistribute it and/or
+# modify it under the same terms as Perl itself.
+#
+# $Id: SAX2Perl.pm,v 1.4 2001/07/23 15:47:15 kmacleod Exp $
+#
+
+use strict;
+
+package XML::SAX2Perl;
+
+use vars qw{ $VERSION };
+
+# will be substituted by make-rel script
+$VERSION = "0.08";
+
+# Constructor.  A single argument is taken to be the option hash and
+# is blessed in place; otherwise the arguments are key, value pairs.
+# Events are sent to the DocumentHandler option or, when that is not
+# given, to the Handler option.
+sub new {
+    my $type = shift;
+    my $self = ($#_ == 0) ? shift : { @_ };
+
+    return bless $self, $type;
+}
+
+# private: the Perl SAX handler that receives the translated events
+sub _document_handler {
+    my $self = shift;
+
+    return defined($self->{DocumentHandler})
+        ? $self->{DocumentHandler}
+        : $self->{Handler};
+}
+
+# setDocumentLocator(locator): remember the locator; it is handed to
+# the Perl handler by the next startDocument().
+sub setDocumentLocator {
+    my $self = shift;
+
+    # this used to read `my $self->{Locator} = shift;', which is not
+    # valid Perl and kept the whole module from compiling
+    $self->{Locator} = shift;
+}
+
+# startDocument() -> start_document({ Locator => ... })
+#
+# Perl SAX handlers receive their properties as a single hash
+# reference, so every translated event below passes one.  The property
+# is named `Locator', the key XML::Perl2SAX reads.
+sub startDocument {
+    my $self = shift;
+
+    my %properties;
+    if (defined $self->{Locator}) {
+        $properties{Locator} = $self->{Locator};
+    }
+
+    $self->_document_handler->start_document({ %properties });
+}
+
+# endDocument() -> end_document({ })
+sub endDocument {
+    my $self = shift;
+
+    $self->_document_handler->end_document({ });
+}
+
+# startElement(name, atts) -> start_element({ Name, Attributes })
+sub startElement {
+    my $self = shift;
+    my $name = shift;
+    my $attributes = shift;
+
+    # FIXME depends on how Perl SAX treats attributes
+    $self->_document_handler->start_element({ Name => $name,
+                                              Attributes => $attributes });
+}
+
+# endElement(name) -> end_element({ Name })
+sub endElement {
+    my $self = shift;
+    my $name = shift;
+
+    $self->_document_handler->end_element({ Name => $name });
+}
+
+# characters(ch, start, length) -> characters({ Data }), where Data is
+# the $length characters of $ch beginning at offset $start
+sub characters {
+    my $self = shift;
+    my $ch = shift;
+    my $start = shift;
+    my $length = shift;
+
+    $self->_document_handler->characters(
+        { Data => substr($ch, $start, $length) });
+}
+
+# ignorableWhitespace(ch, start, length) ->
+# ignorable_whitespace({ Data })
+sub ignorableWhitespace {
+    my $self = shift;
+    my $ch = shift;
+    my $start = shift;
+    my $length = shift;
+
+    $self->_document_handler->ignorable_whitespace(
+        { Data => substr($ch, $start, $length) });
+}
+
+# processingInstruction(target, data) ->
+# processing_instruction({ Target, Data })
+sub processingInstruction {
+    my $self = shift;
+    my $target = shift;
+    my $data = shift;
+
+    $self->_document_handler->processing_instruction({ Target => $target,
+                                                       Data => $data });
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+XML::SAX2Perl -- translate Java/CORBA style SAX methods to Perl methods
+
+=head1 SYNOPSIS
+
+ use XML::SAX2Perl;
+
+ $sax2perl = XML::SAX2Perl->new(Handler => $my_handler);
+ $sax->setDocumentHandler($sax2perl);
+
+=head1 DESCRIPTION
+
+C<XML::SAX2Perl> is a SAX filter that translates Java/CORBA style SAX
+methods to Perl style method calls.  This man page summarizes the
+specific options, handlers, and properties supported by
+C<XML::SAX2Perl>; please refer to the Perl SAX standard C<PerlSAX.pod>
+for general usage information.
+
+=head1 METHODS
+
+=over 4
+
+=item new
+
+Creates a new parser object.
Default options for parsing, described +below, are passed as key-value pairs or as a single hash. Options may +be changed directly in the parser object unless stated otherwise. +Options passed to `C<parse()>' override the default options in the +parser object for the duration of the parse. + +=item parse + +Parses a document. Options, described below, are passed as key-value +pairs or as a single hash. Options passed to `C<parse()>' override +default options in the parser object. + +=item location + +Returns the location as a hash: + + ColumnNumber The column number of the parse. + LineNumber The line number of the parse. + PublicId A string containing the public identifier, or undef + if none is available. + SystemId A string containing the system identifier, or undef + if none is available. + +=item SAX DocumentHandler Methods + +The following methods are DocumentHandler methods that the SAX 1.0 +parser will call and C<XML::SAX2Perl> will translate to Perl SAX +method calls. See SAX 1.0 for details. + + setDocumentLocator(locator) + startDocument() + endDocument() + startElement(name, atts) + endElement(name) + characters(ch, start, length) + ignorableWhitespace(ch, start, length) + processingInstruction(target, data) + +=back + +=head1 OPTIONS + +The following options are supported by C<XML::SAX2Perl>: + + Handler default handler to receive events + DocumentHandler handler to receive document events + DTDHandler handler to receive DTD events + ErrorHandler handler to receive error events + EntityResolver handler to resolve entities + Locale locale to provide localisation for errors + Source hash containing the input source for parsing + +If no handlers are provided then all events will be silently ignored, +except for `C<fatal_error()>' which will cause a `C<die()>' to be +called after calling `C<end_document()>'. + +If a single string argument is passed to the `C<parse()>' method, it +is treated as if a `C<Source>' option was given with a `C<String>' +parameter. 
+ +The `C<Source>' hash may contain the following parameters: + + ByteStream The raw byte stream (file handle) containing the + document. + String A string containing the document. + SystemId The system identifier (URI) of the document. + PublicId The public identifier. + Encoding A string describing the character encoding. + +If more than one of `C<ByteStream>', `C<String>', or `C<SystemId>' is given, +then preference is given first to `C<ByteStream>', then `C<String>', +then `C<SystemId>'. + +=head1 HANDLERS + +The following handlers and properties are supported by +C<XML::SAX2Perl>: + +=head2 DocumentHandler methods + +=over 4 + +=item start_document + +Receive notification of the beginning of a document. + + Locator An object that can return the location of any SAX + document event. + +=item end_document + +Receive notification of the end of a document. + +No properties defined. + +=item start_element + +Receive notification of the beginning of an element. + + Name The element type name. + Attributes Attributes attached to the element, if any. + +ALPHA WARNING: The `C<Attributes>' value is not translated from the +SAX 1.0 value, so it will contain an AttributeList object. + +=item end_element + +Receive notification of the end of an element. + + Name The element type name. + +=item characters + +Receive notification of character data. + + Data The characters from the XML document. + +=item ignorable_whitespace + +Receive notification of ignorable whitespace in element content. + + Data The characters from the XML document. + +=item processing_instruction + +Receive notification of a processing instruction. + + Target The processing instruction target. + Data The processing instruction data, if any. + +=back + +=head1 AUTHOR + +Ken MacLeod <ken@bitsko.slc.ut.us> + +=head1 SEE ALSO + +perl(1), XML::Perl2SAX(3). 
+ + Extensible Markup Language (XML) + Simple API for XML (SAX) + +=cut diff --git a/libxml-perl-0.08.spec b/libxml-perl-0.08.spec new file mode 100644 index 0000000..1483ac9 --- /dev/null +++ b/libxml-perl-0.08.spec @@ -0,0 +1,78 @@ +Summary: Collection of Perl modules for working with XML +Name: libxml-perl +Version: 0.08 +Release: 1 +Source: http://www.perl.com/CPAN/modules/by-module/XML/libxml-perl-0.08.tar.gz +Copyright: Artistic or GPL +Group: Applications/Publishing/XML +URL: http://www.perl.com/ +Packager: ken@bitsko.slc.ut.us (Ken MacLeod) +BuildRoot: /tmp/libxml-perl + +# +# $Id: libxml-perl.spec,v 1.4 1999/08/16 16:10:43 kmacleod Exp $ +# + +%description +libxml-perl is a collection of Perl modules for working with XML. + +%prep +%setup + +perl Makefile.PL INSTALLDIRS=perl + +%build + +make + +%install + +make PREFIX="${RPM_ROOT_DIR}/usr" pure_install + +DOCDIR="${RPM_ROOT_DIR}/usr/doc/libxml-perl-0.08-1" +mkdir -p "$DOCDIR/examples" +for ii in PerlSAX.pod UsingPerlSAX.pod interface-style.pod modules.xml; do + cp doc/$ii "$DOCDIR/$ii" + chmod 644 "$DOCDIR/$ii" +done +for ii in README Changes examples/*; do + cp $ii "$DOCDIR/$ii" + chmod 644 "$DOCDIR/$ii" +done + +%files + +/usr/doc/libxml-perl-0.08-1 + +/usr/lib/perl5/Data/Grove.pm +/usr/lib/perl5/Data/Grove/Parent.pm +/usr/lib/perl5/Data/Grove/Visitor.pm +/usr/lib/perl5/XML/ESISParser.pm +/usr/lib/perl5/XML/Handler/CanonXMLWriter.pm +/usr/lib/perl5/XML/Handler/Sample.pm +/usr/lib/perl5/XML/Handler/Subs.pm +/usr/lib/perl5/XML/Handler/XMLWriter.pm +/usr/lib/perl5/XML/SAX2Perl.pm +/usr/lib/perl5/XML/Perl2SAX.pm +/usr/lib/perl5/XML/Parser/PerlSAX.pm +/usr/lib/perl5/XML/PatAct/ActionTempl.pm +/usr/lib/perl5/XML/PatAct/Amsterdam.pm +/usr/lib/perl5/XML/PatAct/MatchName.pm +/usr/lib/perl5/XML/PatAct/PatternTempl.pm +/usr/lib/perl5/XML/PatAct/ToObjects.pm +/usr/lib/perl5/man/man3/Data::Grove.3 +/usr/lib/perl5/man/man3/Data::Grove::Parent.3 +/usr/lib/perl5/man/man3/Data::Grove::Visitor.3 
+/usr/lib/perl5/man/man3/XML::Handler::CanonXMLWriter.3 +/usr/lib/perl5/man/man3/XML::Handler::Sample.3 +/usr/lib/perl5/man/man3/XML::Handler::Subs.3 +/usr/lib/perl5/man/man3/XML::Handler::XMLWriter.3 +/usr/lib/perl5/man/man3/XML::ESISParser.3 +/usr/lib/perl5/man/man3/XML::SAX2Perl.3 +/usr/lib/perl5/man/man3/XML::Perl2SAX.3 +/usr/lib/perl5/man/man3/XML::Parser::PerlSAX.3 +/usr/lib/perl5/man/man3/XML::PatAct::ActionTempl.3 +/usr/lib/perl5/man/man3/XML::PatAct::Amsterdam.3 +/usr/lib/perl5/man/man3/XML::PatAct::MatchName.3 +/usr/lib/perl5/man/man3/XML::PatAct::PatternTempl.3 +/usr/lib/perl5/man/man3/XML::PatAct::ToObjects.3 diff --git a/libxml-perl.spec b/libxml-perl.spec new file mode 100644 index 0000000..f8e1c52 --- /dev/null +++ b/libxml-perl.spec @@ -0,0 +1,78 @@ +Summary: Collection of Perl modules for working with XML +Name: libxml-perl +Version: @VERSION@ +Release: 1 +Source: http://www.perl.com/CPAN/modules/by-module/XML/libxml-perl-@VERSION@.tar.gz +Copyright: Artistic or GPL +Group: Applications/Publishing/XML +URL: http://www.perl.com/ +Packager: ken@bitsko.slc.ut.us (Ken MacLeod) +BuildRoot: /tmp/libxml-perl + +# +# $Id: libxml-perl.spec,v 1.4 1999/08/16 16:10:43 kmacleod Exp $ +# + +%description +libxml-perl is a collection of Perl modules for working with XML. 
+ +%prep +%setup + +perl Makefile.PL INSTALLDIRS=perl + +%build + +make + +%install + +make PREFIX="${RPM_ROOT_DIR}/usr" pure_install + +DOCDIR="${RPM_ROOT_DIR}/usr/doc/libxml-perl-@VERSION@-1" +mkdir -p "$DOCDIR/examples" +for ii in PerlSAX.pod UsingPerlSAX.pod interface-style.pod modules.xml; do + cp doc/$ii "$DOCDIR/$ii" + chmod 644 "$DOCDIR/$ii" +done +for ii in README Changes examples/*; do + cp $ii "$DOCDIR/$ii" + chmod 644 "$DOCDIR/$ii" +done + +%files + +/usr/doc/libxml-perl-@VERSION@-1 + +/usr/lib/perl5/Data/Grove.pm +/usr/lib/perl5/Data/Grove/Parent.pm +/usr/lib/perl5/Data/Grove/Visitor.pm +/usr/lib/perl5/XML/ESISParser.pm +/usr/lib/perl5/XML/Handler/CanonXMLWriter.pm +/usr/lib/perl5/XML/Handler/Sample.pm +/usr/lib/perl5/XML/Handler/Subs.pm +/usr/lib/perl5/XML/Handler/XMLWriter.pm +/usr/lib/perl5/XML/SAX2Perl.pm +/usr/lib/perl5/XML/Perl2SAX.pm +/usr/lib/perl5/XML/Parser/PerlSAX.pm +/usr/lib/perl5/XML/PatAct/ActionTempl.pm +/usr/lib/perl5/XML/PatAct/Amsterdam.pm +/usr/lib/perl5/XML/PatAct/MatchName.pm +/usr/lib/perl5/XML/PatAct/PatternTempl.pm +/usr/lib/perl5/XML/PatAct/ToObjects.pm +/usr/lib/perl5/man/man3/Data::Grove.3 +/usr/lib/perl5/man/man3/Data::Grove::Parent.3 +/usr/lib/perl5/man/man3/Data::Grove::Visitor.3 +/usr/lib/perl5/man/man3/XML::Handler::CanonXMLWriter.3 +/usr/lib/perl5/man/man3/XML::Handler::Sample.3 +/usr/lib/perl5/man/man3/XML::Handler::Subs.3 +/usr/lib/perl5/man/man3/XML::Handler::XMLWriter.3 +/usr/lib/perl5/man/man3/XML::ESISParser.3 +/usr/lib/perl5/man/man3/XML::SAX2Perl.3 +/usr/lib/perl5/man/man3/XML::Perl2SAX.3 +/usr/lib/perl5/man/man3/XML::Parser::PerlSAX.3 +/usr/lib/perl5/man/man3/XML::PatAct::ActionTempl.3 +/usr/lib/perl5/man/man3/XML::PatAct::Amsterdam.3 +/usr/lib/perl5/man/man3/XML::PatAct::MatchName.3 +/usr/lib/perl5/man/man3/XML::PatAct::PatternTempl.3 +/usr/lib/perl5/man/man3/XML::PatAct::ToObjects.3 diff --git a/t/amsterdam.t b/t/amsterdam.t new file mode 100644 index 0000000..e45b482 --- /dev/null +++ b/t/amsterdam.t @@ 
-0,0 +1,48 @@ +# Hey Emacs, this is -*- perl -*- ! +# +# Before `make install' is performed this script should be runnable with +# `make test'. After `make install' it should work as `perl test.pl' +# +# $Id: amsterdam.t,v 1.1 1999/08/28 17:46:57 kmacleod Exp $ +# + +######################### We start with some black magic to print on failure. + +# Change 1..1 below to 1..last_test_to_print . +# (It may become useful if the test is moved to ./t subdirectory.) + +BEGIN { $| = 1; print "1..2\n"; } +END {print "not ok 1\n" unless $loaded;} +use XML::Parser::PerlSAX; +use XML::PatAct::MatchName; +use XML::PatAct::Amsterdam; + + +$loaded = 1; +print "ok 1\n"; + +$patterns = + [ + 'outer' => { Before => "Outer-before, '[attr]'", + After => "Outer-after\n" }, + 'inner' => { Before => "Inner" }, + ]; + +my $matcher = XML::PatAct::MatchName->new( Patterns => $patterns ); +my $handler = XML::PatAct::Amsterdam->new( Patterns => $patterns, + Matcher => $matcher, + AsString => 1 ); +my $parser = XML::Parser::PerlSAX->new( Handler => $handler ); +$string = $parser->parse(Source => { String => <<'EOF;' } ); + + + +EOF; + +$expected = <<"EOF;"; +Outer-before, 'an attr' + Inner +Outer-after +EOF; + +print (($string eq $expected) ? "ok 2\n" : "not ok 2\n"); diff --git a/t/canon_xml_writer.t b/t/canon_xml_writer.t new file mode 100644 index 0000000..7d8e56b --- /dev/null +++ b/t/canon_xml_writer.t @@ -0,0 +1,146 @@ +# Hey Emacs, this is -*- perl -*- ! +# +# Before `make install' is performed this script should be runnable with +# `make test'. After `make install' it should work as `perl test.pl' +# +# $Id: canon_xml_writer.t,v 1.2 1999/08/10 21:42:39 kmacleod Exp $ +# + +######################### We start with some black magic to print on failure. + +# Change 1..1 below to 1..last_test_to_print . +# (It may become useful if the test is moved to ./t subdirectory.) 
+ +BEGIN { $| = 1; print "1..5\n"; } +END {print "not ok 1\n" unless $loaded;} +use XML::Parser::PerlSAX; +use XML::Handler::CanonXMLWriter; + +$loaded = 1; +print "ok 1\n"; + +######################### End of black magic. + +# Insert your test code below (better if it prints "ok 13" +# (correspondingly "not ok 13") depending on the success of chunk 13 +# of the test code): + +my $parser = XML::Parser::PerlSAX->new; + +my $writer = XML::Handler::CanonXMLWriter->new; +if ($writer) { + print "ok 2\n"; +} else { + print "not ok 2\n"; + exit; +} + + +# +# The following XML is copied from XML::Parser by Clark Cooper +# + +# XML string for tests + +my $xmlstring =<<"End_of_XML;"; + + + ]> + + First line in foo + + + 1st line in bar + 2nd line in bar + 3rd line in bar + + + +End_of_XML; + +### +### plain test +### + +$expected_result = <<'End_of_XML;'; + First line in foo 1st line in bar 2nd line in bar 3rd line in bar +End_of_XML; +$expected_result =~ s/\n$//s; + +$canon_xml = $parser->parse( Source => { String => $xmlstring }, + Handler => $writer ); + +if ($canon_xml eq $expected_result) { + print "ok 3\n"; +} else { + warn "---- expected result ----\n"; + warn "$expected_result\n"; + warn "---- actual result ----\n"; + warn "$canon_xml\n"; + print "not ok 3\n"; +} + +### +### Test PrintComments option +### + +$expected_result = <<'End_of_XML;'; + First line in foo 1st line in bar 2nd line in bar 3rd line in bar +End_of_XML; +$expected_result =~ s/\n$//s; + +$writer->{PrintComments} = 1; +$canon_xml = $parser->parse( Source => { String => $xmlstring }, + Handler => $writer ); + +if ($canon_xml eq $expected_result) { + print "ok 4\n"; +} else { + warn "---- expected result ----\n"; + warn "$expected_result\n"; + warn "---- actual result ----\n"; + warn "$canon_xml\n"; + print "not ok 4\n"; +} + +undef $writer->{PrintComments}; + +### +### Test James Clark's XML test suite +### + +$xml_test = (defined $ENV{XMLTEST}) ? 
$ENV{XMLTEST} : "$ENV{HOME}/xmltest"; + +# allow test to skip if directory does not exist and MUST_TEST isn't set +if (!-d $xml_test && !defined($ENV{MUST_TEST})) { + print "ok 5\n"; + exit; +} + +$tested_file = 0; +foreach $file (glob("$xml_test/valid/sa/*.xml")) { + $tested_file = 1; + $canon_xml = $parser->parse( Source => { SystemId => $file }, + Handler => $writer ); + # add the `out' dir to get the corresponding canon xml + ($out_file = $file) =~ s|/([^/]+)$|/out/$1|; + open (CANON, $out_file) + or die "$out_file: $!\n"; + $expected_result = join('', <CANON>); # slurp the reference canonical XML + close (CANON); + if ($canon_xml ne $expected_result) { + warn "---- expected result for $file ----\n"; + warn "$expected_result\n"; + warn "---- actual result ----\n"; + warn "$canon_xml\n"; + $not_ok = 1; + } +} + +if (!$tested_file || $not_ok) { + print "not ok 5\n"; +} else { + print "ok 5\n"; +} diff --git a/t/schema.t b/t/schema.t new file mode 100644 index 0000000..7ee715b --- /dev/null +++ b/t/schema.t @@ -0,0 +1,79 @@ +# Hey Emacs, this is -*- perl -*- ! +# +# Before `make install' is performed this script should be runnable with +# `make test'. After `make install' it should work as `perl test.pl' +# +# $Id: schema.t,v 1.1 1999/08/10 21:42:39 kmacleod Exp $ +# + +######################### We start with some black magic to print on failure. + +# Change 1..1 below to 1..last_test_to_print . +# (It may become useful if the test is moved to ./t subdirectory.) 
+ +BEGIN { $| = 1; print "1..2\n"; } +END {print "not ok 1\n" unless $loaded;} +use XML::Parser::PerlSAX; +use XML::PatAct::MatchName; +use XML::PatAct::ToObjects; + +$loaded = 1; +print "ok 1\n"; + +my $patterns = [ + 'schema' => [ qw{ -holder } ], + 'table' => [ qw{ -make Schema::Table } ], + 'name' => [ qw{ -field Name -as-string } ], + 'summary' => [ qw{ -field Summary -as-string } ], + 'description' => [ qw{ -field Description -as-string } ], + 'column' => [ qw{ -make Schema::Column -push-field Columns } ], + 'unique' => [ qw{ -field Unique -value 1 } ], + 'non-null' => [ qw{ -field NonNull -value 1 } ], + 'default' => [ qw{ -field Default -as-string } ], + ]; + +my $matcher = XML::PatAct::MatchName->new( Patterns => $patterns ); +my $handler = XML::PatAct::ToObjects->new( Patterns => $patterns, + Matcher => $matcher); + +my $parser = XML::Parser::PerlSAX->new( Handler => $handler ); +$schema = $parser->parse(Source => { String => <<'EOF' } ); + + + MyTable + A short summary + A long description that may + contain a subset of HTML + + MyColumn1 + A short summary + A long description + + + 42 + +
+
+EOF + +$not_ok = 0; +$not_ok |= (!defined($schema)) || (ref($schema->[0]) ne 'Schema::Table'); +$not_ok |= (!defined($schema->[0]{Name})) || ($schema->[0]{Name} ne 'MyTable'); +$not_ok |= (!defined($schema->[0]{Summary})) + || ($schema->[0]{Summary} ne 'A short summary'); +$not_ok |= (!defined($schema->[0]{Description})); +$not_ok |= (!defined($schema->[0]{Columns})) + || (ref($schema->[0]{Columns}[0]) ne 'Schema::Column'); +$not_ok |= (!defined($schema->[0]{Columns}[0]{Name})) + || ($schema->[0]{Columns}[0]{Name} ne 'MyColumn1'); +$not_ok |= (!defined($schema->[0]{Columns}[0]{Summary})) + || ($schema->[0]{Columns}[0]{Summary} ne 'A short summary'); +$not_ok |= !defined($schema->[0]{Columns}[0]{Description}); +$not_ok |= (!defined($schema->[0]{Columns}[0]{Unique})) + || ($schema->[0]{Columns}[0]{Unique} != 1); +$not_ok |= (!defined($schema->[0]{Columns}[0]{NonNull})) + || ($schema->[0]{Columns}[0]{NonNull} != 1); +$not_ok |= (!defined($schema->[0]{Columns}[0]{Default})) + || ($schema->[0]{Columns}[0]{Default} != 42); + +print $not_ok ? "not ok 2\n" : "ok 2\n"; diff --git a/t/stream.t b/t/stream.t new file mode 100644 index 0000000..edd17b7 --- /dev/null +++ b/t/stream.t @@ -0,0 +1,104 @@ +# Hey Emacs, this is -*- perl -*- ! +# +# Before `make install' is performed this script should be runnable with +# `make test'. After `make install' it should work as `perl test.pl' +# +# $Id: stream.t,v 1.2 2003/10/21 16:01:54 kmacleod Exp $ +# + +######################### We start with some black magic to print on failure. + +# Change 1..1 below to 1..last_test_to_print . +# (It may become useful if the test is moved to ./t subdirectory.) 
+ +BEGIN { $| = 1; print "1..11\n"; } +END {print "not ok 1\n" unless $loaded;} +use XML::Parser::PerlSAX; +use XML::Handler::XMLWriter; + + +$loaded = 1; +print "ok 1\n"; + +my $subs = MySubs->new( AsString => 1 ); +my $parser = XML::Parser::PerlSAX->new( Handler => $subs ); +$string = $parser->parse(Source => { Encoding => 'ISO-8859-1', + String => <<"EOF;" } ); + + + + + ]> + + First line in foo + Fran is &fran; and Zoe is &zoe; + + + 1st line in bar + 2nd line in bar + 3rd line in bar + + + This, '\240', would be a bad character in UTF-8. + +EOF; + +foreach $test (2..10) { + print $subs->{Tests}[$test] ? "ok $test\n" : "not ok $test\n" ; +} + +$expected = <<"EOF;"; + + + First line in foo + Fran is fran-def and Zoe is zoe.ent + + + 1st line in bar + 2nd line in bar + 3rd line in bar + + + This, '\240', would be a bad character in UTF-8. + +EOF; + +print (($string eq $expected) ? "ok 11\n" : "not ok 11\n"); + +package MySubs; +use vars qw{ @ISA }; +BEGIN { @ISA = qw{ XML::Handler::XMLWriter }; }; + +sub s_zap { + my ($self, $element) = @_; + + $self->{Tests}[2] = 1; # we got here + $self->{Tests}[3] = 1 + if $element->{Name} eq 'zap'; + $self->{Tests}[4] = 1 + if $element->{Name} eq $self->{Names}[-1]; + $self->{Tests}[5] = 1 + if $element == $self->{Nodes}[-1]; + $self->{Tests}[6] = 1 + if $#{$self->{Names}} == 1; + $self->{Tests}[7] = 1 + if $#{$self->{Nodes}} == 1; + + $element->{Attributes}{'fubar'} = 1; + + $self->print_start_element($element); +} + +sub e_zap { + my ($self, $element) = @_; + + $self->{Tests}[8] = 1; # we got here + $self->{Tests}[9] = 1 + if $self->in_element('zap'); + $self->{Tests}[10] = 1 + if $self->within_element('zap') == 1; + + $self->print_end_element($element); +} diff --git a/t/subs.t b/t/subs.t new file mode 100644 index 0000000..c7eda76 --- /dev/null +++ b/t/subs.t @@ -0,0 +1,63 @@ +# Hey Emacs, this is -*- perl -*- ! +# +# Before `make install' is performed this script should be runnable with +# `make test'. 
After `make install' it should work as `perl test.pl' +# +# $Id: subs.t,v 1.1 1999/08/16 16:04:03 kmacleod Exp $ +# + +######################### We start with some black magic to print on failure. + +# Change 1..1 below to 1..last_test_to_print . +# (It may become useful if the test is moved to ./t subdirectory.) + +BEGIN { $| = 1; print "1..10\n"; } +END {print "not ok 1\n" unless $loaded;} +use XML::Parser::PerlSAX; +use XML::Handler::Subs; + + +$loaded = 1; +print "ok 1\n"; + +my $subs = MySubs->new( ); +my $parser = XML::Parser::PerlSAX->new( Handler => $subs ); +$parser->parse(Source => { String => <<'EOF' } ); + + + +EOF + +foreach $test (2..10) { + print $subs->{Tests}[$test] ? "ok $test\n" : "not ok $test\n" ; +} + +package MySubs; +use vars qw{ @ISA }; +BEGIN { @ISA = qw{ XML::Handler::Subs }; }; + +sub s_foo__it { + my ($self, $element) = @_; + + $self->{Tests}[2] = 1; # we got here + $self->{Tests}[3] = 1 + if $element->{Name} eq 'foo:-it'; + $self->{Tests}[4] = 1 + if $element->{Name} eq $self->{Names}[-1]; + $self->{Tests}[5] = 1 + if $element == $self->{Nodes}[-1]; + $self->{Tests}[6] = 1 + if $#{$self->{Names}} == 0; + $self->{Tests}[7] = 1 + if $#{$self->{Nodes}} == 0; +} + +sub e_foo__it { + my ($self, $element) = @_; + + $self->{Tests}[8] = 1; # we got here + $self->{Tests}[9] = 1 + if $self->in_element('foo:-it'); + $self->{Tests}[10] = 1 + if $self->within_element('foo:-it') == 1; +} diff --git a/t/xp_sax.t b/t/xp_sax.t new file mode 100644 index 0000000..a9e72ad --- /dev/null +++ b/t/xp_sax.t @@ -0,0 +1,225 @@ +# Hey Emacs, this is -*- perl -*- ! +# +# Before `make install' is performed this script should be runnable with +# `make test'. After `make install' it should work as `perl test.pl' +# +# $Id: xp_sax.t,v 1.4 1999/09/10 00:30:12 kmacleod Exp $ +# + +######################### We start with some black magic to print on failure. + +# Change 1..1 below to 1..last_test_to_print . 
+# (It may become useful if the test is moved to ./t subdirectory.) + +BEGIN { $| = 1; print "1..15\n"; } +END {print "not ok 1\n" unless $loaded;} +use XML::Parser::PerlSAX; + +$loaded = 1; +print "ok 1\n"; + +######################### End of black magic. + +# Insert your test code below (better if it prints "ok 13" +# (correspondingly "not ok 13") depending on the success of chunk 13 +# of the test code): + +# Test Plan: +# +# * done; standard loading test +# * not done; parse a document with data for all events +# * not done; check all properties returned from events +# * not done; check location + +# +# The following is copied from XML::Parser by Clark Cooper +# +open(ZOE, '>zoe.ent'); +print ZOE "'cute'"; +close(ZOE); + +# XML string for tests + +my $xmlstring =<<"End_of_XML;"; + + + + + ]> + + First line in foo + Fran is &fran; and Zoe is &zoe; + + + 1st line in bar + 2nd line in bar + 3rd line in bar + + + This, '\240', would be a bad character in UTF-8. + + +End_of_XML; + +# Handlers +my @tests; +my $pos =''; + +my $parser = XML::Parser::PerlSAX->new; +if ($parser) { + print "ok 2\n"; +} else { + print "not ok 2\n"; + exit; +} + +# Tests 4..15 +eval { + $parser->parse( Source => { String => $xmlstring, + Encoding => 'ISO-8859-1' }, + Handler => TestHandler->new( Tests => \@tests ) ); +}; +warn $@ if $@; + +if ($@) { + print "Parse error:\n$@"; +} else { + $tests[3] ++; +} + +unlink('zoe.ent') if (-f 'zoe.ent'); + +$xmlstring = <<'EOF;'; + +]> +&anEntRef; +EOF; + +eval { +$parser->parse( Source => { String => $xmlstring }, + Handler => NoEntRefsHandler->new( Tests => \@tests ) ); +}; +warn $@ if $@; + +eval { +$parser->parse( Source => { String => $xmlstring }, + Handler => EntRefsHandler->new( Tests => \@tests ) ); +}; +warn $@ if $@; + +for (3 .. 
15) +{ + print "not " unless $tests[$_]; + print "ok $_\n"; +} + +exit; + +package TestHandler; + +sub new { + my $type = shift; + return bless { @_ }, $type; +} + +sub characters { + my $self = shift; + $self->{Tests}[4] ++; +} + +sub start_element { + my $self = shift; + $self->{Tests}[5] ++; +} + +sub end_element { + my $self = shift; + $self->{Tests}[6] ++; +} + +sub processing_instruction { + my $self = shift; + $self->{Tests}[7] ++; +} + +sub notation_decl { + my $self = shift; + $self->{Tests}[8] ++; +} + +sub unparsed_entity_decl { + my $self = shift; + $self->{Tests}[9] ++; +} + +sub start_cdata { + my $self = shift; + $self->{Tests}[12] ++; +} + +sub end_cdata { + my $self = shift; + $self->{Tests}[13] ++; +} + +sub resolve_entity { + my $self = shift; + my $entity = shift; + + if ($entity->{SystemId} eq 'fran-def') { + $self->{Tests}[10] ++; + return { String => 'pretty' }; + } elsif ($entity->{SystemId} eq 'zoe.ent') { + $self->{Tests}[11] ++; + local(*FOO); + open(FOO, $entity->{SystemId}) or die "Couldn't open $entity->{SystemId}"; + return { ByteStream => *FOO }; + } +} + +package NoEntRefsHandler; + +sub new { + my $type = shift; + return bless { @_ }, $type; +} + +sub characters { + my $self = shift; + my $characters = shift; + + if ($characters->{Data} eq 'The Ent Ref') { + $self->{Tests}[14] ++; + } +} + +package EntRefsHandler; + +sub new { + my $type = shift; + return bless { @_ }, $type; +} + +sub characters { + my $self = shift; + my $characters = shift; + + if ($characters->{Data} eq 'The Ent Ref') { + die "shouldn't have made it here"; + } +} + +sub entity_reference { + my $self = shift; + my $ent_ref = shift; + + if (($ent_ref->{Name} eq 'anEntRef') + && ($ent_ref->{Value} eq 'The Ent Ref')) { + $self->{Tests}[15] ++; + } +}