From 4513e9bce04e67ed679d8d18fe187b921e899a62 Mon Sep 17 00:00:00 2001 From: Packit Date: Sep 16 2020 12:35:11 +0000 Subject: perl-HTML-Tagset-3.20 base --- diff --git a/Changes b/Changes new file mode 100644 index 0000000..563c4ab --- /dev/null +++ b/Changes @@ -0,0 +1,32 @@ +Revision history for Perl extension HTML::Tagset + +2008-02-29 Andy Lester + + * Release 3.20 -- Added
<div> to the list of p_closure_barriers. + +2005-11-08 Andy Lester + + * Release 3.10 -- Another rebundling, with a new maintainer. + +2004-12-29 Sean M. Burke + + * Release 3.04 -- just a rebundling; no actual code changes + +2000-10-20 Sean M. Burke + + * Release 3.03 + Added 'tr' => ['background'] at the suggestion of Renzo Toma + (renzo.toma@veronica.nl), who notes: "Netscape Communicator 4.73 + (probably even earlier) supports it, Internet Explorer 5.5 does + not (yet)." + +2000-09-04 Sean M. Burke + + * Release 3.02 + Added %is_Possible_Strict_P_Content. + +2000-08-21 Sean M. Burke + + * Release 3.01 -- first release. + Data tables copied from previous version of HTML::Element + and HTML::TreeBuilder, with some additions. diff --git a/MANIFEST b/MANIFEST new file mode 100644 index 0000000..7eeb29e --- /dev/null +++ b/MANIFEST @@ -0,0 +1,10 @@ +Changes +Tagset.pm +Makefile.PL +MANIFEST +MANIFEST.SKIP +README +t/00_about_verbose.t +t/01_old_junk.t +t/pod.t +META.yml Module meta-data (added by MakeMaker) diff --git a/MANIFEST.SKIP b/MANIFEST.SKIP new file mode 100644 index 0000000..b1dac19 --- /dev/null +++ b/MANIFEST.SKIP @@ -0,0 +1,7 @@ +^MANIFEST\.bak$ +Makefile(\.old)?$ +\.rej$ +CVS +blib +~ + diff --git a/META.yml b/META.yml new file mode 100644 index 0000000..b121d06 --- /dev/null +++ b/META.yml @@ -0,0 +1,13 @@ +--- #YAML:1.0 +name: HTML-Tagset +version: 3.20 +abstract: data tables useful in parsing HTML +license: ~ +author: + - Andy Lester +generated_by: ExtUtils::MakeMaker version 6.42 +distribution_type: module +requires: +meta-spec: + url: http://module-build.sourceforge.net/META-spec-v1.3.html + version: 1.3 diff --git a/Makefile.PL b/Makefile.PL new file mode 100644 index 0000000..cd6fe85 --- /dev/null +++ b/Makefile.PL @@ -0,0 +1,19 @@ +# This -*-perl-*- program writes the Makefile for installing this distribution. +# +# See "perldoc perlmodinstall" or "perldoc ExtUtils::MakeMaker" for +# info on how to control how the installation goes.
+ +require 5.004; +use strict; +use ExtUtils::MakeMaker; + +WriteMakefile( + NAME => 'HTML::Tagset', + AUTHOR => 'Andy Lester ', + VERSION_FROM => 'Tagset.pm', # finds $VERSION + ABSTRACT_FROM => 'Tagset.pm', # retrieve abstract from module + PMLIBDIRS => [qw(lib/)], + dist => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', }, + clean => { FILES => 'HTML-Tagset-*' }, + +); diff --git a/README b/README new file mode 100644 index 0000000..efb8ce3 --- /dev/null +++ b/README @@ -0,0 +1,66 @@ +README for HTML::Tagset + + HTML::Tagset + +This module contains data tables useful in dealing with HTML. + +It provides no functions or methods. + + + +PREREQUISITES + +This suite requires Perl 5. + +HTML::Tagset doesn't use any nonstandard modules. + + +INSTALLATION + +You install HTML::Tagset, as you would install any perl module +library, by running these commands: + + perl Makefile.PL + make + make test + make install + +If you want to install a private copy of HTML::Tagset in your home +directory, then you should try to produce the initial Makefile with +something like this command: + + perl Makefile.PL LIB=~/perl + + +DOCUMENTATION + +POD-format documentation is included in Tagset.pm. POD is readable +with the 'perldoc' utility. See Changes for recent changes. + + +MACPERL INSTALLATION NOTES + +Don't bother with the makefiles. Just make a HTML directory in your +MacPerl site_lib or lib directory, and move Tagset.pm into there. + + +SUPPORT + +Questions, bug reports, and suggestions for HTML::Tagset should just +be sent to me at sburke@cpan.org + + +AVAILABILITY + +The latest version of HTML::Tagset is available from the +Comprehensive Perl Archive Network (CPAN). Visit + to find a CPAN site near you. + + +COPYRIGHT + +Copyright 1999,2000 Sean M. Burke ; Copyright +1995-2000 Gisle Aas; all rights reserved. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself.
diff --git a/Tagset.pm b/Tagset.pm new file mode 100644 index 0000000..754137f --- /dev/null +++ b/Tagset.pm @@ -0,0 +1,471 @@ +package HTML::Tagset; + +use strict; + +=head1 NAME + +HTML::Tagset - data tables useful in parsing HTML + +=head1 VERSION + +Version 3.20 + +=cut + +use vars qw( $VERSION ); + +$VERSION = '3.20'; + +=head1 SYNOPSIS + + use HTML::Tagset; + # Then use any of the items in the HTML::Tagset package + # as need arises + +=head1 DESCRIPTION + +This module contains several data tables useful in various kinds of +HTML parsing operations. + +Note that all tag names used are lowercase. + +In the following documentation, a "hashset" is a hash being used as a +set -- the hash conveys that its keys are there, and the actual values +associated with the keys are not significant. (But what values are +there, are always true.) + +=cut + +use vars qw( + $VERSION + %emptyElement %optionalEndTag %linkElements %boolean_attr + %isHeadElement %isBodyElement %isPhraseMarkup + %is_Possible_Strict_P_Content + %isHeadOrBodyElement + %isList %isTableElement %isFormElement + %isKnown %canTighten + @p_closure_barriers + %isCDATA_Parent +); + +=head1 VARIABLES + +Note that none of these variables are exported. + +=head2 hashset %HTML::Tagset::emptyElement + +This hashset has as keys the tag-names (GIs) of elements that cannot +have content. (For example, "base", "br", "hr".) So +C<$HTML::Tagset::emptyElement{'hr'}> exists and is true. +C<$HTML::Tagset::emptyElement{'dl'}> does not exist, and so is not true. + +=cut + +%emptyElement = map {; $_ => 1 } qw(base link meta isindex + img br hr wbr + input area param + embed bgsound spacer + basefont col frame + ~comment ~literal + ~declaration ~pi + ); +# The "~"-initial names are for pseudo-elements used by HTML::Element +# and TreeBuilder + +=head2 hashset %HTML::Tagset::optionalEndTag + +This hashset lists tag-names for elements that can have content, but whose +end-tags are generally, "safely", omissible.
Example: +C<$HTML::Tagset::optionalEndTag{'li'}> exists and is true. + +=cut + +%optionalEndTag = map {; $_ => 1 } qw(p li dt dd); # option th tr td); + +=head2 hash %HTML::Tagset::linkElements + +Keys in this hash are tagnames for elements that might contain +links, and the value for each is a reference to an array of the names +of attributes whose values can be links. + +=cut + +%linkElements = +( + 'a' => ['href'], + 'applet' => ['archive', 'codebase', 'code'], + 'area' => ['href'], + 'base' => ['href'], + 'bgsound' => ['src'], + 'blockquote' => ['cite'], + 'body' => ['background'], + 'del' => ['cite'], + 'embed' => ['pluginspage', 'src'], + 'form' => ['action'], + 'frame' => ['src', 'longdesc'], + 'iframe' => ['src', 'longdesc'], + 'ilayer' => ['background'], + 'img' => ['src', 'lowsrc', 'longdesc', 'usemap'], + 'input' => ['src', 'usemap'], + 'ins' => ['cite'], + 'isindex' => ['action'], + 'head' => ['profile'], + 'layer' => ['background', 'src'], + 'link' => ['href'], + 'object' => ['classid', 'codebase', 'data', 'archive', 'usemap'], + 'q' => ['cite'], + 'script' => ['src', 'for'], + 'table' => ['background'], + 'td' => ['background'], + 'th' => ['background'], + 'tr' => ['background'], + 'xmp' => ['href'], +); + +=head2 hash %HTML::Tagset::boolean_attr + +This hash (not hashset) lists what attributes of what elements can be +printed without showing the value (for example, the "noshade" attribute +of "hr" elements). For elements with only one such attribute, its value +is simply that attribute name. For elements with many such attributes, +the value is a reference to a hashset containing all such attributes.
+ +=cut + +%boolean_attr = ( +# TODO: make these all hashes + 'area' => 'nohref', + 'dir' => 'compact', + 'dl' => 'compact', + 'hr' => 'noshade', + 'img' => 'ismap', + 'input' => { 'checked' => 1, 'readonly' => 1, 'disabled' => 1 }, + 'menu' => 'compact', + 'ol' => 'compact', + 'option' => 'selected', + 'select' => 'multiple', + 'td' => 'nowrap', + 'th' => 'nowrap', + 'ul' => 'compact', +); + +#========================================================================== +# List of all elements from Extensible HTML version 1.0 Transitional DTD: +# +# a abbr acronym address applet area b base basefont bdo big +# blockquote body br button caption center cite code col colgroup +# dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6 +# head hr html i iframe img input ins isindex kbd label legend li +# link map menu meta noframes noscript object ol optgroup option p +# param pre q s samp script select small span strike strong style +# sub sup table tbody td textarea tfoot th thead title tr tt u ul +# var +# +# Varia from Mozilla source internal table of tags: +# Implemented: +# xmp listing wbr nobr frame frameset noframes ilayer +# layer nolayer spacer embed multicol +# But these are unimplemented: +# sound?? keygen?? server?? +# Also seen here and there: +# marquee?? app?? (both unimplemented) +#========================================================================== + +=head2 hashset %HTML::Tagset::isPhraseMarkup + +This hashset contains all phrasal-level elements. + +=cut + +%isPhraseMarkup = map {; $_ => 1 } qw( + span abbr acronym q sub sup + cite code em kbd samp strong var dfn strike + b i u s tt small big + a img br + wbr nobr blink + font basefont bdo + spacer embed noembed +); # had: center, hr, table + + +=head2 hashset %HTML::Tagset::is_Possible_Strict_P_Content + +This hashset contains all phrasal-level elements that can be content of a +P element, for a strict model of HTML.
+ +=cut + +# Populated here, ahead of its POD entry below, because the next table copies it. +%isFormElement = map {; $_ => 1 } qw(input select option optgroup textarea button label); + +%is_Possible_Strict_P_Content = ( + %isPhraseMarkup, + %isFormElement, + map {; $_ => 1} qw( object script map ) + # I've no idea why there's these latter exceptions. + # I'm just following the HTML4.01 DTD. +); + +#from html4 strict: +# +# +# +# +# +# +# +# +# +# + +=head2 hashset %HTML::Tagset::isHeadElement + +This hashset contains all elements that should be +present only in the 'head' element of an HTML document. + +=cut + +%isHeadElement = map {; $_ => 1 } + qw(title base link meta isindex script style object bgsound); + +=head2 hashset %HTML::Tagset::isList + +This hashset contains all elements that can contain "li" elements. + +=cut + +%isList = map {; $_ => 1 } qw(ul ol dir menu); + +=head2 hashset %HTML::Tagset::isTableElement + +This hashset contains all elements that are to be found only in/under +a "table" element. + +=cut + +%isTableElement = map {; $_ => 1 } + qw(tr td th thead tbody tfoot caption col colgroup); + +=head2 hashset %HTML::Tagset::isFormElement + +This hashset contains all elements that are to be found only in/under +a "form" element. + +=cut + +=head2 hashset %HTML::Tagset::isBodyElement + +This hashset contains all elements that are to be found only in/under +the "body" element of an HTML document. + +=cut + +%isBodyElement = map {; $_ => 1 } qw( + h1 h2 h3 h4 h5 h6 + p div pre plaintext address blockquote + xmp listing + center + + multicol + iframe ilayer nolayer + bgsound + + hr + ol ul dir menu li + dl dt dd + ins del + + fieldset legend + + map area + applet param object + isindex script noscript + table + center + form + ), + keys %isFormElement, + keys %isPhraseMarkup, # And everything phrasal + keys %isTableElement, +; + + +=head2 hashset %HTML::Tagset::isHeadOrBodyElement + +This hashset includes all elements that I notice can fall either in +the head or in the body.
+ +=cut + +%isHeadOrBodyElement = map {; $_ => 1 } + qw(script isindex style object map area param noscript bgsound); + # i.e., if we find 'script' in the 'body' or the 'head', don't freak out. + + +=head2 hashset %HTML::Tagset::isKnown + +This hashset lists all known HTML elements. + +=cut + +%isKnown = (%isHeadElement, %isBodyElement, + map{; $_=>1 } + qw( head body html + frame frameset noframes + ~comment ~pi ~directive ~literal +)); + # that should be all known tags ever ever + + +=head2 hashset %HTML::Tagset::canTighten + +This hashset lists elements that might have ignorable whitespace as +children or siblings. + +=cut + +%canTighten = %isKnown; +delete @canTighten{ + keys(%isPhraseMarkup), 'input', 'select', + 'xmp', 'listing', 'plaintext', 'pre', +}; + # xmp, listing, plaintext, and pre are untightenable, and + # in a really special way. +@canTighten{'hr','br'} = (1,1); + # exceptional 'phrasal' things that ARE subject to tightening. + +# The one case where I can think of my tightening rules failing is: +#

foo bar

baz quux ... +# ^-- that would get deleted. +# But that's pretty gruesome code anyhow. You gets what you pays for. + +#========================================================================== + +=head2 array @HTML::Tagset::p_closure_barriers + +This array has a meaning that I have only seen a need for in +C, but I include it here on the off chance that someone +might find it of use: + +When we see a "E<lt>pE<gt>" token, we go look up the lineage for a p +element we might have to minimize. At first sight, we might say that +if there's a p anywhere in the lineage of this new p, it should be +closed. But that's wrong. Consider this document: + + + + foo + + +

foo + + + + +
+ foo +

bar +

+

+ + + +The second p is quite legally inside a much higher p. + +My formalization of the reason why this is legal, but this: + +

foo

bar

+ +isn't, is that something about the table constitutes a "barrier" to +the application of the rule about what p must minimize. + +So C<@HTML::Tagset::p_closure_barriers> is the list of all such +barrier-tags. + +=cut + +@p_closure_barriers = qw( + li blockquote + ul ol menu dir + dl dt dd + td th tr table caption + div + ); + +# In an ideal world (i.e., XHTML) we wouldn't have to bother with any of this +# monkey business of barriers to minimization! + +=head2 hashset %HTML::Tagset::isCDATA_Parent + +This hashset includes all elements whose content is CDATA. + +=cut + +%isCDATA_Parent = map {; $_ => 1 } + qw(script style xmp listing plaintext); + +# TODO: there's nothing else that takes CDATA children, right? + +# As the HTML3 DTD (Raggett 1995-04-24) noted: +# The XMP, LISTING and PLAINTEXT tags are incompatible with SGML +# and derive from very early versions of HTML. They require non- +# standard parsers and will cause problems for processing +# documents with standard SGML tools. + + +=head1 CAVEATS + +You may find it useful to alter the behavior of modules (like +C or C) that use C's +data tables by altering the data tables themselves. You are welcome +to try, but be careful; and be aware that different modules may or may not +react differently to the data tables being changed. + +Note that it may be inappropriate to use these tables for I +HTML -- for example, C<%isHeadOrBodyElement> lists the tagnames +for all elements that can appear either in the head or in the body, +such as "script". That doesn't mean that I am saying your code that +produces HTML should feel free to put script elements in either place! +If you are producing programs that spit out HTML, you should be +I familiar with the DTDs for HTML or XHTML (available at +C), and you should slavishly obey them, not +the data tables in this document. + +=head1 SEE ALSO + +L, L, L + +=head1 COPYRIGHT & LICENSE + +Copyright 1995-2000 Gisle Aas. + +Copyright 2000-2005 Sean M. Burke. + +Copyright 2005-2008 Andy Lester.
+ +This program is free software; you can redistribute it and/or modify it +under the same terms as Perl itself. + +=head1 ACKNOWLEDGEMENTS + +Most of the code/data in this module was adapted from code written +by Gisle Aas for C, C, and +C. Then it was maintained by Sean M. Burke. + +=head1 AUTHOR + +Current maintainer: Andy Lester, C<< >> + +=head1 BUGS + +Please report any bugs or feature requests to +C, or through the web interface at +L. I will +be notified, and then you'll automatically be notified of progress on +your bug as I make changes. + +=cut + +1; diff --git a/t/00_about_verbose.t b/t/00_about_verbose.t new file mode 100644 index 0000000..3278b58 --- /dev/null +++ b/t/00_about_verbose.t @@ -0,0 +1,85 @@ + +require 5; +# Time-stamp: "2004-12-29 20:55:15 AST" +# Summary of, well, things. + +use Test; +BEGIN {plan tests => 2}; +ok 1; + +use HTML::Tagset (); + +#chdir "t" if -e "t"; + +{ + my @out; + push @out, + "\n\nPerl v", + defined($^V) ? sprintf('%vd', $^V) : $], + " under $^O ", + (defined(&Win32::BuildNumber) and defined &Win32::BuildNumber()) + ? ("(Win32::BuildNumber ", &Win32::BuildNumber(), ")") : (), + (defined $MacPerl::Version) + ? ("(MacPerl version $MacPerl::Version)") : (), + "\n" + ; + + # Ugly code to walk the symbol tables: + my %v; + my @stack = (''); # start out in %:: + my $this; + my $count = 0; + my $pref; + while(@stack) { + $this = shift @stack; + die "Too many packages?" if ++$count > 1000; + next if exists $v{$this}; + next if $this eq 'main'; # %main:: is %:: + + #print "Peeking at $this => ${$this . '::VERSION'}\n"; + + if(defined ${$this . '::VERSION'} ) { + $v{$this} = ${$this . '::VERSION'} + } elsif( + defined *{$this . '::ISA'} or defined &{$this . '::import'} + or ($this ne '' and grep defined *{$_}{'CODE'}, values %{$this . "::"}) + # If it has an ISA, an import, or any subs... + ) { + # It's a class/module with no version. + $v{$this} = undef; + } else { + # It's probably an unpopulated package. 
+ ## $v{$this} = '...'; + } + + $pref = length($this) ? "$this\::" : ''; + push @stack, map m/^(.+)::$/ ? "$pref$1" : (), keys %{$this . '::'}; + #print "Stack: @stack\n"; + } + push @out, " Modules in memory:\n"; + delete @v{'', '[none]'}; + foreach my $p (sort {lc($a) cmp lc($b)} keys %v) { + $indent = ' ' x (2 + ($p =~ tr/:/:/)); + push @out, ' ', $indent, $p, defined($v{$p}) ? " v$v{$p};\n" : ";\n"; + } + push @out, sprintf "[at %s (local) / %s (GMT)]\n", + scalar(localtime), scalar(gmtime); # argument order must match the (local) / (GMT) labels + my $x = join '', @out; + $x =~ s/^/#/mg; + print $x; +} + +print "# Running", + (chr(65) eq 'A') ? " in an ASCII world.\n" : " in a non-ASCII world.\n", + "#\n", +; + +print "# \@INC:\n", map("# [$_]\n", @INC), "#\n#\n"; + +print "# \%INC:\n"; +foreach my $x (sort {lc($a) cmp lc($b)} keys %INC) { + print "# [$x] = [", $INC{$x} || '', "]\n"; +} + +ok 1; + diff --git a/t/01_old_junk.t b/t/01_old_junk.t new file mode 100644 index 0000000..a09080f --- /dev/null +++ b/t/01_old_junk.t @@ -0,0 +1,8 @@ + +# Time-stamp: "2004-12-29 18:49:45 AST" + +BEGIN { $| = 1; print "1..1\n"; } +END {print "not ok 1\n" unless $loaded;} +use HTML::Tagset; +$loaded = 1; +print "ok 1\n"; diff --git a/t/pod.t b/t/pod.t new file mode 100644 index 0000000..f3cc58b --- /dev/null +++ b/t/pod.t @@ -0,0 +1,6 @@ +#!perl -Tw + +use Test::More; +eval "use Test::Pod 1.14"; +plan skip_all => "Test::Pod 1.14 required for testing POD" if $@; +all_pod_files_ok();