Blame lib/Encode/Alias.pm

Packit d0f5c2
package Encode::Alias;
Packit d0f5c2
use strict;
Packit d0f5c2
use warnings;
Packit d0f5c2
our $VERSION = do { my @r = ( q$Revision: 2.24 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
Packit d0f5c2
use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
Packit d0f5c2
Packit d0f5c2
use Exporter 'import';
Packit d0f5c2
Packit d0f5c2
# Public, encouraged API is exported by default
Packit d0f5c2
Packit d0f5c2
our @EXPORT =
Packit d0f5c2
  qw (
Packit d0f5c2
  define_alias
Packit d0f5c2
  find_alias
Packit d0f5c2
);
Packit d0f5c2
Packit d0f5c2
our @Alias;    # ordered matching list
Packit d0f5c2
our %Alias;    # cached known aliases
Packit d0f5c2
Packit d0f5c2
sub find_alias {
Packit d0f5c2
    my $class = shift;
Packit d0f5c2
    my $find  = shift;
Packit d0f5c2
    unless ( exists $Alias{$find} ) {
Packit d0f5c2
        $Alias{$find} = undef;    # Recursion guard
Packit d0f5c2
        for ( my $i = 0 ; $i < @Alias ; $i += 2 ) {
Packit d0f5c2
            my $alias = $Alias[$i];
Packit d0f5c2
            my $val   = $Alias[ $i + 1 ];
Packit d0f5c2
            my $new;
Packit d0f5c2
            if ( ref($alias) eq 'Regexp' && $find =~ $alias ) {
Packit d0f5c2
                DEBUG and warn "eval $val";
Packit d0f5c2
                $new = eval $val;
Packit d0f5c2
                DEBUG and $@ and warn "$val, $@";
Packit d0f5c2
            }
Packit d0f5c2
            elsif ( ref($alias) eq 'CODE' ) {
Packit d0f5c2
                DEBUG and warn "$alias", "->", "($find)";
Packit d0f5c2
                $new = $alias->($find);
Packit d0f5c2
            }
Packit d0f5c2
            elsif ( lc($find) eq lc($alias) ) {
Packit d0f5c2
                $new = $val;
Packit d0f5c2
            }
Packit d0f5c2
            if ( defined($new) ) {
Packit d0f5c2
                next if $new eq $find;    # avoid (direct) recursion on bugs
Packit d0f5c2
                DEBUG and warn "$alias, $new";
Packit d0f5c2
                my $enc =
Packit d0f5c2
                  ( ref($new) ) ? $new : Encode::find_encoding($new);
Packit d0f5c2
                if ($enc) {
Packit d0f5c2
                    $Alias{$find} = $enc;
Packit d0f5c2
                    last;
Packit d0f5c2
                }
Packit d0f5c2
            }
Packit d0f5c2
        }
Packit d0f5c2
Packit d0f5c2
        # case insensitive search when canonical is not in all lowercase
Packit d0f5c2
        # RT ticket #7835
Packit d0f5c2
        unless ( $Alias{$find} ) {
Packit d0f5c2
            my $lcfind = lc($find);
Packit d0f5c2
            for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule )
Packit d0f5c2
            {
Packit d0f5c2
                $lcfind eq lc($name) or next;
Packit d0f5c2
                $Alias{$find} = Encode::find_encoding($name);
Packit d0f5c2
                DEBUG and warn "$find => $name";
Packit d0f5c2
            }
Packit d0f5c2
        }
Packit d0f5c2
    }
Packit d0f5c2
    if (DEBUG) {
Packit d0f5c2
        my $name;
Packit d0f5c2
        if ( my $e = $Alias{$find} ) {
Packit d0f5c2
            $name = $e->name;
Packit d0f5c2
        }
Packit d0f5c2
        else {
Packit d0f5c2
            $name = "";
Packit d0f5c2
        }
Packit d0f5c2
        warn "find_alias($class, $find)->name = $name";
Packit d0f5c2
    }
Packit d0f5c2
    return $Alias{$find};
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub define_alias {
Packit d0f5c2
    while (@_) {
Packit d0f5c2
        my $alias = shift;
Packit d0f5c2
        my $name = shift;
Packit d0f5c2
        unshift( @Alias, $alias => $name )    # newer one has precedence
Packit d0f5c2
            if defined $alias;
Packit d0f5c2
        if ( ref($alias) ) {
Packit d0f5c2
Packit d0f5c2
            # clear %Alias cache to allow overrides
Packit d0f5c2
            my @a = keys %Alias;
Packit d0f5c2
            for my $k (@a) {
Packit d0f5c2
                if ( ref($alias) eq 'Regexp' && $k =~ $alias ) {
Packit d0f5c2
                    DEBUG and warn "delete \$Alias\{$k\}";
Packit d0f5c2
                    delete $Alias{$k};
Packit d0f5c2
                }
Packit d0f5c2
                elsif ( ref($alias) eq 'CODE' && $alias->($k) ) {
Packit d0f5c2
                    DEBUG and warn "delete \$Alias\{$k\}";
Packit d0f5c2
                    delete $Alias{$k};
Packit d0f5c2
                }
Packit d0f5c2
            }
Packit d0f5c2
        }
Packit d0f5c2
        elsif (defined $alias) {
Packit d0f5c2
            DEBUG and warn "delete \$Alias\{$alias\}";
Packit d0f5c2
            delete $Alias{$alias};
Packit d0f5c2
        }
Packit d0f5c2
        elsif (DEBUG) {
Packit d0f5c2
            require Carp;
Packit d0f5c2
            Carp::croak("undef \$alias");
Packit d0f5c2
        }
Packit d0f5c2
    }
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
# HACK: Encode must be used after define_alias is declarated as Encode calls define_alias
Packit d0f5c2
use Encode ();
Packit d0f5c2
Packit d0f5c2
# Allow latin-1 style names as well
Packit d0f5c2
# 0  1  2  3  4  5   6   7   8   9  10
Packit d0f5c2
our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
Packit d0f5c2
Packit d0f5c2
# Allow winlatin1 style names as well
Packit d0f5c2
our %Winlatin2cp = (
Packit d0f5c2
    'latin1'     => 1252,
Packit d0f5c2
    'latin2'     => 1250,
Packit d0f5c2
    'cyrillic'   => 1251,
Packit d0f5c2
    'greek'      => 1253,
Packit d0f5c2
    'turkish'    => 1254,
Packit d0f5c2
    'hebrew'     => 1255,
Packit d0f5c2
    'arabic'     => 1256,
Packit d0f5c2
    'baltic'     => 1257,
Packit d0f5c2
    'vietnamese' => 1258,
Packit d0f5c2
);
Packit d0f5c2
Packit d0f5c2
init_aliases();
Packit d0f5c2
Packit d0f5c2
sub undef_aliases {
Packit d0f5c2
    @Alias = ();
Packit d0f5c2
    %Alias = ();
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub init_aliases {
Packit d0f5c2
    undef_aliases();
Packit d0f5c2
Packit d0f5c2
    # Try all-lower-case version should all else fails
Packit d0f5c2
    define_alias( qr/^(.*)$/ => '"\L$1"' );
Packit d0f5c2
Packit d0f5c2
    # UTF/UCS stuff
Packit d0f5c2
    define_alias( qr/^(unicode-1-1-)?UTF-?7$/i     => '"UTF-7"' );
Packit d0f5c2
    define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
Packit d0f5c2
    define_alias(
Packit d0f5c2
        qr/^UCS-?2-?(BE)?$/i    => '"UCS-2BE"',
Packit d0f5c2
        qr/^UCS-?4-?(BE|LE|)?$/i => 'uc("UTF-32$1")',
Packit d0f5c2
        qr/^iso-10646-1$/i      => '"UCS-2BE"'
Packit d0f5c2
    );
Packit d0f5c2
    define_alias(
Packit d0f5c2
        qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"',
Packit d0f5c2
        qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"',
Packit d0f5c2
        qr/^UTF-?(16|32)$/i     => '"UTF-$1"',
Packit d0f5c2
    );
Packit d0f5c2
Packit d0f5c2
    # ASCII
Packit d0f5c2
    define_alias( qr/^(?:US-?)ascii$/i       => '"ascii"' );
Packit d0f5c2
    define_alias( 'C'                        => 'ascii' );
Packit d0f5c2
    define_alias( qr/\b(?:ISO[-_]?)?646(?:[-_]?US)?$/i => '"ascii"' );
Packit d0f5c2
Packit d0f5c2
    # Allow variants of iso-8859-1 etc.
Packit d0f5c2
    define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
Packit d0f5c2
Packit d0f5c2
    # At least HP-UX has these.
Packit d0f5c2
    define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
Packit d0f5c2
Packit d0f5c2
    # More HP stuff.
Packit d0f5c2
    define_alias(
Packit d0f5c2
        qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i =>
Packit d0f5c2
          '"${1}8"' );
Packit d0f5c2
Packit d0f5c2
    # The Official name of ASCII.
Packit d0f5c2
    define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
Packit d0f5c2
Packit d0f5c2
    # This is a font issue, not an encoding issue.
Packit d0f5c2
    # (The currency symbol of the Latin 1 upper half
Packit d0f5c2
    #  has been redefined as the euro symbol.)
Packit d0f5c2
    define_alias( qr/^(.+)\@euro$/i => '"$1"' );
Packit d0f5c2
Packit d0f5c2
    define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i =>
Packit d0f5c2
'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef'
Packit d0f5c2
    );
Packit d0f5c2
Packit d0f5c2
    define_alias(
Packit d0f5c2
        qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
Packit d0f5c2
             hebrew|arabic|baltic|vietnamese)$/ix =>
Packit d0f5c2
          '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}'
Packit d0f5c2
    );
Packit d0f5c2
Packit d0f5c2
    # Common names for non-latin preferred MIME names
Packit d0f5c2
    define_alias(
Packit d0f5c2
        'ascii'    => 'US-ascii',
Packit d0f5c2
        'cyrillic' => 'iso-8859-5',
Packit d0f5c2
        'arabic'   => 'iso-8859-6',
Packit d0f5c2
        'greek'    => 'iso-8859-7',
Packit d0f5c2
        'hebrew'   => 'iso-8859-8',
Packit d0f5c2
        'thai'     => 'iso-8859-11',
Packit d0f5c2
    );
Packit d0f5c2
    # RT #20781
Packit d0f5c2
    define_alias(qr/\btis-?620\b/i  => '"iso-8859-11"');
Packit d0f5c2
Packit d0f5c2
    # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
Packit d0f5c2
    # And Microsoft has their own naming (again, surprisingly).
Packit d0f5c2
    # And windows-* is registered in IANA!
Packit d0f5c2
    define_alias(
Packit d0f5c2
        qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' );
Packit d0f5c2
Packit d0f5c2
    # Sometimes seen with a leading zero.
Packit d0f5c2
    # define_alias( qr/\bcp037\b/i => '"cp37"');
Packit d0f5c2
Packit d0f5c2
    # Mac Mappings
Packit d0f5c2
    # predefined in *.ucm; unneeded
Packit d0f5c2
    # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
Packit d0f5c2
    define_alias( qr/^(?:x[_-])?mac[_-](.*)$/i => '"mac$1"' );
Packit d0f5c2
    # http://rt.cpan.org/Ticket/Display.html?id=36326
Packit d0f5c2
    define_alias( qr/^macintosh$/i => '"MacRoman"' );
Packit d0f5c2
    # https://rt.cpan.org/Ticket/Display.html?id=78125
Packit d0f5c2
    define_alias( qr/^macce$/i => '"MacCentralEurRoman"' );
Packit d0f5c2
    # Ououououou. gone.  They are different!
Packit d0f5c2
    # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
Packit d0f5c2
Packit d0f5c2
    # Standardize on the dashed versions.
Packit d0f5c2
    define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
Packit d0f5c2
Packit d0f5c2
    unless ($Encode::ON_EBCDIC) {
Packit d0f5c2
Packit d0f5c2
        # for Encode::CN
Packit d0f5c2
        define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
Packit d0f5c2
        define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
Packit d0f5c2
Packit d0f5c2
        # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
Packit d0f5c2
        # CP936 doesn't have vendor-addon for GBK, so they're identical.
Packit d0f5c2
        define_alias( qr/^gbk$/i => '"cp936"' );
Packit d0f5c2
Packit d0f5c2
        # This fixes gb2312 vs. euc-cn confusion, practically
Packit d0f5c2
        define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
Packit d0f5c2
Packit d0f5c2
        # for Encode::JP
Packit d0f5c2
        define_alias( qr/\bjis$/i         => '"7bit-jis"' );
Packit d0f5c2
        define_alias( qr/\beuc.*jp$/i     => '"euc-jp"' );
Packit d0f5c2
        define_alias( qr/\bjp.*euc$/i     => '"euc-jp"' );
Packit d0f5c2
        define_alias( qr/\bujis$/i        => '"euc-jp"' );
Packit d0f5c2
        define_alias( qr/\bshift.*jis$/i  => '"shiftjis"' );
Packit d0f5c2
        define_alias( qr/\bsjis$/i        => '"shiftjis"' );
Packit d0f5c2
        define_alias( qr/\bwindows-31j$/i => '"cp932"' );
Packit d0f5c2
Packit d0f5c2
        # for Encode::KR
Packit d0f5c2
        define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
Packit d0f5c2
        define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
Packit d0f5c2
Packit d0f5c2
        # This fixes ksc5601 vs. euc-kr confusion, practically
Packit d0f5c2
        define_alias( qr/(?:x-)?uhc$/i         => '"cp949"' );
Packit d0f5c2
        define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
Packit d0f5c2
        define_alias( qr/\bks_c_5601-1987$/i   => '"cp949"' );
Packit d0f5c2
Packit d0f5c2
        # for Encode::TW
Packit d0f5c2
        define_alias( qr/\bbig-?5$/i              => '"big5-eten"' );
Packit d0f5c2
        define_alias( qr/\bbig5-?et(?:en)?$/i     => '"big5-eten"' );
Packit d0f5c2
        define_alias( qr/\btca[-_]?big5$/i        => '"big5-eten"' );
Packit d0f5c2
        define_alias( qr/\bbig5-?hk(?:scs)?$/i    => '"big5-hkscs"' );
Packit d0f5c2
        define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
Packit d0f5c2
    }
Packit d0f5c2
Packit d0f5c2
    # https://github.com/dankogai/p5-encode/issues/37
Packit d0f5c2
    define_alias(qr/cp65000/i => '"UTF-7"');
Packit d0f5c2
    define_alias(qr/cp65001/i => '"utf-8-strict"');
Packit d0f5c2
Packit d0f5c2
    # utf8 is blessed :)
Packit d0f5c2
    define_alias( qr/\bUTF-8$/i => '"utf-8-strict"' );
Packit d0f5c2
Packit d0f5c2
    # At last, Map white space and _ to '-'
Packit d0f5c2
    define_alias( qr/^([^\s_]+)[\s_]+([^\s_]*)$/i => '"$1-$2"' );
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
1;
Packit d0f5c2
__END__
Packit d0f5c2
Packit d0f5c2
# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
Packit d0f5c2
# TODO: HP-UX '15' encodings japanese15 korean15 roi15
Packit d0f5c2
# TODO: Cyrillic encoding ISO-IR-111 (useful?)
Packit d0f5c2
# TODO: Armenian encoding ARMSCII-8
Packit d0f5c2
# TODO: Hebrew encoding ISO-8859-8-1
Packit d0f5c2
# TODO: Thai encoding TCVN
Packit d0f5c2
# TODO: Vietnamese encodings VPS
Packit d0f5c2
# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
Packit d0f5c2
#       ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
Packit d0f5c2
#       Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
Packit d0f5c2
#       Kannada Khmer Korean Laotian Malayalam Mongolian
Packit d0f5c2
#       Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
Packit d0f5c2
Packit d0f5c2
=head1 NAME
Packit d0f5c2
Packit d0f5c2
Encode::Alias - alias definitions to encodings
Packit d0f5c2
Packit d0f5c2
=head1 SYNOPSIS
Packit d0f5c2
Packit d0f5c2
  use Encode;
Packit d0f5c2
  use Encode::Alias;
Packit d0f5c2
  define_alias( "newName" => ENCODING);
Packit d0f5c2
  define_alias( qr/.../ => ENCODING);
Packit d0f5c2
  define_alias( sub { return ENCODING if ...; } );
Packit d0f5c2
Packit d0f5c2
=head1 DESCRIPTION
Packit d0f5c2
Packit d0f5c2
Allows newName to be used as an alias for ENCODING. ENCODING may be
Packit d0f5c2
either the name of an encoding or an encoding object (as described 
Packit d0f5c2
in L<Encode>).
Packit d0f5c2
Packit d0f5c2
Currently the first argument to define_alias() can be specified in the
Packit d0f5c2
following ways:
Packit d0f5c2
Packit d0f5c2
=over 4
Packit d0f5c2
Packit d0f5c2
=item As a simple string.
Packit d0f5c2
Packit d0f5c2
=item As a qr// compiled regular expression, e.g.:
Packit d0f5c2
Packit d0f5c2
  define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
Packit d0f5c2
Packit d0f5c2
In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
Packit d0f5c2
in order to allow C<$1> etc. to be substituted.  The example is one
Packit d0f5c2
way to alias names as used in X11 fonts to the MIME names for the
Packit d0f5c2
iso-8859-* family.  Note the double quotes inside the single quotes.
Packit d0f5c2
Packit d0f5c2
(or, you don't have to do this yourself because this example is predefined)
Packit d0f5c2
Packit d0f5c2
If you are using a regex here, you have to use the quotes as shown or
Packit d0f5c2
it won't work.  Also note that regex handling is tricky even for the
Packit d0f5c2
experienced.  Use this feature with caution.
Packit d0f5c2
Packit d0f5c2
=item As a code reference, e.g.:
Packit d0f5c2
Packit d0f5c2
  define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
Packit d0f5c2
Packit d0f5c2
The same effect as the example above in a different way.  The coderef
Packit d0f5c2
takes the alias name as an argument and returns a canonical name on
Packit d0f5c2
success or undef if not.  Note the second argument is ignored if provided.
Packit d0f5c2
Use this with even more caution than the regex version.
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
=head3 Changes in code reference aliasing
Packit d0f5c2
Packit d0f5c2
As of Encode 1.87, the older form
Packit d0f5c2
Packit d0f5c2
  define_alias( sub { return  /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
Packit d0f5c2
Packit d0f5c2
no longer works. 
Packit d0f5c2
Packit d0f5c2
Encode up to 1.86 internally used "local $_" to implement this older
Packit d0f5c2
form.  But consider the code below;
Packit d0f5c2
Packit d0f5c2
  use Encode;
Packit d0f5c2
  $_ = "eeeee" ;
Packit d0f5c2
  while (/(e)/g) {
Packit d0f5c2
    my $utf = decode('aliased-encoding-name', $1);
Packit d0f5c2
    print "position:",pos,"\n";
Packit d0f5c2
  }
Packit d0f5c2
Packit d0f5c2
Prior to Encode 1.86 this fails because of "local $_".
Packit d0f5c2
Packit d0f5c2
=head2 Alias overloading
Packit d0f5c2
Packit d0f5c2
You can override predefined aliases by simply applying define_alias().
Packit d0f5c2
The new alias is always evaluated first, and when necessary,
Packit d0f5c2
define_alias() flushes the internal cache to make the new definition
Packit d0f5c2
available.
Packit d0f5c2
Packit d0f5c2
  # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
Packit d0f5c2
  # superset of SHIFT_JIS
Packit d0f5c2
Packit d0f5c2
  define_alias( qr/shift.*jis$/i  => '"cp932"' );
Packit d0f5c2
  define_alias( qr/sjis$/i        => '"cp932"' );
Packit d0f5c2
Packit d0f5c2
If you want to zap all predefined aliases, you can use
Packit d0f5c2
Packit d0f5c2
  Encode::Alias->undef_aliases;
Packit d0f5c2
Packit d0f5c2
to do so.  And
Packit d0f5c2
Packit d0f5c2
  Encode::Alias->init_aliases;
Packit d0f5c2
Packit d0f5c2
gets the factory settings back.
Packit d0f5c2
Packit d0f5c2
Note that define_alias() will not be able to override the canonical name
Packit d0f5c2
of encodings. Encodings are first looked up by canonical name before
Packit d0f5c2
potential aliases are tried.
Packit d0f5c2
Packit d0f5c2
=head1 SEE ALSO
Packit d0f5c2
Packit d0f5c2
L<Encode>, L<Encode::Supported>
Packit d0f5c2
Packit d0f5c2
=cut
Packit d0f5c2