Blame encoding.pm

Packit d0f5c2
# $Id: encoding.pm,v 2.22 2018/02/11 05:32:03 dankogai Exp $
Packit d0f5c2
package encoding;
Packit d0f5c2
our $VERSION = sprintf "%d.%02d", q$Revision: 2.22 $ =~ /(\d+)/g;
Packit d0f5c2
Packit d0f5c2
use Encode;
Packit d0f5c2
use strict;
Packit d0f5c2
use warnings;
Packit d0f5c2
use Config;
Packit d0f5c2
Packit d0f5c2
# Compile-time switches used throughout this pragma.
#
#   DEBUG       - true when the PERL_ENCODE_DEBUG environment variable holds
#                 a true value; enables the diagnostic warn() calls.
#   HAS_PERLIO  - whatever PerlIO::encoding->VERSION(0.02) returns when
#                 PerlIO::encoding (0.02 or later) can be loaded, undef
#                 otherwise.  Only ever tested for truth.
#   PERL_5_21_7 - true on perl v5.21.7 and later, where the pragma is
#                 lexically scoped.
use constant {
    DEBUG => !!$ENV{PERL_ENCODE_DEBUG},

    # The eval is evaluated in the list context of this hash constructor.
    # A failing eval returns an EMPTY LIST there (not undef), which would
    # shift every following key/value pair by one.  Forcing scalar context
    # guarantees exactly one value even when PerlIO::encoding is missing
    # or too old.
    HAS_PERLIO => scalar(
        eval { require PerlIO::encoding; PerlIO::encoding->VERSION(0.02) }
    ),
    PERL_5_21_7 => $^V && $^V ge v5.21.7,    # lexically scoped
};
Packit d0f5c2
Packit d0f5c2
# Tell import() whether the encoding $name must NOT be installed as the
# script encoding.
#
# Returns 1 only when all of the following hold, and 0 otherwise:
#   * the running perl is not newer than 5.8.0 ($] <= 5.008),
#   * $name is one of the UTF/UCS names listed below, and
#   * %Config has no true 'perl_patchlevel' entry (i.e. not a maintperl).
sub _exception {
    my ($name) = @_;

    # 5.8.1 or higher: never an exception.
    return 0 if $] > 5.008;

    # Only the UTF family is special-cased.
    my @utf_names = qw(
      utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
      UTF-32 UTF-32BE UTF-32LE
    );
    my %is_utf;
    @is_utf{@utf_names} = (1) x @utf_names;
    return 0 unless $is_utf{$name};

    # A patched maintenance perl is not an exception either.
    require Config;
    Config->import();
    our %Config;
    return 0 if $Config{perl_patchlevel};
    return 1;
}
Packit d0f5c2
Packit d0f5c2
# Returns the bits of the compile-time hints word $^H that overlap
# $locale::hint_bits (treated as 0 when that variable is unset or false).
sub in_locale {
    my $locale_bits = $locale::hint_bits || 0;
    return $^H & $locale_bits;
}
Packit d0f5c2
Packit d0f5c2
# Work out the name of the encoding implied by the current locale.
#
# Takes no arguments.  Strategies are tried in this order:
#
#   1. On MSWin32 only: obtain a code page from Win32::GetConsoleOutputCP,
#      Win32::Console::OutputCP, then Win32::GetACP.  The first true value
#      wins; 65001 is reported as 'UTF-8', anything else as "cp<N>".
#   2. I18N::Langinfo's CODESET, canonicalised via find_encoding()->name.
#   3. The codeset part of POSIX::setlocale(LC_CTYPE) ("lang.codeset" or
#      "lang.codeset@modifier").  A bare "euc" codeset is disambiguated
#      from the language/country part.
#
# Returns the encoding name, or undef when nothing could be determined.
# Never dies: every probe runs inside eval.
sub _get_locale_encoding {
    my $locale_encoding;

    if ( $^O eq 'MSWin32' ) {
        my @tries = (
            # First try to get the OutputCP. This will work only if we
            # are attached to a console
            'Win32.pm'         => 'Win32::GetConsoleOutputCP',
            'Win32/Console.pm' => 'Win32::Console::OutputCP',
            # If above failed, this means that we are a GUI app
            # Let's assume that the ANSI codepage is what matters
            'Win32.pm' => 'Win32::GetACP',
        );
        while (@tries) {
            my ( $module, $function ) = splice( @tries, 0, 2 );
            my $cp = eval {
                require $module;
                no strict 'refs';
                &{$function}();
            };
            next unless $cp;

            # 65001 is the code page for UTF-8
            return $cp == 65001 ? 'UTF-8' : 'cp' . $cp;
        }
    }

    # I18N::Langinfo isn't available everywhere
    $locale_encoding = eval {
        require I18N::Langinfo;
        find_encoding(
            I18N::Langinfo::langinfo( I18N::Langinfo::CODESET() )
        )->name;
    };
    return $locale_encoding if defined $locale_encoding;

    eval {
        require POSIX;

        # Get the current locale
        # Remember that MSVCRT impl is quite different from Unixes
        my $locale = POSIX::setlocale( POSIX::LC_CTYPE() );

        # setlocale() may hand back undef; treat that as "no match".
        if ( defined $locale
            and $locale =~ /^([^.]+)\.([^.@]+)(?:@.*)?$/ )
        {
            my $country_language;
            ( $country_language, $locale_encoding ) = ( $1, $2 );

            # Could do more heuristics based on the country and language
            # since we have Locale::Country and Locale::Language available.
            # TODO: get a database of Language -> Encoding mappings
            # (the Estonian database at http://www.eki.ee/letter/
            # would be excellent!) --jhi
            if ( lc($locale_encoding) eq 'euc' ) {

                # The alternations are grouped so that ^ and $ apply to
                # every alternative; ungrouped, "^ja_JP|japan$" would also
                # accept e.g. "ja_JPX" or "xx_japan".
                if ( $country_language =~ /^(?:ja_JP|japan(?:ese)?)$/i ) {
                    $locale_encoding = 'euc-jp';
                }
                elsif ( $country_language =~ /^(?:ko_KR|korean?)$/i ) {
                    $locale_encoding = 'euc-kr';
                }
                elsif ( $country_language =~ /^(?:zh_CN|chin(?:a|ese))$/i ) {
                    $locale_encoding = 'euc-cn';
                }
                elsif ( $country_language =~ /^(?:zh_TW|taiwan(?:ese)?)$/i ) {
                    $locale_encoding = 'euc-tw';
                }
                else {
                    # NOTE: this croak is trapped by the enclosing eval, so
                    # the caller simply receives the unresolved 'euc'.
                    require Carp;
                    Carp::croak(
                        "encoding: Locale encoding '$locale_encoding' too ambiguous"
                    );
                }
            }
        }
    };

    return $locale_encoding;
}
Packit d0f5c2
Packit d0f5c2
# Compile-time entry point for "use encoding ENCNAME, %arg".
#
# Arguments (after the class name):
#   $name - encoding name.  When omitted (undef), the PERL_ENCODING
#           environment variable is consulted.  Two magic values exist:
#             ':_get_locale_encoding' - export _get_locale_encoding() into
#                                       the caller's package and return;
#             ':locale'               - use the result of
#                                       _get_locale_encoding().
#   %arg  - options:
#             Filter - true: decode the whole source through a source
#                      filter instead of setting the script encoding;
#             STDIN, STDOUT - encoding for that handle; a key that exists
#                      with a false value leaves the handle untouched.
#
# Croaks on EBCDIC platforms, when no encoding name is available, when an
# encoding is unknown to Encode, and (without Filter) on perl 5.25.3 and
# later unless $Config{usecperl} is set.
#
# Returns 1, or nothing for the ':_get_locale_encoding' request.
sub import {

    # ord("A") is 193 on EBCDIC platforms.
    if ( ord("A") == 193 ) {
        require Carp;
        Carp::croak("encoding: pragma does not support EBCDIC platforms");
    }

    # Deprecation message, or 0 when no deprecation warning applies.
    my $deprecate =
        ( $] >= 5.017 and !$Config{usecperl} )
        ? "Use of the encoding pragma is deprecated"
        : 0;

    my $class = shift;
    my $name  = shift;

    # ENCNAME is documented as optional: fall back to PERL_ENCODING before
    # deciding that no encoding was given.
    $name = $ENV{PERL_ENCODING} unless defined $name;
    if ( !$name ) {
        require Carp;
        Carp::croak("encoding: no encoding specified.");
    }
    if ( $name eq ':_get_locale_encoding' ) {    # used by lib/open.pm
        my $caller = caller();
        {
            no strict 'refs';
            *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding;
        }
        return;
    }
    $name = _get_locale_encoding() if $name eq ':locale';

    # Only takes effect on perls whose version string ends in "c"
    # (presumably cperl -- TODO confirm).
    BEGIN { strict->unimport('hashpairs') if $] >= 5.027 and $^V =~ /c$/; }
    my %arg = @_;

    # The locale lookup above may have produced undef.
    $name = $ENV{PERL_ENCODING} unless defined $name;
    my $enc = find_encoding($name);
    unless ( defined $enc ) {
        require Carp;
        Carp::croak("encoding: Unknown encoding '$name'");
    }
    $name = $enc->name;    # canonize
    unless ( $arg{Filter} ) {
        if ( $] >= 5.025003 and !$Config{usecperl} ) {
            require Carp;
            Carp::croak("The encoding pragma is no longer supported. Check cperl");
        }
        warnings::warnif( "deprecated", $deprecate ) if $deprecate;

        DEBUG and warn "_exception($name) = ", _exception($name);
        if ( !_exception($name) ) {
            if ( !PERL_5_21_7 ) {
                ${^ENCODING} = $enc;
            }
            else {
                # Starting with 5.21.7, this pragma uses a shadow variable
                # designed explicitly for it, ${^E_NCODING}, to enforce
                # lexical scope; instead of ${^ENCODING}.
                $^H{'encoding'} = 1;
                ${^E_NCODING} = $enc;
            }
        }

        # Without PerlIO::encoding the handles cannot be given layers.
        if ( !HAS_PERLIO ) {
            return 1;
        }
    }
    else {
        warnings::warnif( "deprecated", $deprecate ) if $deprecate;

        defined( ${^ENCODING} ) and undef ${^ENCODING};
        undef ${^E_NCODING} if PERL_5_21_7;

        # implicitly 'use utf8'
        require utf8;    # to fetch $utf8::hint_bits;
        $^H |= $utf8::hint_bits;

        # Decode every chunk of source that is read, using $enc.
        require Filter::Util::Call;
        Filter::Util::Call->import;
        filter_add(
            sub {
                my $status = filter_read();
                if ( $status > 0 ) {
                    $_ = $enc->decode( $_, 1 );
                    DEBUG and warn $_;
                }
                $status;
            }
        );
    }

    # A non-zero ${^UNICODE} means the handles are left alone.
    defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
    for my $h (qw(STDIN STDOUT)) {
        if ( $arg{$h} ) {
            unless ( defined find_encoding( $arg{$h} ) ) {
                require Carp;
                Carp::croak(
                    "encoding: Unknown encoding for $h, '$arg{$h}'");
            }
            binmode( $h, ":raw :encoding($arg{$h})" );
        }
        else {
            # Key absent: default to the script encoding.  Key present but
            # false (e.g. STDIN => undef): leave the handle untouched.
            unless ( exists $arg{$h} ) {
                no warnings 'uninitialized';
                binmode( $h, ":raw :encoding($name)" );
            }
        }
    }
    return 1;    # I doubt if we need it, though
}
Packit d0f5c2
Packit d0f5c2
# Entry point for "no encoding": clears the script encoding, puts STDIN and
# STDOUT back into raw/binary mode, and removes a source filter if
# Filter::Util::Call has been loaded.
sub unimport {
    no warnings;

    # Drop the script encoding (and its shadow variable on 5.21.7+).
    undef ${^ENCODING};
    if (PERL_5_21_7) {
        undef ${^E_NCODING};
    }

    # Reset the standard handles, STDIN first.
    for my $std ( \*STDIN, \*STDOUT ) {
        if (HAS_PERLIO) {
            binmode( $std, ":raw" );
        }
        else {
            binmode($std);
        }
    }

    # filter_del() only exists once Filter::Util::Call was imported;
    # any failure is deliberately ignored.
    eval { filter_del() } if $INC{"Filter/Util/Call.pm"};
}
Packit d0f5c2
Packit d0f5c2
1;
Packit d0f5c2
__END__
Packit d0f5c2
Packit d0f5c2
=pod
Packit d0f5c2
Packit d0f5c2
=head1 NAME
Packit d0f5c2
Packit d0f5c2
encoding - allows you to write your script in non-ASCII and non-UTF-8
Packit d0f5c2
Packit d0f5c2
=head1 WARNING
Packit d0f5c2
Packit d0f5c2
This module has been deprecated since perl v5.18.  See L</DESCRIPTION> and
Packit d0f5c2
L</BUGS>.
Packit d0f5c2
Packit d0f5c2
=head1 SYNOPSIS
Packit d0f5c2
Packit d0f5c2
  use encoding "greek";  # Perl like Greek to you?
Packit d0f5c2
  use encoding "euc-jp"; # Jperl!
Packit d0f5c2
Packit d0f5c2
  # or you can even do this if your shell supports your native encoding
Packit d0f5c2
Packit d0f5c2
  perl -Mencoding=latin2 -e'...' # Feeling centrally European?
Packit d0f5c2
  perl -Mencoding=euc-kr -e'...' # Or Korean?
Packit d0f5c2
Packit d0f5c2
  # more control
Packit d0f5c2
Packit d0f5c2
  # A simple euc-cn => utf-8 converter
Packit d0f5c2
  use encoding "euc-cn", STDOUT => "utf8";  while(<>){print};
Packit d0f5c2
Packit d0f5c2
  # "no encoding;" supported
Packit d0f5c2
  no encoding;
Packit d0f5c2
Packit d0f5c2
  # an alternate way, Filter
Packit d0f5c2
  use encoding "euc-jp", Filter=>1;
Packit d0f5c2
  # now you can use kanji identifiers -- in euc-jp!
Packit d0f5c2
Packit d0f5c2
  # encode based on the current locale - specialized purposes only;
Packit d0f5c2
  # fraught with danger!!
Packit d0f5c2
  use encoding ':locale';
Packit d0f5c2
Packit d0f5c2
=head1 DESCRIPTION
Packit d0f5c2
Packit d0f5c2
This pragma is used to enable a Perl script to be written in encodings that
Packit d0f5c2
aren't strictly ASCII nor UTF-8.  It translates all or portions of the Perl
Packit d0f5c2
program script from a given encoding into UTF-8, and changes the PerlIO layers
Packit d0f5c2
of C<STDIN> and C<STDOUT> to the encoding specified.
Packit d0f5c2
Packit d0f5c2
This pragma dates from the days when UTF-8-enabled editors were uncommon.  But
Packit d0f5c2
that was long ago, and the need for it is greatly diminished.  That, coupled
Packit d0f5c2
with the fact that it doesn't work with threads, along with other problems,
Packit d0f5c2
(see L</BUGS>) have led to its being deprecated.  It is planned to remove this
Packit d0f5c2
pragma in a future Perl version.  New code should be written in UTF-8, and the
Packit d0f5c2
C<use utf8> pragma used instead (see L<perluniintro> and L<utf8> for details).
Packit d0f5c2
Old code should be converted to UTF-8, via something like the recipe in the
Packit d0f5c2
L</SYNOPSIS> (though this simple approach may require manual adjustments
Packit d0f5c2
afterwards).
Packit d0f5c2
Packit d0f5c2
If UTF-8 is not an option, it is recommended that one use a simple source
Packit d0f5c2
filter, such as that provided by L<Filter::Encoding> on CPAN or this
Packit d0f5c2
pragma's own C<Filter> option (see below).
Packit d0f5c2
Packit d0f5c2
The only legitimate use of this pragma is almost certainly just one per file,
Packit d0f5c2
near the top, with file scope, as the file is likely going to only be written
Packit d0f5c2
in one encoding.  Further restrictions apply in Perls before v5.22 (see
Packit d0f5c2
L</Prior to Perl v5.22>).
Packit d0f5c2
Packit d0f5c2
There are two basic modes of operation (plus turning it off):
Packit d0f5c2
Packit d0f5c2
=over 4
Packit d0f5c2
Packit d0f5c2
=item C<use encoding ['I<ENCNAME>'] ;>
Packit d0f5c2
Packit d0f5c2
Please note: This mode of operation is no longer supported as of Perl
Packit d0f5c2
v5.26.
Packit d0f5c2
Packit d0f5c2
This is the normal operation.  It translates various literals encountered in
Packit d0f5c2
the Perl source file from the encoding I<ENCNAME> into UTF-8, and similarly
Packit d0f5c2
converts character code points.  This is used when the script is a combination
Packit d0f5c2
of ASCII (for the variable names and punctuation, I<etc>), but the literal
Packit d0f5c2
data is in the specified encoding.
Packit d0f5c2
Packit d0f5c2
I<ENCNAME> is optional.  If omitted, the encoding specified in the environment
Packit d0f5c2
variable L<C<PERL_ENCODING>|perlrun/PERL_ENCODING> is used.  If this isn't
Packit d0f5c2
set, or the resolved-to encoding is not known to C<L<Encode>>, the error
Packit d0f5c2
C<Unknown encoding 'I<ENCNAME>'> will be thrown.
Packit d0f5c2
Packit d0f5c2
Starting in Perl v5.8.6 (C<Encode> version 2.0.1), I<ENCNAME> may be the
Packit d0f5c2
name C<:locale>.  This is for very specialized applications, and is documented
Packit d0f5c2
in L</The C<:locale> sub-pragma> below.
Packit d0f5c2
Packit d0f5c2
The literals that are converted are C<q//, qq//, qr//, qw///, qx//>, and
Packit d0f5c2
starting in v5.8.1, C<tr///>.  Operations that do conversions include C<chr>,
Packit d0f5c2
C<ord>, C<utf8::upgrade> (but not C<utf8::downgrade>), and C<chomp>.
Packit d0f5c2
Packit d0f5c2
Also starting in v5.8.1, the C<DATA> pseudo-filehandle is translated from the
Packit d0f5c2
encoding into UTF-8.
Packit d0f5c2
Packit d0f5c2
For example, you can write code in EUC-JP as follows:
Packit d0f5c2
Packit d0f5c2
  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
Packit d0f5c2
               #<-char-><-char->   # 4 octets
Packit d0f5c2
  s/\bCamel\b/$Rakuda/;
Packit d0f5c2
Packit d0f5c2
And with C<use encoding "euc-jp"> in effect, it is the same thing as
Packit d0f5c2
that code in UTF-8:
Packit d0f5c2
Packit d0f5c2
  my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
Packit d0f5c2
  s/\bCamel\b/$Rakuda/;
Packit d0f5c2
Packit d0f5c2
See L</EXAMPLE> below for a more complete example.
Packit d0f5c2
Packit d0f5c2
Unless C<${^UNICODE}> (available starting in v5.8.2) exists and is non-zero, the
Packit d0f5c2
PerlIO layers of C<STDIN> and C<STDOUT> are set to "C<:encoding(I<ENCNAME>)>".
Packit d0f5c2
Therefore,
Packit d0f5c2
Packit d0f5c2
  use encoding "euc-jp";
Packit d0f5c2
  my $message = "Camel is the symbol of perl.\n";
Packit d0f5c2
  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
Packit d0f5c2
  $message =~ s/\bCamel\b/$Rakuda/;
Packit d0f5c2
  print $message;
Packit d0f5c2
Packit d0f5c2
will print
Packit d0f5c2
Packit d0f5c2
 "\xF1\xD1\xF1\xCC is the symbol of perl.\n"
Packit d0f5c2
Packit d0f5c2
not
Packit d0f5c2
Packit d0f5c2
 "\x{99F1}\x{99DD} is the symbol of perl.\n"
Packit d0f5c2
Packit d0f5c2
You can override this by giving extra arguments; see below.
Packit d0f5c2
Packit d0f5c2
Note that C<STDERR> WILL NOT be changed, regardless.
Packit d0f5c2
Packit d0f5c2
Also note that non-STD file handles remain unaffected.  Use C<use
Packit d0f5c2
open> or C<binmode> to change the layers of those.
Packit d0f5c2
Packit d0f5c2
=item C<use encoding I<ENCNAME>, Filter=E<gt>1;>
Packit d0f5c2
Packit d0f5c2
This operates as above, but the C<Filter> argument with a non-zero
Packit d0f5c2
value causes the entire script, and not just literals, to be translated from
Packit d0f5c2
the encoding into UTF-8.  This allows identifiers in the source to be in that
Packit d0f5c2
encoding as well.  (Problems may occur if the encoding is not a superset of
Packit d0f5c2
ASCII; imagine all your semi-colons being translated into something
Packit d0f5c2
different.)  One can use this form to make
Packit d0f5c2
Packit d0f5c2
 ${"\x{4eba}"}++
Packit d0f5c2
Packit d0f5c2
work.  (This is equivalent to C<$I<human>++>, where I<human> is a single Han
Packit d0f5c2
ideograph).
Packit d0f5c2
Packit d0f5c2
This effectively means that your source code behaves as if it were written in
Packit d0f5c2
UTF-8 with C<use utf8> in effect.  So even if your editor only supports
Packit d0f5c2
Shift_JIS, for example, you can still try examples in Chapter 15 of
Packit d0f5c2
C<Programming Perl, 3rd Ed.>.
Packit d0f5c2
Packit d0f5c2
This option is significantly slower than the other one.
Packit d0f5c2
Packit d0f5c2
=item C<no encoding;>
Packit d0f5c2
Packit d0f5c2
Unsets the script encoding. The layers of C<STDIN>, C<STDOUT> are
Packit d0f5c2
reset to "C<:raw>" (the default unprocessed raw stream of bytes).
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
=head1 OPTIONS
Packit d0f5c2
Packit d0f5c2
=head2 Setting C<STDIN> and/or C<STDOUT> individually
Packit d0f5c2
Packit d0f5c2
The encodings of C<STDIN> and C<STDOUT> are individually settable by parameters to
Packit d0f5c2
the pragma:
Packit d0f5c2
Packit d0f5c2
 use encoding 'euc-tw', STDIN => 'greek'  ...;
Packit d0f5c2
Packit d0f5c2
In this case, you cannot omit the first I<ENCNAME>.  C<< STDIN => undef >>
Packit d0f5c2
turns the I/O transcoding completely off for that filehandle.
Packit d0f5c2
Packit d0f5c2
When C<${^UNICODE}> (available starting in v5.8.2) exists and is non-zero,
Packit d0f5c2
these options will be completely ignored.  See L<perlvar/C<${^UNICODE}>> and
Packit d0f5c2
L<"C<-C>" in perlrun|perlrun/-C [numberE<sol>list]> for details.
Packit d0f5c2
Packit d0f5c2
=head2 The C<:locale> sub-pragma
Packit d0f5c2
Packit d0f5c2
Starting in v5.8.6, the encoding name may be C<:locale>.  This means that the
Packit d0f5c2
encoding is taken from the current locale, and not hard-coded by the pragma.
Packit d0f5c2
Since a script really can only be encoded in exactly one encoding, this option
Packit d0f5c2
is dangerous.  It makes sense only if the script itself is written in ASCII,
Packit d0f5c2
and all the possible locales that will be in use when the script is executed
Packit d0f5c2
are supersets of ASCII.  That means that the script itself doesn't get
Packit d0f5c2
changed, but the I/O handles have the specified encoding added, and the
Packit d0f5c2
operations like C<chr> and C<ord> use that encoding.
Packit d0f5c2
Packit d0f5c2
The logic of finding which locale C<:locale> uses is as follows:
Packit d0f5c2
Packit d0f5c2
=over 4
Packit d0f5c2
Packit d0f5c2
=item 1.
Packit d0f5c2
Packit d0f5c2
If the platform supports the C<langinfo(CODESET)> interface, the codeset
Packit d0f5c2
returned is used as the default encoding for the open pragma.
Packit d0f5c2
Packit d0f5c2
=item 2.
Packit d0f5c2
Packit d0f5c2
If 1. didn't work but we are under the locale pragma, the environment
Packit d0f5c2
variables C<LC_ALL> and C<LANG> (in that order) are matched for encodings
Packit d0f5c2
(the part after "C<.>", if any), and if any found, that is used
Packit d0f5c2
as the default encoding for the open pragma.
Packit d0f5c2
Packit d0f5c2
=item 3.
Packit d0f5c2
Packit d0f5c2
If 1. and 2. didn't work, the environment variables C<LC_ALL> and C<LANG>
Packit d0f5c2
(in that order) are matched for anything looking like UTF-8, and if
Packit d0f5c2
any found, C<:utf8> is used as the default encoding for the open
Packit d0f5c2
pragma.
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
If your locale environment variables (C<LC_ALL>, C<LC_CTYPE>, C<LANG>)
Packit d0f5c2
contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching),
Packit d0f5c2
the default encoding of your C<STDIN>, C<STDOUT>, and C<STDERR>, and of
Packit d0f5c2
B<any subsequent file open>, is UTF-8.
Packit d0f5c2
Packit d0f5c2
=head1 CAVEATS
Packit d0f5c2
Packit d0f5c2
=head2 SIDE EFFECTS
Packit d0f5c2
Packit d0f5c2
=over
Packit d0f5c2
Packit d0f5c2
=item *
Packit d0f5c2
Packit d0f5c2
If the C<encoding> pragma is in scope then the lengths returned are
Packit d0f5c2
calculated from the length of C<$/> in Unicode characters, which is not
Packit d0f5c2
always the same as the length of C<$/> in the native encoding.
Packit d0f5c2
Packit d0f5c2
=item *
Packit d0f5c2
Packit d0f5c2
Without this pragma, if strings operating under byte semantics and strings
Packit d0f5c2
with Unicode character data are concatenated, the new string will
Packit d0f5c2
be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>.
Packit d0f5c2
Packit d0f5c2
The B<encoding> pragma changes this to use the specified encoding
Packit d0f5c2
instead.  For example:
Packit d0f5c2
Packit d0f5c2
    use encoding 'utf8';
Packit d0f5c2
    my $string = chr(20000); # a Unicode string
Packit d0f5c2
    utf8::encode($string);   # now it's a UTF-8 encoded byte string
Packit d0f5c2
    # concatenate with another Unicode string
Packit d0f5c2
    print length($string . chr(20000));
Packit d0f5c2
Packit d0f5c2
Will print C<2>, because C<$string> is upgraded as UTF-8.  Without
Packit d0f5c2
C<use encoding 'utf8';>, it will print C<4> instead, since C<$string>
Packit d0f5c2
is three octets when interpreted as Latin-1.
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
=head2 DO NOT MIX MULTIPLE ENCODINGS
Packit d0f5c2
Packit d0f5c2
Notice that only literals (string or regular expression) having only
Packit d0f5c2
legacy code points are affected: if you mix data like this
Packit d0f5c2
Packit d0f5c2
    \x{100}\xDF
Packit d0f5c2
    \xDF\x{100}
Packit d0f5c2
Packit d0f5c2
the data is assumed to be in (Latin 1 and) Unicode, not in your native
Packit d0f5c2
encoding.  In other words, this will match in "greek":
Packit d0f5c2
Packit d0f5c2
    "\xDF" =~ /\x{3af}/
Packit d0f5c2
Packit d0f5c2
but this will not
Packit d0f5c2
Packit d0f5c2
    "\xDF\x{100}" =~ /\x{3af}\x{100}/
Packit d0f5c2
Packit d0f5c2
since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
Packit d0f5c2
the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
Packit d0f5c2
LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left.  You
Packit d0f5c2
should not be mixing your legacy data and Unicode in the same string.
Packit d0f5c2
Packit d0f5c2
This pragma also affects encoding of the 0x80..0xFF code point range:
Packit d0f5c2
normally characters in that range are left as eight-bit bytes (unless
Packit d0f5c2
they are combined with characters with code points 0x100 or larger,
Packit d0f5c2
in which case all characters need to become UTF-8 encoded), but if
Packit d0f5c2
the C<encoding> pragma is present, even the 0x80..0xFF range always
Packit d0f5c2
gets UTF-8 encoded.
Packit d0f5c2
Packit d0f5c2
After all, the best thing about this pragma is that you don't have to
Packit d0f5c2
resort to \x{....} just to spell your name in a native encoding.
Packit d0f5c2
So feel free to put your strings in your encoding in quotes and
Packit d0f5c2
regexes.
Packit d0f5c2
Packit d0f5c2
=head2 Prior to Perl v5.22
Packit d0f5c2
Packit d0f5c2
The pragma was a per script, not a per block lexical.  Only the last
Packit d0f5c2
C<use encoding> or C<no encoding> mattered, and it affected
Packit d0f5c2
B<the whole script>.  However, the C<no encoding> pragma was supported and
Packit d0f5c2
C<use encoding> could appear as many times as you want in a given script
Packit d0f5c2
(though only the last was effective).
Packit d0f5c2
Packit d0f5c2
Since the scope wasn't lexical, other modules' use of C<chr>, C<ord>, I<etc.>
Packit d0f5c2
were affected.  This leads to spooky, incorrect action at a distance that is
Packit d0f5c2
hard to debug.
Packit d0f5c2
Packit d0f5c2
This means you would have to be very careful of the load order:
Packit d0f5c2
Packit d0f5c2
  # called module
Packit d0f5c2
  package Module_IN_BAR;
Packit d0f5c2
  use encoding "bar";
Packit d0f5c2
  # stuff in "bar" encoding here
Packit d0f5c2
  1;
Packit d0f5c2
Packit d0f5c2
  # caller script
Packit d0f5c2
  use encoding "foo";
Packit d0f5c2
  use Module_IN_BAR;
Packit d0f5c2
  # surprise! use encoding "bar" is in effect.
Packit d0f5c2
Packit d0f5c2
The best way to avoid this oddity is to use this pragma RIGHT AFTER
Packit d0f5c2
other modules are loaded.  i.e.
Packit d0f5c2
Packit d0f5c2
  use Module_IN_BAR;
Packit d0f5c2
  use encoding "foo";
Packit d0f5c2
Packit d0f5c2
=head2 Prior to Encode version 1.87
Packit d0f5c2
Packit d0f5c2
=over
Packit d0f5c2
Packit d0f5c2
=item *
Packit d0f5c2
Packit d0f5c2
C<STDIN> and C<STDOUT> were not set under the filter option.
Packit d0f5c2
And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> didn't work like
Packit d0f5c2
non-filter version.
Packit d0f5c2
Packit d0f5c2
=item *
Packit d0f5c2
Packit d0f5c2
C<use utf8> wasn't implicitly declared so you have to C<use utf8> to do
Packit d0f5c2
Packit d0f5c2
 ${"\x{4eba}"}++
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
=head2 Prior to Perl v5.8.1
Packit d0f5c2
Packit d0f5c2
=over
Packit d0f5c2
Packit d0f5c2
=item "NON-EUC" doublebyte encodings
Packit d0f5c2
Packit d0f5c2
Because perl needs to parse the script before applying this pragma, such
Packit d0f5c2
encodings as Shift_JIS and Big-5 that may contain C<'\'> (BACKSLASH;
Packit d0f5c2
C<\x5c>) in the second byte fail because the second byte may
Packit d0f5c2
accidentally escape the quoting character that follows.
Packit d0f5c2
Packit d0f5c2
=item C<tr///>
Packit d0f5c2
Packit d0f5c2
The B<encoding> pragma works by decoding string literals in
Packit d0f5c2
C<q//,qq//,qr//,qw///, qx//> and so forth.  In perl v5.8.0, this
Packit d0f5c2
does not apply to C<tr///>.  Therefore,
Packit d0f5c2
Packit d0f5c2
  use encoding 'euc-jp';
Packit d0f5c2
  #....
Packit d0f5c2
  $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
Packit d0f5c2
  #           -------- -------- -------- --------
Packit d0f5c2
Packit d0f5c2
Does not work as
Packit d0f5c2
Packit d0f5c2
  $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
Packit d0f5c2
Packit d0f5c2
=over
Packit d0f5c2
Packit d0f5c2
=item Legend of characters above
Packit d0f5c2
Packit d0f5c2
  utf8     euc-jp   charnames::viacode()
Packit d0f5c2
  -----------------------------------------
Packit d0f5c2
  \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
Packit d0f5c2
  \x{3093} \xA4\xF3 HIRAGANA LETTER N
Packit d0f5c2
  \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
Packit d0f5c2
  \x{30f3} \xA5\xF3 KATAKANA LETTER N
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
This counterintuitive behavior has been fixed in perl v5.8.1.
Packit d0f5c2
Packit d0f5c2
In perl v5.8.0, you can work around this as follows;
Packit d0f5c2
Packit d0f5c2
  use encoding 'euc-jp';
Packit d0f5c2
  #  ....
Packit d0f5c2
  eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
Packit d0f5c2
Packit d0f5c2
Note the C<tr///> expression is surrounded by C<qq{}>.  The idea behind
Packit d0f5c2
this is the same as the classic idiom that makes C<tr///> 'interpolate':
Packit d0f5c2
Packit d0f5c2
   tr/$from/$to/;            # wrong!
Packit d0f5c2
   eval qq{ tr/$from/$to/ }; # workaround.
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
=head1 EXAMPLE - Greekperl
Packit d0f5c2
Packit d0f5c2
    use encoding "iso 8859-7";
Packit d0f5c2
Packit d0f5c2
    # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
Packit d0f5c2
Packit d0f5c2
    $a = "\xDF";
Packit d0f5c2
    $b = "\x{100}";
Packit d0f5c2
Packit d0f5c2
    printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
Packit d0f5c2
Packit d0f5c2
    $c = $a . $b;
Packit d0f5c2
Packit d0f5c2
    # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
Packit d0f5c2
Packit d0f5c2
    # chr() is affected, and ...
Packit d0f5c2
Packit d0f5c2
    print "mega\n"  if ord(chr(0xdf)) == 0x3af;
Packit d0f5c2
Packit d0f5c2
    # ... ord() is affected by the encoding pragma ...
Packit d0f5c2
Packit d0f5c2
    print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
Packit d0f5c2
Packit d0f5c2
    # ... as are eq and cmp ...
Packit d0f5c2
Packit d0f5c2
    print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
Packit d0f5c2
    print "exa\n"  if "\x{3af}" cmp pack("C", 0xdf) == 0;
Packit d0f5c2
Packit d0f5c2
    # ... but pack/unpack C are not affected, in case you still
Packit d0f5c2
    # want to go back to your native encoding
Packit d0f5c2
Packit d0f5c2
    print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
Packit d0f5c2
Packit d0f5c2
=head1 BUGS
Packit d0f5c2
Packit d0f5c2
=over
Packit d0f5c2
Packit d0f5c2
=item Thread safety
Packit d0f5c2
Packit d0f5c2
C<use encoding ...> is not thread-safe (i.e., do not use in threaded
Packit d0f5c2
applications).
Packit d0f5c2
Packit d0f5c2
=item Can't be used by more than one module in a single program.
Packit d0f5c2
Packit d0f5c2
Only one encoding is allowed.  If you combine modules in a program that have
Packit d0f5c2
different encodings, only one will be actually used.
Packit d0f5c2
Packit d0f5c2
=item Other modules using C<STDIN> and C<STDOUT> get the encoded stream
Packit d0f5c2
Packit d0f5c2
They may be expecting something completely different.
Packit d0f5c2
Packit d0f5c2
=item literals in regex that are longer than 127 bytes
Packit d0f5c2
Packit d0f5c2
For native multibyte encodings (either fixed or variable length),
Packit d0f5c2
the current implementation of the regular expressions may introduce
Packit d0f5c2
recoding errors for regular expression literals longer than 127 bytes.
Packit d0f5c2
Packit d0f5c2
=item EBCDIC
Packit d0f5c2
Packit d0f5c2
The encoding pragma is not supported on EBCDIC platforms.
Packit d0f5c2
Packit d0f5c2
=item C<format>
Packit d0f5c2
Packit d0f5c2
This pragma doesn't work well with C<format> because PerlIO does not
Packit d0f5c2
get along very well with it.  When C<format> contains non-ASCII
Packit d0f5c2
characters it prints funny or gets "wide character warnings".
Packit d0f5c2
To understand it, try the code below.
Packit d0f5c2
Packit d0f5c2
  # Save this one in utf8
Packit d0f5c2
  # replace *non-ascii* with a non-ascii string
Packit d0f5c2
  my $camel;
Packit d0f5c2
  format STDOUT =
Packit d0f5c2
  *non-ascii*@>>>>>>>
Packit d0f5c2
  $camel
Packit d0f5c2
  .
Packit d0f5c2
  $camel = "*non-ascii*";
Packit d0f5c2
  binmode(STDOUT=>':encoding(utf8)'); # bang!
Packit d0f5c2
  write;              # funny
Packit d0f5c2
  print $camel, "\n"; # fine
Packit d0f5c2
Packit d0f5c2
Without C<binmode>, write() happens to work, but then it is print() that
Packit d0f5c2
fails instead of write().
Packit d0f5c2
Packit d0f5c2
At any rate, the very use of C<format> is questionable when it comes to
Packit d0f5c2
unicode characters since you have to consider such things as character
Packit d0f5c2
width (i.e. double-width for ideographs) and directions (i.e. BIDI for
Packit d0f5c2
Arabic and Hebrew).
Packit d0f5c2
Packit d0f5c2
=item See also L</CAVEATS>
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
=head1 HISTORY
Packit d0f5c2
Packit d0f5c2
This pragma first appeared in Perl v5.8.0.  It has been enhanced in later
Packit d0f5c2
releases as specified above.
Packit d0f5c2
Packit d0f5c2
=head1 SEE ALSO
Packit d0f5c2
Packit d0f5c2
L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
Packit d0f5c2
Packit d0f5c2
Ch. 15 of C<Programming Perl (3rd Edition)>
Packit d0f5c2
by Larry Wall, Tom Christiansen, Jon Orwant;
Packit d0f5c2
O'Reilly & Associates; ISBN 0-596-00027-8
Packit d0f5c2
Packit d0f5c2
=cut