Blame Encode.pm

Packit d0f5c2
#
Packit d0f5c2
# $Id: Encode.pm,v 2.97 2018/02/21 12:14:24 dankogai Exp $
Packit d0f5c2
#
Packit d0f5c2
package Encode;
Packit d0f5c2
use strict;
Packit d0f5c2
use warnings;
Packit d0f5c2
use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
Packit d0f5c2
our $VERSION;
Packit d0f5c2
BEGIN {
Packit d0f5c2
    $VERSION = sprintf "%d.%02d", q$Revision: 2.97 $ =~ /(\d+)/g;
Packit d0f5c2
    require XSLoader;
Packit d0f5c2
    XSLoader::load( __PACKAGE__, $VERSION );
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
use Exporter 5.57 'import';
Packit d0f5c2
Packit d0f5c2
our @CARP_NOT = qw(Encode::Encoder);
Packit d0f5c2
Packit d0f5c2
# Public, encouraged API is exported by default
Packit d0f5c2
Packit d0f5c2
# Names exported by a plain "use Encode;".
our @EXPORT = (
    qw( decode decode_utf8 encode encode_utf8 str2bytes bytes2str ),
    qw( encodings find_encoding find_mime_encoding clone_encoding ),
);

# Individual CHECK flag names.
our @FB_FLAGS = (
    qw( DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC ),
    qw( PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL ),
);

# Named fallback constants (FB_*).
our @FB_CONSTS = (
    qw( FB_DEFAULT FB_CROAK FB_QUIET FB_WARN ),
    qw( FB_PERLQQ FB_HTMLCREF FB_XMLCREF ),
);

# Names exported only on request: helper functions plus every flag and
# fallback constant listed above.
our @EXPORT_OK = (
    qw( _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit ),
    qw( is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade ),
    @FB_FLAGS,
    @FB_CONSTS,
);

# Export tags, usable as "use Encode qw(:fallbacks);" and so on.
our %EXPORT_TAGS = (
    all          => [ @EXPORT, @EXPORT_OK ],
    default      => [@EXPORT],
    fallbacks    => [@FB_CONSTS],
    fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
);
Packit d0f5c2
Packit d0f5c2
# Documentation moved after __END__ for speed - NI-S
Packit d0f5c2
Packit d0f5c2
our $ON_EBCDIC = ( ord("A") == 193 );
Packit d0f5c2
Packit d0f5c2
use Encode::Alias ();
Packit d0f5c2
use Encode::MIME::Name;
Packit d0f5c2
Packit d0f5c2
use Storable;
Packit d0f5c2
Packit d0f5c2
# Make a %Encoding package variable to allow a certain amount of cheating
Packit d0f5c2
our %Encoding;
Packit d0f5c2
our %ExtModule;
Packit d0f5c2
require Encode::Config;
Packit d0f5c2
#  See
Packit d0f5c2
#  https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
Packit d0f5c2
#  to find why sig handlers inside eval{} are disabled.
Packit d0f5c2
eval {
    # Encode::ConfigLocal is optional: any failure to load it is swallowed
    # by this eval.  Both signal hooks are disabled for the duration so a
    # missing file neither triggers a caller's __DIE__ handler nor warns.
    local ( $SIG{__DIE__}, $SIG{__WARN__} );

    # Search a private copy of @INC with a trailing '.' removed, so the
    # module is not picked up through the current-directory entry.
    local @INC = @INC;
    if ( $INC[-1] eq '.' ) {
        pop @INC;
    }
    require Encode::ConfigLocal;
};
Packit d0f5c2
Packit d0f5c2
# encodings(CLASS [, SELECTOR ...])
#
# List encoding names, sorted case-insensitively.  The names "Internal",
# "Unicode" and "Guess" are never reported.
#
# With ":all" as the first argument after the class, every name known to
# either %Encoding or %ExtModule is returned.  Otherwise the names in
# %Encoding are returned, plus every %ExtModule entry provided by one of
# the modules named in the argument list (a name without "::" gets an
# "Encode::" prefix).
sub encodings {
    my $selector = $_[1] || '';
    my %listed   = %Encoding;

    if ( $selector eq ":all" ) {
        @listed{ keys %ExtModule } = values %ExtModule;
    }
    else {
        for my $wanted (@_) {
            my $module = $wanted =~ m/::/ ? $wanted : "Encode::$wanted";
            DEBUG and warn $module;
            for my $enc_name ( keys %ExtModule ) {
                next unless $ExtModule{$enc_name} eq $module;
                $listed{$enc_name} = $module;
            }
        }
    }

    return sort { lc $a cmp lc $b }
      grep      { !/^(?:Internal|Unicode|Guess)$/o } keys %listed;
}
Packit d0f5c2
Packit d0f5c2
# perlio_ok(ENCODING)
#
# Report whether an encoding can be used as a PerlIO layer.
#
# ENCODING is either a reference, which is used as the encoding object
# as-is, or an encoding name, which is resolved with find_encoding().
#
# Returns whatever the object's own perlio_ok() method returns.  Returns 0
# when the name cannot be resolved or the object has no perlio_ok() method.
sub perlio_ok {
    my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] );

    # find_encoding() yields undef for an unknown name; answer "no" rather
    # than dying on a method call against an undefined value.
    defined $obj or return 0;

    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0;    # safety net
}
Packit d0f5c2
Packit d0f5c2
# define_encoding(OBJ, NAME [, ALIAS ...])
#
# Register the encoding object OBJ under NAME in %Encoding.  The lowercase
# form of NAME (when it differs from NAME) and every extra ALIAS are
# registered through define_alias().  The object's class is appended to
# @Encode::CARP_NOT and @Encode::Encoding::CARP_NOT unless already listed.
#
# Returns OBJ.
sub define_encoding {
    my ( $obj, $name, @aliases ) = @_;

    $Encoding{$name} = $obj;

    my $lower = lc($name);
    define_alias( $lower => $obj ) if $lower ne $name;

    for my $alias (@aliases) {
        define_alias( $alias, $obj );
    }

    my $class = ref($obj);
    for my $carp_not ( \@Encode::CARP_NOT, \@Encode::Encoding::CARP_NOT ) {
        push @$carp_not, $class unless grep { $_ eq $class } @$carp_not;
    }

    return $obj;
}
Packit d0f5c2
Packit d0f5c2
# getEncoding(CLASS, NAME [, SKIP_EXTERNAL])
#
# Resolve NAME to an encoding object.
#
# NAME may already be an encoding object (a reference whose class provides
# a renew() method); it is then returned unchanged.  Otherwise whitespace
# is removed from NAME and the lookup proceeds in this order:
#   1. %Encoding under the exact name, then under the lowercase name;
#   2. CLASS->find_alias() with the exact name, then the lowercase name;
#   3. unless SKIP_EXTERNAL is true, the module listed for the name in
#      %ExtModule is loaded and %Encoding is consulted again.
#
# Returns the encoding object, or nothing (undef / empty list) when NAME is
# undef or cannot be resolved.
sub getEncoding {
    my ( $class, $name, $skip_external ) = @_;

    defined($name) or return;

    # Test for an encoding object before touching $name as a string: a
    # successful substitution on a reference replaces it with its
    # stringification, which would lose the object.
    ref($name) && $name->can('renew') and return $name;

    $name =~ s/\s+//g; # https://rt.cpan.org/Ticket/Display.html?id=65796

    exists $Encoding{$name} and return $Encoding{$name};
    my $lc = lc $name;
    exists $Encoding{$lc} and return $Encoding{$lc};

    my $oc = $class->find_alias($name);
    defined($oc) and return $oc;
    $lc ne $name and $oc = $class->find_alias($lc);
    defined($oc) and return $oc;

    unless ($skip_external) {
        if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) {
            $mod =~ s,::,/,g;
            $mod .= '.pm';

            # Best effort: a module that fails to load simply leaves the
            # encoding unresolved.
            eval { require $mod; };

            exists $Encoding{$name} and return $Encoding{$name};

            # The module may have been found through the lowercase key, so
            # look for the encoding under that key as well.
            exists $Encoding{$lc} and return $Encoding{$lc};
        }
    }
    return;
}
Packit d0f5c2
Packit d0f5c2
# HACK: These two functions must be defined in Encode and because of
Packit d0f5c2
# cyclic dependency between Encode and Encode::Alias, Exporter does not work
Packit d0f5c2
# Forward to Encode::Alias::find_alias with the caller's argument list;
# goto replaces this frame, so the target sees the original caller.
sub find_alias {
    my $target = \&Encode::Alias::find_alias;
    goto &$target;
}
Packit d0f5c2
# Forward to Encode::Alias::define_alias with the caller's argument list;
# goto replaces this frame, so the target sees the original caller.
sub define_alias {
    my $target = \&Encode::Alias::define_alias;
    goto &$target;
}
Packit d0f5c2
Packit d0f5c2
# find_encoding(NAME [, SKIP_EXTERNAL])
#
# Functional front end to the getEncoding() class method of this package;
# returns whatever getEncoding() returns for the same two arguments.
sub find_encoding($;$) {
    my $name          = $_[0];
    my $skip_external = $_[1];
    return __PACKAGE__->getEncoding( $name, $skip_external );
}
Packit d0f5c2
Packit d0f5c2
# find_mime_encoding(MIME_NAME [, SKIP_EXTERNAL])
#
# Translate MIME_NAME with Encode::MIME::Name::get_encode_name() and look
# the result up with find_encoding().
sub find_mime_encoding($;$) {
    my ( $mime_name, $skip_external ) = @_;
    return find_encoding(
        Encode::MIME::Name::get_encode_name($mime_name),
        $skip_external
    );
}
Packit d0f5c2
Packit d0f5c2
# resolve_alias(NAME)
#
# Return the name() of the encoding object that find_encoding() resolves
# NAME to, or nothing (undef / empty list) when NAME is unknown.
sub resolve_alias($) {
    my ($alias) = @_;
    my $encoding = find_encoding($alias);
    return unless defined $encoding;
    return $encoding->name;
}
Packit d0f5c2
Packit d0f5c2
# clone_encoding(NAME)
#
# Return a deep copy (Storable::dclone) of the encoding object that
# find_encoding() resolves NAME to, or nothing when the lookup does not
# produce a reference.
sub clone_encoding($) {
    my ($name) = @_;
    my $original = find_encoding($name);
    return unless ref $original;
    return Storable::dclone($original);
}
Packit d0f5c2
Packit d0f5c2
# encode(ENCODING, STRING [, CHECK])
#
# Encode STRING with the encoding named ENCODING and return the result.
#
# Returns undef when STRING is undef.  Croaks when ENCODING is undef or
# cannot be resolved by find_encoding().  When CHECK is a true non-reference
# value without the LEAVE_SRC bit, the working copy of STRING, as left by
# the encoder, is written back into the caller's variable.
sub encode($$;$) {
    my ( $name, $string, $check ) = @_;
    defined $string or return undef;
    $string .= '';    # stringify;
    $check = 0 unless $check;

    if ( !defined $name ) {
        require Carp;
        Carp::croak("Encoding name should not be undef");
    }

    my $enc = find_encoding($name);
    if ( !defined $enc ) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }

    my $octets;
    if ( ref($enc) ne 'Encode::Unicode' ) {
        $octets = $enc->encode( $string, $check );
    }
    else {
        # Encode::Unicode warnings are trapped here and re-issued from this
        # level so that callers can disable utf8 warnings lexically.
        my $trapped = '';
        {
            local $SIG{__WARN__} = sub { $trapped = $_[0] };
            $octets = $enc->encode( $string, $check );
        }
        warnings::warnif( 'utf8', $trapped ) if length $trapped;
    }

    if ( $check && !ref($check) && !( $check & LEAVE_SRC ) ) {
        $_[1] = $string;
    }
    return $octets;
}
*str2bytes = \&encode;
Packit d0f5c2
Packit d0f5c2
# decode(ENCODING, OCTETS [, CHECK])
#
# Decode OCTETS with the encoding named ENCODING and return the result.
#
# Returns undef when OCTETS is undef.  Croaks when ENCODING is undef or
# cannot be resolved by find_encoding().  When CHECK is a true non-reference
# value without the LEAVE_SRC bit, the working copy of OCTETS, as left by
# the decoder, is written back into the caller's variable.
sub decode($$;$) {
    my ( $name, $octets, $check ) = @_;
    return undef unless defined $octets;
    $octets .= '';    # stringify
    $check ||= 0;

    # Same guard as encode(): reject an undef name up front instead of
    # letting it reach the lookup and surface as "Unknown encoding ''"
    # together with an uninitialized-value warning.
    unless ( defined $name ) {
        require Carp;
        Carp::croak("Encoding name should not be undef");
    }

    my $enc = find_encoding($name);
    unless ( defined $enc ) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }

    # For Unicode, warnings need to be caught and re-issued at this level
    # so that callers can disable utf8 warnings lexically.
    my $string;
    if ( ref($enc) eq 'Encode::Unicode' ) {
        my $warn = '';
        {
            local $SIG{__WARN__} = sub { $warn = shift };
            $string = $enc->decode( $octets, $check );
        }
        warnings::warnif( 'utf8', $warn ) if length $warn;
    }
    else {
        $string = $enc->decode( $octets, $check );
    }

    $_[1] = $octets if $check and !ref $check and !( $check & LEAVE_SRC );
    return $string;
}
*bytes2str = \&decode;
Packit d0f5c2
Packit d0f5c2
# from_to(OCTETS, FROM, TO [, CHECK])
#
# Convert OCTETS in place from encoding FROM to encoding TO: the data is
# decoded with FROM (without CHECK) and the result encoded with TO (with
# CHECK), and the outcome is stored back into the caller's variable.
#
# Returns undef when OCTETS is undef, when CHECK is true and the decoded
# intermediate still has length after encoding, or when the stored result
# is undef; otherwise returns the length of the converted data.  Croaks
# when FROM or TO cannot be resolved by find_encoding().
sub from_to($$$;$) {
    my ( $string, $from, $to, $check ) = @_;
    defined $string or return undef;
    $check = 0 unless $check;

    my $f = find_encoding($from);
    if ( !defined $f ) {
        require Carp;
        Carp::croak("Unknown encoding '$from'");
    }
    my $t = find_encoding($to);
    if ( !defined $t ) {
        require Carp;
        Carp::croak("Unknown encoding '$to'");
    }

    # Encode::Unicode warnings are trapped and re-issued from this level so
    # that callers can disable utf8 warnings lexically.
    my $uni;
    if ( ref($f) ne 'Encode::Unicode' ) {
        $uni = $f->decode($string);
    }
    else {
        my $trapped = '';
        {
            local $SIG{__WARN__} = sub { $trapped = $_[0] };
            $uni = $f->decode($string);
        }
        warnings::warnif( 'utf8', $trapped ) if length $trapped;
    }

    if ( ref($t) ne 'Encode::Unicode' ) {
        $string = $t->encode( $uni, $check );
        $_[0]   = $string;
    }
    else {
        my $trapped = '';
        {
            local $SIG{__WARN__} = sub { $trapped = $_[0] };
            $string = $t->encode( $uni, $check );
            $_[0]   = $string;
        }
        warnings::warnif( 'utf8', $trapped ) if length $trapped;
    }

    if ( $check && length($uni) ) {
        return undef;
    }
    return undef unless defined( $_[0] );
    return length($string);
}
Packit d0f5c2
Packit d0f5c2
# encode_utf8(STRING)
#
# Return a copy of STRING converted with utf8::encode(); the caller's
# variable is left untouched.  Returns undef when STRING is undef.
sub encode_utf8($) {
    my $octets = $_[0];
    defined $octets or return undef;
    utf8::encode($octets);
    return $octets;
}
Packit d0f5c2
Packit d0f5c2
# Cache for the 'utf8' encoding object, filled on first use by decode_utf8().
my $utf8enc;

# decode_utf8(OCTETS [, CHECK])
#
# Decode OCTETS with the 'utf8' encoding object and return the result.
# Returns undef when OCTETS is undef.  When CHECK is a true non-reference
# value without the LEAVE_SRC bit, the working copy of OCTETS, as left by
# the decoder, is written back into the caller's variable.
sub decode_utf8($;$) {
    my ( $octets, $check ) = @_;
    defined $octets or return undef;
    $octets .= '';    # stringify
    $check   = 0                     unless $check;
    $utf8enc = find_encoding('utf8') unless $utf8enc;

    my $string = $utf8enc->decode( $octets, $check );
    if ( $check && !ref($check) && !( $check & LEAVE_SRC ) ) {
        $_[0] = $octets;
    }
    return $string;
}
Packit d0f5c2
Packit d0f5c2
onBOOT;
Packit d0f5c2
Packit d0f5c2
# Register the pure-Perl encoding that is filed under the name 'Unicode'.
# Which object is registered depends on $ON_EBCDIC.
if ($ON_EBCDIC) {
    package Encode::UTF_EBCDIC;
    use parent 'Encode::Encoding';
    my $codec = bless( { Name => "UTF_EBCDIC" }, "Encode::UTF_EBCDIC" );
    Encode::define_encoding( $codec, 'Unicode' );

    # Map every character of the input through utf8::unicode_to_native().
    # With a true CHECK the caller's source variable is emptied.
    sub decode {
        my ( undef, $str, $chk ) = @_;
        my $res = '';
        for my $idx ( 0 .. length($str) - 1 ) {
            my $code = ord( substr( $str, $idx, 1 ) );
            $res .= chr( utf8::unicode_to_native($code) );
        }
        if ($chk) { $_[1] = '' }
        return $res;
    }

    # Map every character of the input through utf8::native_to_unicode().
    # With a true CHECK the caller's source variable is emptied.
    sub encode {
        my ( undef, $str, $chk ) = @_;
        my $res = '';
        for my $idx ( 0 .. length($str) - 1 ) {
            my $code = ord( substr( $str, $idx, 1 ) );
            $res .= chr( utf8::native_to_unicode($code) );
        }
        if ($chk) { $_[1] = '' }
        return $res;
    }
}
else {
    package Encode::Internal;
    use parent 'Encode::Encoding';
    my $codec = bless( { Name => "Internal" }, "Encode::Internal" );
    Encode::define_encoding( $codec, 'Unicode' );

    # Return an upgraded (utf8::upgrade) copy of the input.  With a true
    # CHECK the caller's source variable is emptied.
    sub decode {
        my ( undef, $str, $chk ) = @_;
        utf8::upgrade($str);
        if ($chk) { $_[1] = '' }
        return $str;
    }

    # Encoding uses the very same routine as decoding.
    *encode = \&decode;
}
Packit d0f5c2
Packit d0f5c2
{
    # Make Encode::XS a subclass of Encode::Encoding.
    # Background: https://rt.cpan.org/Public/Bug/Display.html?id=103253
    package Encode::XS;
    use parent qw(Encode::Encoding);
}
Packit d0f5c2
Packit d0f5c2
{
    package Encode::utf8;
    use parent 'Encode::Encoding';

    # The two encoding objects implemented by this class, keyed by the name
    # each one is registered under.
    my %obj = (
        'utf8'         => { Name => 'utf8' },
        'utf-8-strict' => { Name => 'utf-8-strict', strict_utf8 => 1 }
    );
    for my $enc_name ( keys %obj ) {
        my $enc = $obj{$enc_name};
        bless $enc => __PACKAGE__;
        Encode::define_encoding( $enc => $enc_name );
    }

    # cat_decode($obj, $dst, $src, $pos, $trm, $chk)
    #
    # Append to $dst the part of $src that starts at $pos and runs up to
    # and including the first occurrence of $trm, advance $pos past it and
    # return 1.  When $trm does not occur, append the rest of $src, set
    # $pos to the length of $src and return ''.  $dst and $pos are updated
    # in the caller's variables; all offsets and lengths are byte-based
    # ("use bytes").  $chk is currently ignored.
    sub cat_decode {
        my ( $dst_ref, $src_ref, $pos_ref ) = \@_[ 1, 2, 3 ];
        my $start      = $_[3];
        my $terminator = $_[4];
        use bytes;

        my $found = index( $$src_ref, $terminator, $start );
        if ( $found < 0 ) {
            $$dst_ref .= substr( $$src_ref, $start );
            $$pos_ref = length($$src_ref);
            return '';
        }

        my $end = $found + length($terminator);
        $$dst_ref .= substr( $$src_ref, $start, $end - $start );
        $$pos_ref = $end;
        return 1;
    }
}
Packit d0f5c2
Packit d0f5c2
1;
Packit d0f5c2
Packit d0f5c2
__END__
Packit d0f5c2
Packit d0f5c2
=head1 NAME
Packit d0f5c2
Packit d0f5c2
Encode - character encodings in Perl
Packit d0f5c2
Packit d0f5c2
=head1 SYNOPSIS
Packit d0f5c2
Packit d0f5c2
    use Encode qw(decode encode);
Packit d0f5c2
    $characters = decode('UTF-8', $octets,     Encode::FB_CROAK);
Packit d0f5c2
    $octets     = encode('UTF-8', $characters, Encode::FB_CROAK);
Packit d0f5c2
Packit d0f5c2
=head2 Table of Contents
Packit d0f5c2
Packit d0f5c2
Encode consists of a collection of modules whose details are too extensive
Packit d0f5c2
to fit in one document.  This one itself explains the top-level APIs
Packit d0f5c2
and general topics at a glance.  For other topics and more details,
Packit d0f5c2
see the documentation for these modules:
Packit d0f5c2
Packit d0f5c2
=over 2
Packit d0f5c2
Packit d0f5c2
=item L<Encode::Alias> - Alias definitions to encodings
Packit d0f5c2
Packit d0f5c2
=item L<Encode::Encoding> - Encode Implementation Base Class
Packit d0f5c2
Packit d0f5c2
=item L<Encode::Supported> - List of Supported Encodings
Packit d0f5c2
Packit d0f5c2
=item L<Encode::CN> - Simplified Chinese Encodings
Packit d0f5c2
Packit d0f5c2
=item L<Encode::JP> - Japanese Encodings
Packit d0f5c2
Packit d0f5c2
=item L<Encode::KR> - Korean Encodings
Packit d0f5c2
Packit d0f5c2
=item L<Encode::TW> - Traditional Chinese Encodings
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
=head1 DESCRIPTION
Packit d0f5c2
Packit d0f5c2
The C<Encode> module provides the interface between Perl strings
Packit d0f5c2
and the rest of the system.  Perl strings are sequences of
Packit d0f5c2
I<characters>.
Packit d0f5c2
Packit d0f5c2
The repertoire of characters that Perl can represent is a superset of those
Packit d0f5c2
defined by the Unicode Consortium. On most platforms the ordinal
Packit d0f5c2
values of a character as returned by C<ord(I<S>)> is the I<Unicode
Packit d0f5c2
codepoint> for that character. The exceptions are platforms where
Packit d0f5c2
the legacy encoding is some variant of EBCDIC rather than a superset
Packit d0f5c2
of ASCII; see L<perlebcdic>.
Packit d0f5c2
Packit d0f5c2
During recent history, data is moved around a computer in 8-bit chunks,
Packit d0f5c2
often called "bytes" but also known as "octets" in standards documents.
Packit d0f5c2
Perl is widely used to manipulate data of many types: not only strings of
Packit d0f5c2
characters representing human or computer languages, but also "binary"
Packit d0f5c2
data, being the machine's representation of numbers, pixels in an image, or
Packit d0f5c2
just about anything.
Packit d0f5c2
Packit d0f5c2
When Perl is processing "binary data", the programmer wants Perl to
Packit d0f5c2
process "sequences of bytes". This is not a problem for Perl: because a
Packit d0f5c2
byte has 256 possible values, it easily fits in Perl's much larger
Packit d0f5c2
"logical character".
Packit d0f5c2
Packit d0f5c2
This document mostly explains the I<how>. L<perlunitut> and L<perlunifaq>
Packit d0f5c2
explain the I<why>.
Packit d0f5c2
Packit d0f5c2
=head2 TERMINOLOGY
Packit d0f5c2
Packit d0f5c2
=head3 character
Packit d0f5c2
Packit d0f5c2
A character in the range 0 .. 2**32-1 (or more);
Packit d0f5c2
what Perl's strings are made of.
Packit d0f5c2
Packit d0f5c2
=head3 byte
Packit d0f5c2
Packit d0f5c2
A character in the range 0..255;
Packit d0f5c2
a special case of a Perl character.
Packit d0f5c2
Packit d0f5c2
=head3 octet
Packit d0f5c2
Packit d0f5c2
8 bits of data, with ordinal values 0..255;
Packit d0f5c2
term for bytes passed to or from a non-Perl context, such as a disk file,
Packit d0f5c2
standard I/O stream, database, command-line argument, environment variable,
Packit d0f5c2
socket etc.
Packit d0f5c2
Packit d0f5c2
=head1 THE PERL ENCODING API
Packit d0f5c2
Packit d0f5c2
=head2 Basic methods
Packit d0f5c2
Packit d0f5c2
=head3 encode
Packit d0f5c2
Packit d0f5c2
  $octets  = encode(ENCODING, STRING[, CHECK])
Packit d0f5c2
Packit d0f5c2
Encodes the scalar value I<STRING> from Perl's internal form into
Packit d0f5c2
I<ENCODING> and returns a sequence of octets.  I<ENCODING> can be either a
Packit d0f5c2
canonical name or an alias.  For encoding names and aliases, see
Packit d0f5c2
L</"Defining Aliases">.  For CHECK, see L</"Handling Malformed Data">.
Packit d0f5c2
Packit d0f5c2
B<CAVEAT>: the input scalar I<STRING> might be modified in-place depending
Packit d0f5c2
on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
Packit d0f5c2
left unchanged.
Packit d0f5c2
Packit d0f5c2
For example, to convert a string from Perl's internal format into
Packit d0f5c2
ISO-8859-1, also known as Latin1:
Packit d0f5c2
Packit d0f5c2
  $octets = encode("iso-8859-1", $string);
Packit d0f5c2
Packit d0f5c2
B<CAVEAT>: When you run C<$octets = encode("UTF-8", $string)>, then
Packit d0f5c2
$octets I<might not be equal to> $string.  Though both contain the
Packit d0f5c2
same data, the UTF8 flag for $octets is I<always> off.  When you
Packit d0f5c2
encode anything, the UTF8 flag on the result is always off, even when it
Packit d0f5c2
contains a completely valid UTF-8 string. See L</"The UTF8 flag"> below.
Packit d0f5c2
Packit d0f5c2
If the $string is C<undef>, then C<undef> is returned.
Packit d0f5c2
Packit d0f5c2
C<str2bytes> may be used as an alias for C<encode>.
Packit d0f5c2
Packit d0f5c2
=head3 decode
Packit d0f5c2
Packit d0f5c2
  $string = decode(ENCODING, OCTETS[, CHECK])
Packit d0f5c2
Packit d0f5c2
This function returns the string that results from decoding the scalar
Packit d0f5c2
value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into
Packit d0f5c2
Perl's internal form.  As with encode(),
Packit d0f5c2
I<ENCODING> can be either a canonical name or an alias. For encoding names
Packit d0f5c2
and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling
Packit d0f5c2
Malformed Data">.
Packit d0f5c2
Packit d0f5c2
B<CAVEAT>: the input scalar I<OCTETS> might be modified in-place depending
Packit d0f5c2
on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
Packit d0f5c2
left unchanged.
Packit d0f5c2
Packit d0f5c2
For example, to convert ISO-8859-1 data into a string in Perl's
Packit d0f5c2
internal format:
Packit d0f5c2
Packit d0f5c2
  $string = decode("iso-8859-1", $octets);
Packit d0f5c2
Packit d0f5c2
B<CAVEAT>: When you run C<$string = decode("UTF-8", $octets)>, then $string
Packit d0f5c2
I<might not be equal to> $octets.  Though both contain the same data, the
Packit d0f5c2
UTF8 flag for $string is on.  See L</"The UTF8 flag">
Packit d0f5c2
below.
Packit d0f5c2
Packit d0f5c2
If the $string is C<undef>, then C<undef> is returned.
Packit d0f5c2
Packit d0f5c2
C<bytes2str> may be used as an alias for C<decode>.
Packit d0f5c2
Packit d0f5c2
=head3 find_encoding
Packit d0f5c2
Packit d0f5c2
  [$obj =] find_encoding(ENCODING)
Packit d0f5c2
Packit d0f5c2
Returns the I<encoding object> corresponding to I<ENCODING>.  Returns
Packit d0f5c2
C<undef> if no matching I<ENCODING> is found.  The returned object is
Packit d0f5c2
what does the actual encoding or decoding.
Packit d0f5c2
Packit d0f5c2
  $string = decode($name, $bytes);
Packit d0f5c2
Packit d0f5c2
is in fact
Packit d0f5c2
Packit d0f5c2
    $string = do {
Packit d0f5c2
        $obj = find_encoding($name);
Packit d0f5c2
        croak qq(encoding "$name" not found) unless ref $obj;
Packit d0f5c2
        $obj->decode($bytes);
Packit d0f5c2
    };
Packit d0f5c2
Packit d0f5c2
with more error checking.
Packit d0f5c2
Packit d0f5c2
You can therefore save time by reusing this object as follows:
Packit d0f5c2
Packit d0f5c2
    my $enc = find_encoding("iso-8859-1");
Packit d0f5c2
    while(<>) {
Packit d0f5c2
        my $string = $enc->decode($_);
Packit d0f5c2
        ... # now do something with $string;
Packit d0f5c2
    }
Packit d0f5c2
Packit d0f5c2
Besides L</decode> and L</encode>, other methods are
Packit d0f5c2
available as well.  For instance, C<name()> returns the canonical
Packit d0f5c2
name of the encoding object.
Packit d0f5c2
Packit d0f5c2
  find_encoding("latin1")->name; # iso-8859-1
Packit d0f5c2
Packit d0f5c2
See L<Encode::Encoding> for details.
Packit d0f5c2
Packit d0f5c2
=head3 find_mime_encoding
Packit d0f5c2
Packit d0f5c2
  [$obj =] find_mime_encoding(MIME_ENCODING)
Packit d0f5c2
Packit d0f5c2
Returns the I<encoding object> corresponding to I<MIME_ENCODING>.  Acts
Packit d0f5c2
same as C<find_encoding()> but C<mime_name()> of returned object must
Packit d0f5c2
match I<MIME_ENCODING>.  So, unlike with C<find_encoding()>,
Packit d0f5c2
canonical names and aliases are not used when searching for the object.
Packit d0f5c2
Packit d0f5c2
    find_mime_encoding("utf8"); # returns undef because "utf8" is not valid I<MIME_ENCODING>
Packit d0f5c2
    find_mime_encoding("utf-8"); # returns encode object "utf-8-strict"
Packit d0f5c2
    find_mime_encoding("UTF-8"); # same as "utf-8" because I<MIME_ENCODING> is case insensitive
Packit d0f5c2
    find_mime_encoding("utf-8-strict"); # returns undef because "utf-8-strict" is not valid I<MIME_ENCODING>
Packit d0f5c2
Packit d0f5c2
=head3 from_to
Packit d0f5c2
Packit d0f5c2
  [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
Packit d0f5c2
Packit d0f5c2
Converts I<in-place> data between two encodings. The data in $octets
Packit d0f5c2
must be encoded as octets and I<not> as characters in Perl's internal
Packit d0f5c2
format. For example, to convert ISO-8859-1 data into Microsoft's CP1250
Packit d0f5c2
encoding:
Packit d0f5c2
Packit d0f5c2
  from_to($octets, "iso-8859-1", "cp1250");
Packit d0f5c2
Packit d0f5c2
and to convert it back:
Packit d0f5c2
Packit d0f5c2
  from_to($octets, "cp1250", "iso-8859-1");
Packit d0f5c2
Packit d0f5c2
Because the conversion happens in place, the data to be
Packit d0f5c2
converted cannot be a string constant: it must be a scalar variable.
Packit d0f5c2
Packit d0f5c2
C<from_to()> returns the length of the converted string in octets on success,
Packit d0f5c2
and C<undef> on error.
Packit d0f5c2
Packit d0f5c2
B<CAVEAT>: The following operations may look the same, but are not:
Packit d0f5c2
Packit d0f5c2
  from_to($data, "iso-8859-1", "UTF-8"); #1
Packit d0f5c2
  $data = decode("iso-8859-1", $data);  #2
Packit d0f5c2
Packit d0f5c2
Both #1 and #2 make $data consist of a completely valid UTF-8 string,
Packit d0f5c2
but only #2 turns the UTF8 flag on.  #1 is equivalent to:
Packit d0f5c2
Packit d0f5c2
  $data = encode("UTF-8", decode("iso-8859-1", $data));
Packit d0f5c2
Packit d0f5c2
See L</"The UTF8 flag"> below.
Packit d0f5c2
Packit d0f5c2
Also note that:
Packit d0f5c2
Packit d0f5c2
  from_to($octets, $from, $to, $check);
Packit d0f5c2
Packit d0f5c2
is equivalent to:
Packit d0f5c2
Packit d0f5c2
  $octets = encode($to, decode($from, $octets), $check);
Packit d0f5c2
Packit d0f5c2
Yes, it does I<not> respect the $check during decoding.  It is
Packit d0f5c2
deliberately done that way.  If you need minute control, use C<decode>
Packit d0f5c2
followed by C<encode> as follows:
Packit d0f5c2
Packit d0f5c2
  $octets = encode($to, decode($from, $octets, $check_from), $check_to);
Packit d0f5c2
Packit d0f5c2
=head3 encode_utf8
Packit d0f5c2
Packit d0f5c2
  $octets = encode_utf8($string);
Packit d0f5c2
Packit d0f5c2
Equivalent to C<$octets = encode("utf8", $string)>.  The characters in
Packit d0f5c2
$string are encoded in Perl's internal format, and the result is returned
Packit d0f5c2
as a sequence of octets.  Because all possible characters in Perl have a
Packit d0f5c2
(loose, not strict) utf8 representation, this function cannot fail.
Packit d0f5c2
Packit d0f5c2
B<WARNING>: do not use this function for data exchange as it can produce
Packit d0f5c2
not strict utf8 $octets! For strictly valid UTF-8 output use
Packit d0f5c2
C<$octets = encode("UTF-8", $string)>.
Packit d0f5c2
Packit d0f5c2
=head3 decode_utf8
Packit d0f5c2
Packit d0f5c2
  $string = decode_utf8($octets [, CHECK]);
Packit d0f5c2
Packit d0f5c2
Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
Packit d0f5c2
The sequence of octets represented by $octets is decoded
Packit d0f5c2
from (loose, not strict) utf8 into a sequence of logical characters.
Packit d0f5c2
Because not all sequences of octets are valid (even non-strict) utf8,
Packit d0f5c2
it is quite possible for this function to fail.
Packit d0f5c2
For CHECK, see L</"Handling Malformed Data">.
Packit d0f5c2
Packit d0f5c2
B<WARNING>: do not use this function for data exchange as it can produce
Packit d0f5c2
$string with not strict utf8 representation! For strictly valid UTF-8
Packit d0f5c2
$string representation use C<$string = decode("UTF-8", $octets [, CHECK])>.
Packit d0f5c2
Packit d0f5c2
B<CAVEAT>: the input I<$octets> might be modified in-place depending on
Packit d0f5c2
what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
Packit d0f5c2
left unchanged.
Packit d0f5c2
Packit d0f5c2
=head2 Listing available encodings
Packit d0f5c2
Packit d0f5c2
  use Encode;
Packit d0f5c2
  @list = Encode->encodings();
Packit d0f5c2
Packit d0f5c2
Returns a list of canonical names of available encodings that have already
Packit d0f5c2
been loaded.  To get a list of all available encodings including those that
Packit d0f5c2
have not yet been loaded, say:
Packit d0f5c2
Packit d0f5c2
  @all_encodings = Encode->encodings(":all");
Packit d0f5c2
Packit d0f5c2
Or you can give the name of a specific module:
Packit d0f5c2
Packit d0f5c2
  @with_jp = Encode->encodings("Encode::JP");
Packit d0f5c2
Packit d0f5c2
When "C<::>" is not in the name, "C<Encode::>" is assumed.
Packit d0f5c2
Packit d0f5c2
  @ebcdic = Encode->encodings("EBCDIC");
Packit d0f5c2
Packit d0f5c2
To find out in detail which encodings are supported by this package,
Packit d0f5c2
see L<Encode::Supported>.
Packit d0f5c2
Packit d0f5c2
=head2 Defining Aliases
Packit d0f5c2
Packit d0f5c2
To add a new alias to a given encoding, use:
Packit d0f5c2
Packit d0f5c2
  use Encode;
Packit d0f5c2
  use Encode::Alias;
Packit d0f5c2
  define_alias(NEWNAME => ENCODING);
Packit d0f5c2
Packit d0f5c2
After that, I<NEWNAME> can be used as an alias for I<ENCODING>.
Packit d0f5c2
I<ENCODING> may be either the name of an encoding or an
Packit d0f5c2
I<encoding object>.
Packit d0f5c2
Packit d0f5c2
Before you do that, first make sure the alias is nonexistent using
Packit d0f5c2
C<resolve_alias()>, which returns the canonical name thereof.
Packit d0f5c2
For example:
Packit d0f5c2
Packit d0f5c2
  Encode::resolve_alias("latin1") eq "iso-8859-1" # true
Packit d0f5c2
  Encode::resolve_alias("iso-8859-12")   # false; nonexistent
Packit d0f5c2
  Encode::resolve_alias($name) eq $name  # true if $name is canonical
Packit d0f5c2
Packit d0f5c2
C<resolve_alias()> does not need C<use Encode::Alias>; it can be
Packit d0f5c2
imported via C<use Encode qw(resolve_alias)>.
Packit d0f5c2
Packit d0f5c2
See L<Encode::Alias> for details.
Packit d0f5c2
Packit d0f5c2
=head2 Finding IANA Character Set Registry names
Packit d0f5c2
Packit d0f5c2
The canonical name of a given encoding does not necessarily agree with
Packit d0f5c2
IANA Character Set Registry, commonly seen as C<< Content-Type:
Packit d0f5c2
text/plain; charset=I<WHATEVER> >>.  For most cases, the canonical name
Packit d0f5c2
works, but sometimes it does not, most notably with "utf-8-strict".
Packit d0f5c2
Packit d0f5c2
As of C<Encode> version 2.21, a new method C<mime_name()> is therefore added.
Packit d0f5c2
Packit d0f5c2
  use Encode;
Packit d0f5c2
  my $enc = find_encoding("UTF-8");
Packit d0f5c2
  warn $enc->name;      # utf-8-strict
Packit d0f5c2
  warn $enc->mime_name; # UTF-8
Packit d0f5c2
Packit d0f5c2
See also:  L<Encode::Encoding>
Packit d0f5c2
Packit d0f5c2
=head1 Encoding via PerlIO
Packit d0f5c2
Packit d0f5c2
If your perl supports C<PerlIO> (which is the default), you can use a
Packit d0f5c2
C<PerlIO> layer to decode and encode directly via a filehandle.  The
Packit d0f5c2
following two examples are fully identical in functionality:
Packit d0f5c2
Packit d0f5c2
  ### Version 1 via PerlIO
Packit d0f5c2
    open(INPUT,  "< :encoding(shiftjis)", $infile)
Packit d0f5c2
        || die "Can't open < $infile for reading: $!";
Packit d0f5c2
    open(OUTPUT, "> :encoding(euc-jp)",  $outfile)
Packit d0f5c2
        || die "Can't open > $outfile for writing: $!";
Packit d0f5c2
    while (<INPUT>) {   # auto decodes $_
Packit d0f5c2
        print OUTPUT;   # auto encodes $_
Packit d0f5c2
    }
Packit d0f5c2
    close(INPUT)   || die "can't close $infile: $!";
Packit d0f5c2
    close(OUTPUT)  || die "can't close $outfile: $!";
Packit d0f5c2
Packit d0f5c2
  ### Version 2 via from_to()
Packit d0f5c2
    open(INPUT,  "< :raw", $infile)
Packit d0f5c2
        || die "Can't open < $infile for reading: $!";
Packit d0f5c2
    open(OUTPUT, "> :raw",  $outfile)
Packit d0f5c2
        || die "Can't open > $outfile for writing: $!";
Packit d0f5c2
Packit d0f5c2
    while (<INPUT>) {
Packit d0f5c2
        from_to($_, "shiftjis", "euc-jp", 1);  # switch encoding
Packit d0f5c2
        print OUTPUT;   # emit raw (but properly encoded) data
Packit d0f5c2
    }
Packit d0f5c2
    close(INPUT)   || die "can't close $infile: $!";
Packit d0f5c2
    close(OUTPUT)  || die "can't close $outfile: $!";
Packit d0f5c2
Packit d0f5c2
In the first version above, you let the appropriate encoding layer
Packit d0f5c2
handle the conversion.  In the second, you explicitly translate
Packit d0f5c2
from one encoding to the other.
Packit d0f5c2
Packit d0f5c2
Unfortunately, some encodings may not be C<PerlIO>-savvy.  You can check
Packit d0f5c2
to see whether your encoding is supported by C<PerlIO> by invoking the
Packit d0f5c2
C<perlio_ok> method on it:
Packit d0f5c2
Packit d0f5c2
  Encode::perlio_ok("hz");             # false
Packit d0f5c2
  find_encoding("euc-cn")->perlio_ok;  # true wherever PerlIO is available
Packit d0f5c2
Packit d0f5c2
  use Encode qw(perlio_ok);            # imported upon request
Packit d0f5c2
  perlio_ok("euc-jp")
Packit d0f5c2
Packit d0f5c2
Fortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy
Packit d0f5c2
except for C<hz> and C<ISO-2022-kr>.  For the gory details, see
Packit d0f5c2
L<Encode::Encoding> and L<Encode::PerlIO>.
Packit d0f5c2
Packit d0f5c2
=head1 Handling Malformed Data
Packit d0f5c2
Packit d0f5c2
The optional I<CHECK> argument tells C<Encode> what to do when
Packit d0f5c2
encountering malformed data.  Without I<CHECK>, C<Encode::FB_DEFAULT>
Packit d0f5c2
(== 0) is assumed.
Packit d0f5c2
Packit d0f5c2
As of version 2.12, C<Encode> supports coderef values for C<CHECK>;
Packit d0f5c2
see below.
Packit d0f5c2
Packit d0f5c2
B<NOTE:> Not all encodings support this feature.
Packit d0f5c2
Some encodings ignore the I<CHECK> argument.  For example,
Packit d0f5c2
L<Encode::Unicode> ignores I<CHECK> and it always croaks on error.
Packit d0f5c2
Packit d0f5c2
=head2 List of I<CHECK> values
Packit d0f5c2
Packit d0f5c2
=head3 FB_DEFAULT
Packit d0f5c2
Packit d0f5c2
  I<CHECK> = Encode::FB_DEFAULT ( == 0)
Packit d0f5c2
Packit d0f5c2
If I<CHECK> is 0, encoding and decoding replace any malformed character
Packit d0f5c2
with a I<substitution character>.  When you encode, I<SUBCHAR> is used.
Packit d0f5c2
When you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is
Packit d0f5c2
used.  If the data is supposed to be UTF-8, an optional lexical warning of
Packit d0f5c2
warning category C<"utf8"> is given.
Packit d0f5c2
Packit d0f5c2
=head3 FB_CROAK
Packit d0f5c2
Packit d0f5c2
  I<CHECK> = Encode::FB_CROAK ( == 1)
Packit d0f5c2
Packit d0f5c2
If I<CHECK> is 1, methods immediately die with an error
Packit d0f5c2
message.  Therefore, when I<CHECK> is 1, you should trap
Packit d0f5c2
exceptions with C<eval{}>, unless you really want to let it C<die>.
Packit d0f5c2
Packit d0f5c2
=head3 FB_QUIET
Packit d0f5c2
Packit d0f5c2
  I<CHECK> = Encode::FB_QUIET
Packit d0f5c2
Packit d0f5c2
If I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately
Packit d0f5c2
return the portion of the data that has been processed so far when an
Packit d0f5c2
error occurs. The data argument is overwritten with everything
Packit d0f5c2
after that point; that is, the unprocessed portion of the data.  This is
Packit d0f5c2
handy when you have to call C<decode> repeatedly in the case where your
Packit d0f5c2
source data may contain partial multi-byte character sequences,
Packit d0f5c2
(that is, you are reading with a fixed-width buffer). Here's some sample
Packit d0f5c2
code to do exactly that:
Packit d0f5c2
Packit d0f5c2
    my($buffer, $string) = ("", "");
Packit d0f5c2
    while (read($fh, $buffer, 256, length($buffer))) {
Packit d0f5c2
        $string .= decode($encoding, $buffer, Encode::FB_QUIET);
Packit d0f5c2
        # $buffer now contains the unprocessed partial character
Packit d0f5c2
    }
Packit d0f5c2
Packit d0f5c2
=head3 FB_WARN
Packit d0f5c2
Packit d0f5c2
  I<CHECK> = Encode::FB_WARN
Packit d0f5c2
Packit d0f5c2
This is the same as C<FB_QUIET> above, except that instead of being silent
Packit d0f5c2
on errors, it issues a warning.  This is handy for when you are debugging.
Packit d0f5c2
Packit d0f5c2
=head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF
Packit d0f5c2
Packit d0f5c2
=over 2
Packit d0f5c2
Packit d0f5c2
=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
Packit d0f5c2
Packit d0f5c2
=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
Packit d0f5c2
Packit d0f5c2
=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
For encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==>
Packit d0f5c2
C<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode.
Packit d0f5c2
Packit d0f5c2
When you decode, C<\xI<HH>> is inserted for a malformed character, where
Packit d0f5c2
I<HH> is the hex representation of the octet that could not be decoded to
Packit d0f5c2
utf8.  When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is
Packit d0f5c2
the Unicode code point (in any number of hex digits) of the character that
Packit d0f5c2
cannot be found in the character repertoire of the encoding.
Packit d0f5c2
Packit d0f5c2
The HTML/XML character reference modes are about the same. In place of
Packit d0f5c2
C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and
Packit d0f5c2
XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
Packit d0f5c2
Packit d0f5c2
In C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied.
Packit d0f5c2
Packit d0f5c2
=head3 The bitmask
Packit d0f5c2
Packit d0f5c2
These modes are all actually set via a bitmask.  Here is how the C<FB_I<XXX>>
Packit d0f5c2
constants are laid out.  You can import the C<FB_I<XXX>> constants via
Packit d0f5c2
C<use Encode qw(:fallbacks)>, and you can import the generic bitmask
Packit d0f5c2
constants via C<use Encode qw(:fallback_all)>.
Packit d0f5c2
Packit d0f5c2
                     FB_DEFAULT FB_CROAK FB_QUIET FB_WARN  FB_PERLQQ
Packit d0f5c2
 DIE_ON_ERR    0x0001             X
Packit d0f5c2
 WARN_ON_ERR   0x0002                               X
Packit d0f5c2
 RETURN_ON_ERR 0x0004                      X        X
Packit d0f5c2
 LEAVE_SRC     0x0008                                        X
Packit d0f5c2
 PERLQQ        0x0100                                        X
Packit d0f5c2
 HTMLCREF      0x0200
Packit d0f5c2
 XMLCREF       0x0400
Packit d0f5c2
Packit d0f5c2
=head3 LEAVE_SRC
Packit d0f5c2
Packit d0f5c2
  Encode::LEAVE_SRC
Packit d0f5c2
Packit d0f5c2
If the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the
Packit d0f5c2
source string to encode() or decode() will be overwritten in place.
Packit d0f5c2
If you do not want the source overwritten, bitwise-OR C<Encode::LEAVE_SRC> into I<CHECK>.
Packit d0f5c2
Packit d0f5c2
=head2 coderef for CHECK
Packit d0f5c2
Packit d0f5c2
As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the
Packit d0f5c2
ordinal value of the unmapped character as an argument and returns
Packit d0f5c2
octets that represent the fallback character.  For instance:
Packit d0f5c2
Packit d0f5c2
  $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
Packit d0f5c2
Packit d0f5c2
Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>.
Packit d0f5c2
Packit d0f5c2
Fallback for C<decode> must return decoded string (sequence of characters)
Packit d0f5c2
and takes a list of ordinal values as its arguments. So for
Packit d0f5c2
example if you wish to decode octets as UTF-8, and use ISO-8859-15 as
Packit d0f5c2
a fallback for bytes that are not valid UTF-8, you could write
Packit d0f5c2
Packit d0f5c2
    $str = decode 'UTF-8', $octets, sub {
Packit d0f5c2
        my $tmp = join '', map chr, @_;
Packit d0f5c2
        return decode 'ISO-8859-15', $tmp;
Packit d0f5c2
    };
Packit d0f5c2
Packit d0f5c2
=head1 Defining Encodings
Packit d0f5c2
Packit d0f5c2
To define a new encoding, use:
Packit d0f5c2
Packit d0f5c2
    use Encode qw(define_encoding);
Packit d0f5c2
    define_encoding($object, CANONICAL_NAME [, alias...]);
Packit d0f5c2
Packit d0f5c2
I<CANONICAL_NAME> will be associated with I<$object>.  The object
Packit d0f5c2
should provide the interface described in L<Encode::Encoding>.
Packit d0f5c2
If more than two arguments are provided, additional
Packit d0f5c2
arguments are considered aliases for I<$object>.
Packit d0f5c2
Packit d0f5c2
See L<Encode::Encoding> for details.
Packit d0f5c2
Packit d0f5c2
=head1 The UTF8 flag
Packit d0f5c2
Packit d0f5c2
Before the introduction of Unicode support in Perl, the C<eq> operator
Packit d0f5c2
just compared the strings represented by two scalars. Beginning with
Packit d0f5c2
Perl 5.8, C<eq> compares two strings with simultaneous consideration of
Packit d0f5c2
I<the UTF8 flag>. To explain why we made it so, I quote from page 402 of
Packit d0f5c2
I<Programming Perl, 3rd ed.>
Packit d0f5c2
Packit d0f5c2
=over 2
Packit d0f5c2
Packit d0f5c2
=item Goal #1:
Packit d0f5c2
Packit d0f5c2
Old byte-oriented programs should not spontaneously break on the old
Packit d0f5c2
byte-oriented data they used to work on.
Packit d0f5c2
Packit d0f5c2
=item Goal #2:
Packit d0f5c2
Packit d0f5c2
Old byte-oriented programs should magically start working on the new
Packit d0f5c2
character-oriented data when appropriate.
Packit d0f5c2
Packit d0f5c2
=item Goal #3:
Packit d0f5c2
Packit d0f5c2
Programs should run just as fast in the new character-oriented mode
Packit d0f5c2
as in the old byte-oriented mode.
Packit d0f5c2
Packit d0f5c2
=item Goal #4:
Packit d0f5c2
Packit d0f5c2
Perl should remain one language, rather than forking into a
Packit d0f5c2
byte-oriented Perl and a character-oriented Perl.
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
When I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been
Packit d0f5c2
born yet, and many features documented in the book remained unimplemented for a
Packit d0f5c2
long time.  Perl 5.8 corrected much of this, and the introduction of the
Packit d0f5c2
UTF8 flag is one of them.  You can think of there being two fundamentally
Packit d0f5c2
different kinds of strings and string-operations in Perl: one a
Packit d0f5c2
byte-oriented mode  for when the internal UTF8 flag is off, and the other a
Packit d0f5c2
character-oriented mode for when the internal UTF8 flag is on.
Packit d0f5c2
Packit d0f5c2
This UTF8 flag is not visible in Perl scripts, exactly for the same reason
Packit d0f5c2
you cannot (or rather, you I<don't have to>) see whether a scalar contains
Packit d0f5c2
a string, an integer, or a floating-point number.   But you can still peek
Packit d0f5c2
and poke these if you will.  See the next section.
Packit d0f5c2
Packit d0f5c2
=head2 Messing with Perl's Internals
Packit d0f5c2
Packit d0f5c2
The following API uses parts of Perl's internals in the current
Packit d0f5c2
implementation.  As such, they are efficient but may change in a future
Packit d0f5c2
release.
Packit d0f5c2
Packit d0f5c2
=head3 is_utf8
Packit d0f5c2
Packit d0f5c2
  is_utf8(STRING [, CHECK])
Packit d0f5c2
Packit d0f5c2
[INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>.
Packit d0f5c2
If I<CHECK> is true, also checks whether I<STRING> contains well-formed
Packit d0f5c2
UTF-8.  Returns true if successful, false otherwise.
Packit d0f5c2
Packit d0f5c2
Typically only necessary for debugging and testing.  Don't use this flag as
Packit d0f5c2
a marker to distinguish character and binary data; that should be decided
Packit d0f5c2
for each variable when you write your code.
Packit d0f5c2
Packit d0f5c2
B<CAVEAT>: If I<STRING> has UTF8 flag set, it does B<NOT> mean that
Packit d0f5c2
I<STRING> is UTF-8 encoded and vice-versa.
Packit d0f5c2
Packit d0f5c2
As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function.
Packit d0f5c2
Packit d0f5c2
=head3 _utf8_on
Packit d0f5c2
Packit d0f5c2
  _utf8_on(STRING)
Packit d0f5c2
Packit d0f5c2
[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>.  The I<STRING>
Packit d0f5c2
is I<not> checked for containing only well-formed UTF-8.  Do not use this
Packit d0f5c2
unless you I<know with absolute certainty> that the STRING holds only
Packit d0f5c2
well-formed UTF-8.  Returns the previous state of the UTF8 flag (so please
Packit d0f5c2
don't treat the return value as indicating success or failure), or C<undef>
Packit d0f5c2
if I<STRING> is not a string.
Packit d0f5c2
Packit d0f5c2
B<NOTE>: For security reasons, this function does not work on tainted values.
Packit d0f5c2
Packit d0f5c2
=head3 _utf8_off
Packit d0f5c2
Packit d0f5c2
  _utf8_off(STRING)
Packit d0f5c2
Packit d0f5c2
[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>.  Do not use
Packit d0f5c2
frivolously.  Returns the previous state of the UTF8 flag, or C<undef> if
Packit d0f5c2
I<STRING> is not a string.  Do not treat the return value as indicative of
Packit d0f5c2
success or failure, because that isn't what it means: it is only the
Packit d0f5c2
previous setting.
Packit d0f5c2
Packit d0f5c2
B<NOTE>: For security reasons, this function does not work on tainted values.
Packit d0f5c2
Packit d0f5c2
=head1 UTF-8 vs. utf8 vs. UTF8
Packit d0f5c2
Packit d0f5c2
  ....We now view strings not as sequences of bytes, but as sequences
Packit d0f5c2
  of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
Packit d0f5c2
  computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
Packit d0f5c2
Packit d0f5c2
That has historically been Perl's notion of UTF-8, as that is how UTF-8 was
Packit d0f5c2
first conceived by Ken Thompson when he invented it. However, thanks to
Packit d0f5c2
later revisions to the applicable standards, official UTF-8 is now rather
Packit d0f5c2
stricter than that. For example, its range is much narrower (0 .. 0x10_FFFF
Packit d0f5c2
to cover only 21 bits instead of 32 or 64 bits) and some sequences
Packit d0f5c2
are not allowed, like those used in surrogate pairs, the 32 non-character
Packit d0f5c2
code points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane
Packit d0f5c2
(0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc.
Packit d0f5c2
Packit d0f5c2
The former default in which Perl would always use a loose interpretation of
Packit d0f5c2
UTF-8 has now been overruled:
Packit d0f5c2
Packit d0f5c2
  From: Larry Wall <larry@wall.org>
Packit d0f5c2
  Date: December 04, 2004 11:51:58 JST
Packit d0f5c2
  To: perl-unicode@perl.org
Packit d0f5c2
  Subject: Re: Make Encode.pm support the real UTF-8
Packit d0f5c2
  Message-Id: <20041204025158.GA28754@wall.org>
Packit d0f5c2
Packit d0f5c2
  On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
Packit d0f5c2
  : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
Packit d0f5c2
  : but "UTF-8" is the name of the standard and should give the
Packit d0f5c2
  : corresponding behaviour.
Packit d0f5c2
Packit d0f5c2
  For what it's worth, that's how I've always kept them straight in my
Packit d0f5c2
  head.
Packit d0f5c2
Packit d0f5c2
  Also for what it's worth, Perl 6 will mostly default to strict but
Packit d0f5c2
  make it easy to switch back to lax.
Packit d0f5c2
Packit d0f5c2
  Larry
Packit d0f5c2
Packit d0f5c2
Got that?  As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current
Packit d0f5c2
sense, which is conservative and strict and security-conscious, whereas
Packit d0f5c2
B<"utf8"> means UTF-8 in its former sense, which was liberal and loose and
Packit d0f5c2
lax.  C<Encode> version 2.10 or later thus groks this subtle but critically
Packit d0f5c2
important distinction between C<"UTF-8"> and C<"utf8">.
Packit d0f5c2
Packit d0f5c2
  encode("utf8",  "\x{FFFF_FFFF}", 1); # okay
Packit d0f5c2
  encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
Packit d0f5c2
Packit d0f5c2
In the C<Encode> module, C<"UTF-8"> is actually an alias for the canonical name
Packit d0f5c2
C<"utf-8-strict">.  That hyphen between the C<"UTF"> and the C<"8"> is
Packit d0f5c2
critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive:
Packit d0f5c2
Packit d0f5c2
  find_encoding("UTF-8")->name # is 'utf-8-strict'
Packit d0f5c2
  find_encoding("utf-8")->name # ditto. names are case insensitive
Packit d0f5c2
  find_encoding("utf_8")->name # ditto. "_" are treated as "-"
Packit d0f5c2
  find_encoding("UTF8")->name  # is 'utf8'.
Packit d0f5c2
Packit d0f5c2
Perl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates
Packit d0f5c2
whether a string is internally encoded as "utf8", also without a hyphen.
Packit d0f5c2
Packit d0f5c2
=head1 SEE ALSO
Packit d0f5c2
Packit d0f5c2
L<Encode::Encoding>,
Packit d0f5c2
L<Encode::Supported>,
Packit d0f5c2
L<Encode::PerlIO>,
Packit d0f5c2
L<encoding>,
Packit d0f5c2
L<perlebcdic>,
Packit d0f5c2
L<perlfunc/open>,
Packit d0f5c2
L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>,
Packit d0f5c2
L<utf8>,
Packit d0f5c2
the Perl Unicode Mailing List L<http://lists.perl.org/list/perl-unicode.html>
Packit d0f5c2
Packit d0f5c2
=head1 MAINTAINER
Packit d0f5c2
Packit d0f5c2
This project was originated by the late Nick Ing-Simmons and later
Packit d0f5c2
maintained by Dan Kogai I<< <dankogai@cpan.org> >>.  See AUTHORS
Packit d0f5c2
for a full list of people involved.  For any questions, send mail to
Packit d0f5c2
I<< <perl-unicode@perl.org> >> so that we can all share.
Packit d0f5c2
Packit d0f5c2
While Dan Kogai retains the copyright as a maintainer, credit
Packit d0f5c2
should go to all those involved.  See AUTHORS for a list of those
Packit d0f5c2
who submitted code to the project.
Packit d0f5c2
Packit d0f5c2
=head1 COPYRIGHT
Packit d0f5c2
Packit d0f5c2
Copyright 2002-2014 Dan Kogai I<< <dankogai@cpan.org> >>.
Packit d0f5c2
Packit d0f5c2
This library is free software; you can redistribute it and/or modify
Packit d0f5c2
it under the same terms as Perl itself.
Packit d0f5c2
Packit d0f5c2
=cut