|
Packit |
d0f5c2 |
package Encode::Encoding;
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
# Base class for classes which implement encodings
|
|
Packit |
d0f5c2 |
use strict;
|
|
Packit |
d0f5c2 |
use warnings;
|
|
Packit |
d0f5c2 |
# Build the module version from the RCS "Revision" keyword: every run of
# digits is extracted, the first is printed as-is and each following one is
# zero-padded to two digits (so "2.8" becomes "2.08").
our $VERSION = do { my @r = ( q$Revision: 2.8 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
# Packages listed here are consulted by Carp when it decides which caller
# frame to blame, so that croak() messages raised below are attributed to
# user code rather than to Encode internals (see the Carp documentation).
our @CARP_NOT = qw(Encode Encode::Encoder);
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
use Carp ();
|
|
Packit |
d0f5c2 |
use Encode ();
|
|
Packit |
d0f5c2 |
use Encode::MIME::Name;
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
# Compile-time debug switch: a boolean taken from the PERL_ENCODE_DEBUG
# environment variable as it is when this module is compiled.
use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
# Register an encoding with Encode under its canonical name.
#
# Arguments:
#   $obj        - a class name or an already-constructed encoding object.
#                 When it is not a reference, a new hash-based object with
#                 its 'Name' key set to $canonical is blessed into that class.
#   $canonical  - the canonical encoding name.
#   @_          - any remaining arguments are handed on unchanged to
#                 Encode::define_encoding (see the SYNOPSIS, where they are
#                 aliases).
#
# Returns whatever Encode::define_encoding returns.
sub Define {
    my $obj       = shift;
    my $canonical = shift;

    # Class-method call: build the object on the caller's behalf.
    $obj = bless { Name => $canonical }, $obj unless ref $obj;

    # warn "$canonical => $obj\n";
    return Encode::define_encoding( $obj, $canonical, @_ );
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
# Accessor for the canonical encoding name kept under the object's 'Name' key.
sub name {
    my ($self) = @_;
    return $self->{'Name'};
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
# Return the MIME name for this encoding by passing the object's name()
# to Encode::MIME::Name::get_mime_name.
sub mime_name {
    my $self = shift;
    return Encode::MIME::Name::get_mime_name( $self->name );
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
# Return a shallow copy of the encoding object, blessed into the same class,
# with its 'renewed' counter incremented. The original object is not modified.
sub renew {
    my $self  = shift;
    my $clone = bless {%$self} => ref($self);
    $clone->{renewed}++;    # so the caller can see it

    # DEBUG is the file-level constant; when true, emit the new counter value.
    DEBUG and warn $clone->{renewed};
    return $clone;
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
# Report the object's 'renewed' counter, or 0 when the counter is unset
# or otherwise false.
sub renewed {
    my $count = $_[0]->{renewed};
    return $count ? $count : 0;
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
# new_sequence is installed as a second name for renew (same code reference).
*new_sequence = \&renew;
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
# Default answer is false (0); subclasses that need a different answer
# override this method.
sub needs_lines {
    return 0;
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
# Return 1 when PerlIO::encoding can be loaded, 0 otherwise. Any load
# failure is trapped by the eval and reported as 0 rather than propagated.
sub perlio_ok {
    my $loaded = eval { require PerlIO::encoding };
    return $loaded ? 1 : 0;
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
# (Temporary|legacy) methods
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
# Legacy name: forwards to the object's decode method with the same arguments.
sub toUnicode {
    my $self = shift;
    return $self->decode(@_);
}
|
|
Packit |
d0f5c2 |
# Legacy name: forwards to the object's encode method with the same arguments.
sub fromUnicode {
    my $self = shift;
    return $self->encode(@_);
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
#
|
|
Packit |
d0f5c2 |
# Needs to be overloaded or just croak
|
|
Packit |
d0f5c2 |
#
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
# Placeholder for encode(): always croaks with "<class>->encode() not defined!",
# where <class> is the invocant's class (or the invocant itself when it is
# not a reference). Subclasses are expected to supply a real implementation.
sub encode {
    my $obj   = shift;
    my $class = ref($obj) || $obj;
    Carp::croak( $class . "->encode() not defined!" );
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
# Placeholder for decode(): always croaks with "<class>->decode() not defined!",
# where <class> is the invocant's class (or the invocant itself when it is
# not a reference). Subclasses are expected to supply a real implementation.
sub decode {
    my $obj   = shift;
    my $class = ref($obj) || $obj;

    # The message names decode(), the method that is actually missing.
    Carp::croak( $class . "->decode() not defined!" );
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
# Intentionally empty destructor: nothing to clean up in the base class.
sub DESTROY {
    return;
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
1;
|
|
Packit |
d0f5c2 |
__END__
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=head1 NAME
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Encode::Encoding - Encode Implementation Base Class
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=head1 SYNOPSIS
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
package Encode::MyEncoding;
|
|
Packit |
d0f5c2 |
use parent qw(Encode::Encoding);
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
__PACKAGE__->Define(qw(myCanonical myAlias));
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=head1 DESCRIPTION
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
As mentioned in L<Encode>, encodings are (in the current
|
|
Packit |
d0f5c2 |
implementation at least) defined as objects. The mapping of encoding
|
|
Packit |
d0f5c2 |
name to object is via the C<%Encode::Encoding> hash. Though you can
|
|
Packit |
d0f5c2 |
directly manipulate this hash, it is strongly encouraged to use this
|
|
Packit |
d0f5c2 |
base class module and add encode() and decode() methods.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=head2 Methods you should implement
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
You are strongly encouraged to implement methods below, at least
|
|
Packit |
d0f5c2 |
either encode() or decode().
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=over 4
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item -E<gt>encode($string [,$check])
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
MUST return the octet sequence representing I<$string>.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=over 2
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item *
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
If I<$check> is true, it SHOULD modify I<$string> in place to remove
|
|
Packit |
d0f5c2 |
the converted part (i.e. the whole string unless there is an error).
|
|
Packit |
d0f5c2 |
If perlio_ok() is true, SHOULD becomes MUST.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item *
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
If an error occurs, it SHOULD return the octet sequence for the
|
|
Packit |
d0f5c2 |
fragment of string that has been converted and modify $string in-place
|
|
Packit |
d0f5c2 |
to remove the converted part leaving it starting with the problem
|
|
Packit |
d0f5c2 |
fragment. If perlio_ok() is true, SHOULD becomes MUST.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item *
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
If I<$check> is false then C<encode> MUST make a "best effort" to
|
|
Packit |
d0f5c2 |
convert the string - for example, by using a replacement character.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=back
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item -E<gt>decode($octets [,$check])
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
MUST return the string that I<$octets> represents.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=over 2
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item *
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
If I<$check> is true, it SHOULD modify I<$octets> in place to remove
|
|
Packit |
d0f5c2 |
the converted part (i.e. the whole sequence unless there is an
|
|
Packit |
d0f5c2 |
error). If perlio_ok() is true, SHOULD becomes MUST.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item *
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
If an error occurs, it SHOULD return the fragment of string that has
|
|
Packit |
d0f5c2 |
been converted and modify $octets in-place to remove the converted
|
|
Packit |
d0f5c2 |
part leaving it starting with the problem fragment. If perlio_ok() is
|
|
Packit |
d0f5c2 |
true, SHOULD becomes MUST.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item *
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
If I<$check> is false then C<decode> should make a "best effort" to
|
|
Packit |
d0f5c2 |
convert the string - for example by using Unicode's "\x{FFFD}" as a
|
|
Packit |
d0f5c2 |
replacement character.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=back
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=back
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
If you want your encoding to work with L<encoding> pragma, you should
|
|
Packit |
d0f5c2 |
also implement the method below.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=over 4
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item -E<gt>cat_decode($destination, $octets, $offset, $terminator [,$check])
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
MUST decode I<$octets> with I<$offset> and concatenate it to I<$destination>.
|
|
Packit |
d0f5c2 |
Decoding will terminate when $terminator (a string) appears in output.
|
|
Packit |
d0f5c2 |
I<$offset> will be modified to the last $octets position at end of decode.
|
|
Packit |
d0f5c2 |
Returns true if $terminator appears in the output, else returns false.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=back
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=head2 Other methods defined in Encode::Encoding
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
You do not need to override the methods shown below unless your encoding requires different behaviour.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=over 4
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item -E<gt>name
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Predefined As:
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
sub name { return shift->{'Name'} }
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
MUST return the string representing the canonical name of the encoding.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item -E<gt>mime_name
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Predefined As:
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
sub mime_name{
|
|
Packit |
d0f5c2 |
return Encode::MIME::Name::get_mime_name(shift->name);
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
MUST return the string representing the IANA charset name of the encoding.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item -E<gt>renew
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Predefined As:
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
sub renew {
|
|
Packit |
d0f5c2 |
my $self = shift;
|
|
Packit |
d0f5c2 |
my $clone = bless { %$self } => ref($self);
|
|
Packit |
d0f5c2 |
$clone->{renewed}++;
|
|
Packit |
d0f5c2 |
return $clone;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
This method reconstructs the encoding object if necessary. If you need
|
|
Packit |
d0f5c2 |
to store the state during encoding, this is where you clone your object.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
PerlIO ALWAYS calls this method to make sure it has its own private
|
|
Packit |
d0f5c2 |
encoding object.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item -E<gt>renewed
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Predefined As:
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
sub renewed { $_[0]->{renewed} || 0 }
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Tells whether the object is renewed (and how many times). Some
|
|
Packit |
d0f5c2 |
modules emit C<Use of uninitialized value in null operation> warning
|
|
Packit |
d0f5c2 |
unless the value is numeric so return 0 for false.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item -E<gt>perlio_ok()
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Predefined As:
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
sub perlio_ok {
|
|
Packit |
d0f5c2 |
return eval { require PerlIO::encoding } ? 1 : 0;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
If your encoding does not support PerlIO for some reason, just define:
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
sub perlio_ok { 0 }
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item -E<gt>needs_lines()
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Predefined As:
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
sub needs_lines { 0 };
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
If your encoding can work with PerlIO but needs line buffering, you
|
|
Packit |
d0f5c2 |
MUST define this method so it returns true. 7bit ISO-2022 encodings
|
|
Packit |
d0f5c2 |
are one example that needs this. When this method is missing, false
|
|
Packit |
d0f5c2 |
is assumed.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=back
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=head2 Example: Encode::ROT13
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
package Encode::ROT13;
|
|
Packit |
d0f5c2 |
use strict;
|
|
Packit |
d0f5c2 |
use parent qw(Encode::Encoding);
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
__PACKAGE__->Define('rot13');
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
sub encode($$;$){
|
|
Packit |
d0f5c2 |
my ($obj, $str, $chk) = @_;
|
|
Packit |
d0f5c2 |
$str =~ tr/A-Za-z/N-ZA-Mn-za-m/;
|
|
Packit |
d0f5c2 |
$_[1] = '' if $chk; # this is what in-place edit means
|
|
Packit |
d0f5c2 |
return $str;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
# Jr pna or ynml yvxr guvf;
|
|
Packit |
d0f5c2 |
*decode = \&encode;
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
1;
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=head1 Why the heck is the Encode API different?
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
It should be noted that the I<$check> behaviour is different from the
|
|
Packit |
d0f5c2 |
outer public API. The logic is that the "unchecked" case is useful
|
|
Packit |
d0f5c2 |
when the encoding is part of a stream which may be reporting errors
|
|
Packit |
d0f5c2 |
(e.g. STDERR). In such cases, it is desirable to get everything
|
|
Packit |
d0f5c2 |
through somehow without causing additional errors which obscure the
|
|
Packit |
d0f5c2 |
original one. Also, the encoding is best placed to know what the
|
|
Packit |
d0f5c2 |
correct replacement character is, so if that is the desired behaviour
|
|
Packit |
d0f5c2 |
then letting low level code do it is the most efficient.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
By contrast, if I<$check> is true, the scheme above allows the
|
|
Packit |
d0f5c2 |
encoding to do as much as it can and tell the layer above how much
|
|
Packit |
d0f5c2 |
that was. What is lacking at present is a mechanism to report what
|
|
Packit |
d0f5c2 |
went wrong. The most likely interface will be an additional method
|
|
Packit |
d0f5c2 |
call to the object, or perhaps (to avoid forcing per-stream objects
|
|
Packit |
d0f5c2 |
on otherwise stateless encodings) an additional parameter.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
It is also highly desirable that encoding classes inherit from
|
|
Packit |
d0f5c2 |
C<Encode::Encoding> as a base class. This allows that class to define
|
|
Packit |
d0f5c2 |
additional behaviour for all encoding objects.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
package Encode::MyEncoding;
|
|
Packit |
d0f5c2 |
use parent qw(Encode::Encoding);
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
__PACKAGE__->Define(qw(myCanonical myAlias));
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
to create an object with C<< bless {Name => ...}, $class >>, and call
|
|
Packit |
d0f5c2 |
define_encoding. They inherit their C<name> method from
|
|
Packit |
d0f5c2 |
C<Encode::Encoding>.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=head2 Compiled Encodings
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
For the sake of speed and efficiency, most of the encodings are now
|
|
Packit |
d0f5c2 |
supported via a I<compiled form>: XS modules generated from UCM
|
|
Packit |
d0f5c2 |
files. Encode provides the enc2xs tool to achieve that. Please see
|
|
Packit |
d0f5c2 |
L<enc2xs> for more details.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=head1 SEE ALSO
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
L<perlmod>, L<enc2xs>
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=begin future
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=over 4
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item Scheme 1
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
The fixup routine gets passed the remaining fragment of string being
|
|
Packit |
d0f5c2 |
processed. It modifies it in place to remove bytes/characters it can
|
|
Packit |
d0f5c2 |
understand and returns a string used to represent them. For example:
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
sub fixup {
|
|
Packit |
d0f5c2 |
my $ch = substr($_[0],0,1,'');
|
|
Packit |
d0f5c2 |
return sprintf("\x{%02X}",ord($ch));
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
This scheme is close to how the underlying C code for Encode works,
|
|
Packit |
d0f5c2 |
but gives the fixup routine very little context.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item Scheme 2
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
The fixup routine gets passed the original string, an index into
|
|
Packit |
d0f5c2 |
it of the problem area, and the output string so far. It appends
|
|
Packit |
d0f5c2 |
what it wants to the output string and returns a new index into the
|
|
Packit |
d0f5c2 |
original string. For example:
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
sub fixup {
|
|
Packit |
d0f5c2 |
# my ($s,$i,$d) = @_;
|
|
Packit |
d0f5c2 |
my $ch = substr($_[0],$_[1],1);
|
|
Packit |
d0f5c2 |
$_[2] .= sprintf("\x{%02X}",ord($ch));
|
|
Packit |
d0f5c2 |
return $_[1]+1;
|
|
Packit |
d0f5c2 |
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
This scheme gives maximal control to the fixup routine but is more
|
|
Packit |
d0f5c2 |
complicated to code, and may require that the internals of Encode be tweaked to
|
|
Packit |
d0f5c2 |
keep the original string intact.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item Other Schemes
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Hybrids of the above.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Multiple return values rather than in-place modifications.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Index into the string could be C<pos($str)> allowing C<s/\G...//>.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=back
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=end future
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=cut
|