|
Packit |
d0f5c2 |
package Encode::Unicode;

use strict;
use warnings;

# Derive the module version from the RCS "Revision" keyword: every run of
# digits is extracted, the first is printed as-is and each following one is
# zero-padded to two places (so "2.17" yields "2.17").
our $VERSION = do { my @r = ( q$Revision: 2.17 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };

# Load the compiled (XS) part of this module, checked against $VERSION.
use XSLoader;
XSLoader::load( __PACKAGE__, $VERSION );

#
# Object Generator 8 transcoders all at once!
#

use Encode ();

# Encodings whose name carries no BE/LE suffix.  renew() consults this
# table to decide whether an encoding object has to be cloned.
our %BOM_Unknown = map { $_ => 1 } qw(UTF-16 UTF-32);
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
# Build one encoding object per supported name and register it with Encode.
#
# Every name has the shape FAMILY-BITS[ORDER], e.g. "UTF-16LE" or "UCS-2BE".
# The attributes stored on each object are:
#   Name   - the encoding name itself
#   size   - bytes per code unit (2 or 4)
#   endian - 'n'/'v' for 16-bit BE/LE, 'N'/'V' for 32-bit BE/LE, '' when the
#            name carries no byte order (these are the letters pack() uses
#            for the same layouts)
#   ucs2   - true for the UCS-2 variants, false otherwise
for my $name (
    qw(UTF-16 UTF-16BE UTF-16LE
    UTF-32 UTF-32BE UTF-32LE
    UCS-2BE UCS-2LE)
  )
{
    # Copy the captures into lexicals at once and fail loudly on a name that
    # does not parse; $1..$3 would otherwise keep the values of the previous
    # successful match.
    my ( $family, $bits, $order ) = $name =~ /^(\w+)-(\d+)(\w*)$/
      or die "Cannot parse encoding name '$name'";

    my $ucs2 = ( $family eq 'UCS' );

    # For UCS-2 the number in the name is a byte count, not a bit count.
    my $size = $ucs2 ? 2 : $bits / 8;

    my $endian = ( $order eq 'BE' ) ? 'n' : ( $order eq 'LE' ) ? 'v' : '';
    $size == 4 and $endian = uc($endian);    # 32-bit variants use N / V

    my $obj = bless {
        Name   => $name,
        size   => $size,
        endian => $endian,
        ucs2   => $ucs2,
    } => __PACKAGE__;
    Encode::define_encoding( $obj, $name );
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
use parent qw(Encode::Encoding);
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
# renew()
#
# Returns the object to use for a new encode/decode session.
#
# If this encoding's name is not listed in %BOM_Unknown, the object itself
# is returned.  Otherwise a shallow copy, blessed into the same class, is
# returned with its "renewed" counter incremented.
# NOTE(review): presumably the copy exists so that the byte order detected
# from a BOM is kept per caller and not on the shared object -- confirm
# against the XS code.
sub renew {
    my $self = shift;

    # Names with an explicit byte order are not in the table: hand back the
    # same object.
    $BOM_Unknown{ $self->name } or return $self;

    my $clone = bless {%$self} => ref($self);
    $clone->{renewed}++;    # so the caller knows it is renewed.
    return $clone;
}
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
1;
|
|
Packit |
d0f5c2 |
__END__
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=head1 NAME
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Encode::Unicode -- Various Unicode Transformation Formats
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=cut
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=head1 SYNOPSIS
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
use Encode qw/encode decode/;
|
|
Packit |
d0f5c2 |
$ucs2 = encode("UCS-2BE", $utf8);
|
|
Packit |
d0f5c2 |
$utf8 = decode("UCS-2BE", $ucs2);
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=head1 ABSTRACT
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
This module implements all Character Encoding Schemes of Unicode that
|
|
Packit |
d0f5c2 |
are officially documented by Unicode Consortium (except, of course,
|
|
Packit |
d0f5c2 |
for UTF-8, which is a native format in perl).
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=over 4
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item L<http://www.unicode.org/glossary/> says:
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
I<Character Encoding Scheme> A character encoding form plus byte
|
|
Packit |
d0f5c2 |
serialization. There are Seven character encoding schemes in Unicode:
|
|
Packit |
d0f5c2 |
UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32 (UCS-4), UTF-32BE (UCS-4BE) and
|
|
Packit |
d0f5c2 |
UTF-32LE (UCS-4LE), and UTF-7.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Since UTF-7 is a 7-bit (re)encoded version of UTF-16BE, it is not part of
|
|
Packit |
d0f5c2 |
Unicode's Character Encoding Scheme. It is separately implemented in
|
|
Packit |
d0f5c2 |
Encode::Unicode::UTF7. For details see L<Encode::Unicode::UTF7>.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item Quick Reference
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
                Decodes from ord(N)           Encodes chr(N) to...
       octet/char BOM S.P d800-dfff  ord > 0xffff     \x{1abcd} ==
  ---------------+-----------------+------------------------------
  UCS-2BE       2   N   N  is bogus                  Not Available
  UCS-2LE       2   N   N     bogus                  Not Available
  UTF-16      2/4   Y   Y  is   S.P           S.P            BE/LE
  UTF-16BE    2/4   N   Y       S.P           S.P    0xd82a,0xdfcd
  UTF-16LE    2/4   N   Y       S.P           S.P    0x2ad8,0xcddf
  UTF-32        4   Y   -  is bogus         As is            BE/LE
  UTF-32BE      4   N   -     bogus         As is       0x0001abcd
  UTF-32LE      4   N   -     bogus         As is       0xcdab0100
  UTF-8       1-4   -   -     bogus   >= 4 octets \xf0\x9a\xaf\x8d
  ---------------+-----------------+------------------------------
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=back
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=head1 Size, Endianness, and BOM
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
You can categorize these CES by 3 criteria: size of each character,
|
|
Packit |
d0f5c2 |
endianness, and Byte Order Mark.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=head2 by size
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
UCS-2 is a fixed-length encoding with each character taking 16 bits.
|
|
Packit |
d0f5c2 |
It B<does not> support I<surrogate pairs>. When a surrogate pair
|
|
Packit |
d0f5c2 |
is encountered during decode(), its place is filled with \x{FFFD}
|
|
Packit |
d0f5c2 |
if I<CHECK> is 0, or the routine croaks if I<CHECK> is 1. When a
|
|
Packit |
d0f5c2 |
character whose ord value is larger than 0xFFFF is encountered,
|
|
Packit |
d0f5c2 |
its place is filled with \x{FFFD} if I<CHECK> is 0, or the routine
|
|
Packit |
d0f5c2 |
croaks if I<CHECK> is 1.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
UTF-16 is almost the same as UCS-2 but it supports I<surrogate pairs>.
|
|
Packit |
d0f5c2 |
When it encounters a high surrogate (0xD800-0xDBFF), it fetches the
|
|
Packit |
d0f5c2 |
following low surrogate (0xDC00-0xDFFF) and C<desurrogate>s them to
|
|
Packit |
d0f5c2 |
form a character. Bogus surrogates result in death. When \x{10000}
|
|
Packit |
d0f5c2 |
or above is encountered during encode(), it C<ensurrogate>s them and
|
|
Packit |
d0f5c2 |
pushes the surrogate pair to the output stream.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
UTF-32 (UCS-4) is a fixed-length encoding with each character taking 32 bits.
|
|
Packit |
d0f5c2 |
Since it is 32-bit, there is no need for I<surrogate pairs>.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=head2 by endianness
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
The first (and now failed) goal of Unicode was to map all character
|
|
Packit |
d0f5c2 |
repertoires into a fixed-length integer so that programmers are happy.
|
|
Packit |
d0f5c2 |
Since each character is either a I<short> or I<long> in C, you have to
|
|
Packit |
d0f5c2 |
pay attention to the endianness of each platform when you pass data
|
|
Packit |
d0f5c2 |
to one another.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Anything marked as BE is Big Endian (or network byte order) and LE is
|
|
Packit |
d0f5c2 |
Little Endian (aka VAX byte order). For anything not marked either
|
|
Packit |
d0f5c2 |
BE or LE, a character called Byte Order Mark (BOM) indicating the
|
|
Packit |
d0f5c2 |
endianness is prepended to the string.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
CAVEAT: Though BOM in utf8 (\xEF\xBB\xBF) is valid, it is meaningless
|
|
Packit |
d0f5c2 |
and as of this writing the Encode suite just leaves it as is (\x{FeFF}).
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=over 4
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item BOM as integer when fetched in network byte order
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
              16         32 bits/char
  -------------------------
  BE      0xFeFF 0x0000FeFF
  LE      0xFFFe 0xFFFe0000
  -------------------------
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=back
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
This module handles the BOM as follows.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=over 4
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item *
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
When BE or LE is explicitly stated as the name of encoding, BOM is
|
|
Packit |
d0f5c2 |
simply treated as a normal character (ZERO WIDTH NO-BREAK SPACE).
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item *
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
When BE or LE is omitted during decode(), it checks if BOM is at the
|
|
Packit |
d0f5c2 |
beginning of the string; if one is found, the endianness is set to
|
|
Packit |
d0f5c2 |
what the BOM says.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item *
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Default Byte Order
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
When no BOM is found, Encode 2.76 and below croaked. Since Encode
|
|
Packit |
d0f5c2 |
2.77, it falls back to BE in accordance with RFC 2781 and the Unicode
|
|
Packit |
d0f5c2 |
Standard version 8.0
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item *
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
When BE or LE is omitted during encode(), it returns a BE-encoded
|
|
Packit |
d0f5c2 |
string with BOM prepended. So when you want to encode a whole text
|
|
Packit |
d0f5c2 |
file, make sure you encode() the whole text at once, not line by line
|
|
Packit |
d0f5c2 |
or each line, not file, will have a BOM prepended.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=item *
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
C<UCS-2> is an exception. Unlike others, this is an alias of UCS-2BE.
|
|
Packit |
d0f5c2 |
UCS-2 is already registered by IANA and others that way.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=back
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=head1 Surrogate Pairs
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
To say the least, surrogate pairs were the biggest mistake of the
|
|
Packit |
d0f5c2 |
Unicode Consortium. But according to the late Douglas Adams in I<The
|
|
Packit |
d0f5c2 |
Hitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the
|
|
Packit |
d0f5c2 |
Universe was created. This has made a lot of people very angry and
|
|
Packit |
d0f5c2 |
been widely regarded as a bad move>. Their mistake was not of this
|
|
Packit |
d0f5c2 |
magnitude so let's forgive them.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
(I don't dare make any comparison with Unicode Consortium and the
|
|
Packit |
d0f5c2 |
Vogons here ;) Or, comparing Encode to Babel Fish is completely
|
|
Packit |
d0f5c2 |
appropriate -- if you can only stick this into your ear :)
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Surrogate pairs were born when the Unicode Consortium finally
|
|
Packit |
d0f5c2 |
admitted that 16 bits were not big enough to hold all the world's
|
|
Packit |
d0f5c2 |
character repertoires. But they already made UCS-2 16-bit. What
|
|
Packit |
d0f5c2 |
do we do?
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Back then, the range 0xD800-0xDFFF was not allocated. Let's split
|
|
Packit |
d0f5c2 |
that range in half and use the first half to represent the C<upper
|
|
Packit |
d0f5c2 |
half of a character> and the second half to represent the C<lower
|
|
Packit |
d0f5c2 |
half of a character>. That way, you can represent 1024 * 1024 =
|
|
Packit |
d0f5c2 |
1048576 more characters. Now we can store character ranges up to
|
|
Packit |
d0f5c2 |
\x{10ffff} even with 16-bit encodings. This pair of half-characters is
|
|
Packit |
d0f5c2 |
now called a I<surrogate pair> and UTF-16 is the name of the encoding
|
|
Packit |
d0f5c2 |
that embraces them.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Here is a formula to ensurrogate a Unicode character \x{10000} and
|
|
Packit |
d0f5c2 |
above;
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
$hi = ($uni - 0x10000) / 0x400 + 0xD800;
|
|
Packit |
d0f5c2 |
$lo = ($uni - 0x10000) % 0x400 + 0xDC00;
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
And to desurrogate;
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
$uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but
|
|
Packit |
d0f5c2 |
perl does not prohibit the use of characters within this range. To perl,
|
|
Packit |
d0f5c2 |
every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
(*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
|
|
Packit |
d0f5c2 |
integer support!
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=head1 Error Checking
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Unlike most encodings which accept various ways to handle errors,
|
|
Packit |
d0f5c2 |
Unicode encodings simply croak.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
% perl -MEncode -e'$_ = "\xfe\xff\xd8\xd9\xda\xdb\0\n"' \
|
|
Packit |
d0f5c2 |
-e'Encode::from_to($_, "utf16","shift_jis", 0); print'
|
|
Packit |
d0f5c2 |
UTF-16:Malformed LO surrogate d8d9 at /path/to/Encode.pm line 184.
|
|
Packit |
d0f5c2 |
% perl -MEncode -e'$a = "BOM missing"' \
|
|
Packit |
d0f5c2 |
-e' Encode::from_to($a, "utf16", "shift_jis", 0); print'
|
|
Packit |
d0f5c2 |
UTF-16:Unrecognised BOM 424f at /path/to/Encode.pm line 184.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Unlike other encodings where mappings are not one-to-one against
|
|
Packit |
d0f5c2 |
Unicode, UTFs are supposed to map 100% against one another. So Encode
|
|
Packit |
d0f5c2 |
is more strict on UTFs.
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Consider that "division by zero" of Encode :)
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=head1 SEE ALSO
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
L<Encode>, L<Encode::Unicode::UTF7>, L<http://www.unicode.org/glossary/>,
|
|
Packit |
d0f5c2 |
L<http://www.unicode.org/unicode/faq/utf_bom.html>,
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
RFC 2781 L<http://www.ietf.org/rfc/rfc2781.txt>,
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
The whole Unicode standard L<http://www.unicode.org/unicode/uni2book/u2.html>
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)>
|
|
Packit |
d0f5c2 |
by Larry Wall, Tom Christiansen, Jon Orwant;
|
|
Packit |
d0f5c2 |
O'Reilly & Associates; ISBN 0-596-00027-8
|
|
Packit |
d0f5c2 |
|
|
Packit |
d0f5c2 |
=cut
|