Blame lib/Encode/Guess.pm

Packit d0f5c2
package Encode::Guess;
Packit d0f5c2
use strict;
Packit d0f5c2
use warnings;
Packit d0f5c2
use Encode qw(:fallbacks find_encoding);
Packit d0f5c2
our $VERSION = do { my @r = ( q$Revision: 2.7 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
Packit d0f5c2
Packit d0f5c2
my $Canon = 'Guess';
Packit d0f5c2
use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
Packit d0f5c2
our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8);
Packit d0f5c2
my $obj = bless {
Packit d0f5c2
    Name     => $Canon,
Packit d0f5c2
    Suspects => {%DEF_SUSPECTS},
Packit d0f5c2
} => __PACKAGE__;
Packit d0f5c2
Encode::define_encoding($obj, $Canon);
Packit d0f5c2
Packit d0f5c2
use parent qw(Encode::Encoding);
Packit d0f5c2
sub needs_lines { 1 }
Packit d0f5c2
sub perlio_ok   { 0 }
Packit d0f5c2
Packit d0f5c2
our @EXPORT         = qw(guess_encoding);
Packit d0f5c2
our $NoUTFAutoGuess = 0;
Packit d0f5c2
our $UTF8_BOM       = pack( "C3", 0xef, 0xbb, 0xbf );
Packit d0f5c2
Packit d0f5c2
sub import {    # Exporter not used so we do it on our own
Packit d0f5c2
    my $callpkg = caller;
Packit d0f5c2
    for my $item (@EXPORT) {
Packit d0f5c2
        no strict 'refs';
Packit d0f5c2
        *{"$callpkg\::$item"} = \&{"$item"};
Packit d0f5c2
    }
Packit d0f5c2
    set_suspects(@_);
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub set_suspects {
Packit d0f5c2
    my $class = shift;
Packit d0f5c2
    my $self = ref($class) ? $class : $Encode::Encoding{$Canon};
Packit d0f5c2
    $self->{Suspects} = {%DEF_SUSPECTS};
Packit d0f5c2
    $self->add_suspects(@_);
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub add_suspects {
Packit d0f5c2
    my $class = shift;
Packit d0f5c2
    my $self = ref($class) ? $class : $Encode::Encoding{$Canon};
Packit d0f5c2
    for my $c (@_) {
Packit d0f5c2
        my $e = find_encoding($c) or die "Unknown encoding: $c";
Packit d0f5c2
        $self->{Suspects}{ $e->name } = $e;
Packit d0f5c2
        DEBUG and warn "Added: ", $e->name;
Packit d0f5c2
    }
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub decode($$;$) {
Packit d0f5c2
    my ( $obj, $octet, $chk ) = @_;
Packit d0f5c2
    my $guessed = guess( $obj, $octet );
Packit d0f5c2
    unless ( ref($guessed) ) {
Packit d0f5c2
        require Carp;
Packit d0f5c2
        Carp::croak($guessed);
Packit d0f5c2
    }
Packit d0f5c2
    my $utf8 = $guessed->decode( $octet, $chk || 0 );
Packit d0f5c2
    $_[1] = $octet if $chk;
Packit d0f5c2
    return $utf8;
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub guess_encoding {
Packit d0f5c2
    guess( $Encode::Encoding{$Canon}, @_ );
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub guess {
Packit d0f5c2
    my $class = shift;
Packit d0f5c2
    my $obj   = ref($class) ? $class : $Encode::Encoding{$Canon};
Packit d0f5c2
    my $octet = shift;
Packit d0f5c2
Packit d0f5c2
    # sanity check
Packit d0f5c2
    return "Empty string, empty guess" unless defined $octet and length $octet;
Packit d0f5c2
Packit d0f5c2
    # cheat 0: utf8 flag;
Packit d0f5c2
    if ( Encode::is_utf8($octet) ) {
Packit d0f5c2
        return find_encoding('utf8') unless $NoUTFAutoGuess;
Packit d0f5c2
        Encode::_utf8_off($octet);
Packit d0f5c2
    }
Packit d0f5c2
Packit d0f5c2
    # cheat 1: BOM
Packit d0f5c2
    use Encode::Unicode;
Packit d0f5c2
    unless ($NoUTFAutoGuess) {
Packit d0f5c2
        my $BOM = pack( 'C3', unpack( "C3", $octet ) );
Packit d0f5c2
        return find_encoding('utf8')
Packit d0f5c2
          if ( defined $BOM and $BOM eq $UTF8_BOM );
Packit d0f5c2
        $BOM = unpack( 'N', $octet );
Packit d0f5c2
        return find_encoding('UTF-32')
Packit d0f5c2
          if ( defined $BOM and ( $BOM == 0xFeFF or $BOM == 0xFFFe0000 ) );
Packit d0f5c2
        $BOM = unpack( 'n', $octet );
Packit d0f5c2
        return find_encoding('UTF-16')
Packit d0f5c2
          if ( defined $BOM and ( $BOM == 0xFeFF or $BOM == 0xFFFe ) );
Packit d0f5c2
        if ( $octet =~ /\x00/o )
Packit d0f5c2
        {    # if \x00 found, we assume UTF-(16|32)(BE|LE)
Packit d0f5c2
            my $utf;
Packit d0f5c2
            my ( $be, $le ) = ( 0, 0 );
Packit d0f5c2
            if ( $octet =~ /\x00\x00/o ) {    # UTF-32(BE|LE) assumed
Packit d0f5c2
                $utf = "UTF-32";
Packit d0f5c2
                for my $char ( unpack( 'N*', $octet ) ) {
Packit d0f5c2
                    $char & 0x0000ffff and $be++;
Packit d0f5c2
                    $char & 0xffff0000 and $le++;
Packit d0f5c2
                }
Packit d0f5c2
            }
Packit d0f5c2
            else {                            # UTF-16(BE|LE) assumed
Packit d0f5c2
                $utf = "UTF-16";
Packit d0f5c2
                for my $char ( unpack( 'n*', $octet ) ) {
Packit d0f5c2
                    $char & 0x00ff and $be++;
Packit d0f5c2
                    $char & 0xff00 and $le++;
Packit d0f5c2
                }
Packit d0f5c2
            }
Packit d0f5c2
            DEBUG and warn "$utf, be == $be, le == $le";
Packit d0f5c2
            $be == $le
Packit d0f5c2
              and return
Packit d0f5c2
              "Encodings ambiguous between $utf BE and LE ($be, $le)";
Packit d0f5c2
            $utf .= ( $be > $le ) ? 'BE' : 'LE';
Packit d0f5c2
            return find_encoding($utf);
Packit d0f5c2
        }
Packit d0f5c2
    }
Packit d0f5c2
    my %try = %{ $obj->{Suspects} };
Packit d0f5c2
    for my $c (@_) {
Packit d0f5c2
        my $e = find_encoding($c) or die "Unknown encoding: $c";
Packit d0f5c2
        $try{ $e->name } = $e;
Packit d0f5c2
        DEBUG and warn "Added: ", $e->name;
Packit d0f5c2
    }
Packit d0f5c2
    my $nline = 1;
Packit d0f5c2
    for my $line ( split /\r\n?|\n/, $octet ) {
Packit d0f5c2
Packit d0f5c2
        # cheat 2 -- \e in the string
Packit d0f5c2
        if ( $line =~ /\e/o ) {
Packit d0f5c2
            my @keys = keys %try;
Packit d0f5c2
            delete @try{qw/utf8 ascii/};
Packit d0f5c2
            for my $k (@keys) {
Packit d0f5c2
                ref( $try{$k} ) eq 'Encode::XS' and delete $try{$k};
Packit d0f5c2
            }
Packit d0f5c2
        }
Packit d0f5c2
        my %ok = %try;
Packit d0f5c2
Packit d0f5c2
        # warn join(",", keys %try);
Packit d0f5c2
        for my $k ( keys %try ) {
Packit d0f5c2
            my $scratch = $line;
Packit d0f5c2
            $try{$k}->decode( $scratch, FB_QUIET );
Packit d0f5c2
            if ( $scratch eq '' ) {
Packit d0f5c2
                DEBUG and warn sprintf( "%4d:%-24s ok\n", $nline, $k );
Packit d0f5c2
            }
Packit d0f5c2
            else {
Packit d0f5c2
                use bytes ();
Packit d0f5c2
                DEBUG
Packit d0f5c2
                  and warn sprintf( "%4d:%-24s not ok; %d bytes left\n",
Packit d0f5c2
                    $nline, $k, bytes::length($scratch) );
Packit d0f5c2
                delete $ok{$k};
Packit d0f5c2
            }
Packit d0f5c2
        }
Packit d0f5c2
        %ok or return "No appropriate encodings found!";
Packit d0f5c2
        if ( scalar( keys(%ok) ) == 1 ) {
Packit d0f5c2
            my ($retval) = values(%ok);
Packit d0f5c2
            return $retval;
Packit d0f5c2
        }
Packit d0f5c2
        %try = %ok;
Packit d0f5c2
        $nline++;
Packit d0f5c2
    }
Packit d0f5c2
    $try{ascii}
Packit d0f5c2
      or return "Encodings too ambiguous: ", join( " or ", keys %try );
Packit d0f5c2
    return $try{ascii};
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
1;
Packit d0f5c2
__END__
Packit d0f5c2
Packit d0f5c2
=head1 NAME
Packit d0f5c2
Packit d0f5c2
Encode::Guess -- Guesses encoding from data
Packit d0f5c2
Packit d0f5c2
=head1 SYNOPSIS
Packit d0f5c2
Packit d0f5c2
  # if you are sure $data won't contain anything bogus
Packit d0f5c2
Packit d0f5c2
  use Encode;
Packit d0f5c2
  use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
Packit d0f5c2
  my $utf8 = decode("Guess", $data);
Packit d0f5c2
  my $data = encode("Guess", $utf8);   # this doesn't work!
Packit d0f5c2
Packit d0f5c2
  # more elaborate way
Packit d0f5c2
  use Encode::Guess;
Packit d0f5c2
  my $enc = guess_encoding($data, qw/euc-jp shiftjis 7bit-jis/);
Packit d0f5c2
  ref($enc) or die "Can't guess: $enc"; # trap error this way
Packit d0f5c2
  $utf8 = $enc->decode($data);
Packit d0f5c2
  # or
Packit d0f5c2
  $utf8 = decode($enc->name, $data)
Packit d0f5c2
Packit d0f5c2
=head1 ABSTRACT
Packit d0f5c2
Packit d0f5c2
Encode::Guess enables you to guess in what encoding a given data is
Packit d0f5c2
encoded, or at least tries to.  
Packit d0f5c2
Packit d0f5c2
=head1 DESCRIPTION
Packit d0f5c2
Packit d0f5c2
By default, it checks only ascii, utf8 and UTF-16/32 with BOM.
Packit d0f5c2
Packit d0f5c2
  use Encode::Guess; # ascii/utf8/BOMed UTF
Packit d0f5c2
Packit d0f5c2
To use it more practically, you have to give the names of encodings to
Packit d0f5c2
check (I<suspects> as follows).  The name of suspects can either be
Packit d0f5c2
canonical names or aliases.
Packit d0f5c2
Packit d0f5c2
CAVEAT: Unlike UTF-(16|32), BOM in utf8 is NOT AUTOMATICALLY STRIPPED.
Packit d0f5c2
Packit d0f5c2
 # tries all major Japanese Encodings as well
Packit d0f5c2
  use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
Packit d0f5c2
Packit d0f5c2
If the C<$Encode::Guess::NoUTFAutoGuess> variable is set to a true
Packit d0f5c2
value, no heuristics will be applied to UTF8/16/32, and the result
Packit d0f5c2
will be limited to the suspects and C<ascii>.
Packit d0f5c2
Packit d0f5c2
=over 4
Packit d0f5c2
Packit d0f5c2
=item Encode::Guess->set_suspects
Packit d0f5c2
Packit d0f5c2
You can also change the internal suspects list via C<set_suspects>
Packit d0f5c2
method. 
Packit d0f5c2
Packit d0f5c2
  use Encode::Guess;
Packit d0f5c2
  Encode::Guess->set_suspects(qw/euc-jp shiftjis 7bit-jis/);
Packit d0f5c2
Packit d0f5c2
=item Encode::Guess->add_suspects
Packit d0f5c2
Packit d0f5c2
Or you can use C<add_suspects> method.  The difference is that
Packit d0f5c2
C<set_suspects> flushes the current suspects list while
Packit d0f5c2
C<add_suspects> adds.
Packit d0f5c2
Packit d0f5c2
  use Encode::Guess;
Packit d0f5c2
  Encode::Guess->add_suspects(qw/euc-jp shiftjis 7bit-jis/);
Packit d0f5c2
  # now the suspects are euc-jp,shiftjis,7bit-jis, AND
Packit d0f5c2
  # euc-kr,euc-cn, and big5-eten
Packit d0f5c2
  Encode::Guess->add_suspects(qw/euc-kr euc-cn big5-eten/);
Packit d0f5c2
Packit d0f5c2
=item Encode::decode("Guess" ...)
Packit d0f5c2
Packit d0f5c2
When you are content with suspects list, you can now
Packit d0f5c2
Packit d0f5c2
  my $utf8 = Encode::decode("Guess", $data);
Packit d0f5c2
Packit d0f5c2
=item Encode::Guess->guess($data)
Packit d0f5c2
Packit d0f5c2
But it will croak if:
Packit d0f5c2
Packit d0f5c2
=over
Packit d0f5c2
Packit d0f5c2
=item *
Packit d0f5c2
Packit d0f5c2
Two or more suspects remain
Packit d0f5c2
Packit d0f5c2
=item *
Packit d0f5c2
Packit d0f5c2
No suspects left
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
So you should instead try this;
Packit d0f5c2
Packit d0f5c2
  my $decoder = Encode::Guess->guess($data);
Packit d0f5c2
Packit d0f5c2
On success, $decoder is an object that is documented in
Packit d0f5c2
L<Encode::Encoding>.  So you can now do this;
Packit d0f5c2
Packit d0f5c2
  my $utf8 = $decoder->decode($data);
Packit d0f5c2
Packit d0f5c2
On failure, $decoder now contains an error message so the whole thing
Packit d0f5c2
would be as follows;
Packit d0f5c2
Packit d0f5c2
  my $decoder = Encode::Guess->guess($data);
Packit d0f5c2
  die $decoder unless ref($decoder);
Packit d0f5c2
  my $utf8 = $decoder->decode($data);
Packit d0f5c2
Packit d0f5c2
=item guess_encoding($data, [, I<list of suspects>])
Packit d0f5c2
Packit d0f5c2
You can also try C<guess_encoding> function which is exported by
Packit d0f5c2
default.  It takes $data to check and it also takes the list of
Packit d0f5c2
suspects by option.  The optional suspect list is I<not reflected> to
Packit d0f5c2
the internal suspects list.
Packit d0f5c2
Packit d0f5c2
  my $decoder = guess_encoding($data, qw/euc-jp euc-kr euc-cn/);
Packit d0f5c2
  die $decoder unless ref($decoder);
Packit d0f5c2
  my $utf8 = $decoder->decode($data);
Packit d0f5c2
  # check only ascii, utf8 and UTF-(16|32) with BOM
Packit d0f5c2
  my $decoder = guess_encoding($data);
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
=head1 CAVEATS
Packit d0f5c2
Packit d0f5c2
=over 4
Packit d0f5c2
Packit d0f5c2
=item *
Packit d0f5c2
Packit d0f5c2
Because of the algorithm used, ISO-8859 series and other single-byte
Packit d0f5c2
encodings do not work well unless either one of ISO-8859 is the only
Packit d0f5c2
one suspect (besides ascii and utf8).
Packit d0f5c2
Packit d0f5c2
  use Encode::Guess;
Packit d0f5c2
  # perhaps ok
Packit d0f5c2
  my $decoder = guess_encoding($data, 'latin1');
Packit d0f5c2
  # definitely NOT ok
Packit d0f5c2
  my $decoder = guess_encoding($data, qw/latin1 greek/);
Packit d0f5c2
Packit d0f5c2
The reason is that Encode::Guess guesses encoding by trial and error.
Packit d0f5c2
It first splits $data into lines and tries to decode the line for each
Packit d0f5c2
suspect.  It keeps it going until all but one encoding is eliminated
Packit d0f5c2
out of suspects list.  ISO-8859 series is just too successful for most
Packit d0f5c2
cases (because it fills almost all code points in \x00-\xff).
Packit d0f5c2
Packit d0f5c2
=item *
Packit d0f5c2
Packit d0f5c2
Do not mix national standard encodings and the corresponding vendor
Packit d0f5c2
encodings.
Packit d0f5c2
Packit d0f5c2
  # a very bad idea
Packit d0f5c2
  my $decoder
Packit d0f5c2
     = guess_encoding($data, qw/shiftjis MacJapanese cp932/);
Packit d0f5c2
Packit d0f5c2
The reason is that vendor encoding is usually a superset of national
Packit d0f5c2
standard so it becomes too ambiguous for most cases.
Packit d0f5c2
Packit d0f5c2
=item *
Packit d0f5c2
Packit d0f5c2
On the other hand, mixing various national standard encodings
Packit d0f5c2
automagically works unless $data is too short to allow for guessing.
Packit d0f5c2
Packit d0f5c2
 # This is ok if $data is long enough
Packit d0f5c2
 my $decoder =  
Packit d0f5c2
  guess_encoding($data, qw/euc-cn
Packit d0f5c2
                           euc-jp shiftjis 7bit-jis
Packit d0f5c2
                           euc-kr
Packit d0f5c2
                           big5-eten/);
Packit d0f5c2
Packit d0f5c2
=item *
Packit d0f5c2
Packit d0f5c2
DO NOT PUT TOO MANY SUSPECTS!  Don't you try something like this!
Packit d0f5c2
Packit d0f5c2
  my $decoder = guess_encoding($data, 
Packit d0f5c2
                               Encode->encodings(":all"));
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
It is, after all, just a guess.  You should alway be explicit when it
Packit d0f5c2
comes to encodings.  But there are some, especially Japanese,
Packit d0f5c2
environment that guess-coding is a must.  Use this module with care. 
Packit d0f5c2
Packit d0f5c2
=head1 TO DO
Packit d0f5c2
Packit d0f5c2
Encode::Guess does not work on EBCDIC platforms.
Packit d0f5c2
Packit d0f5c2
=head1 SEE ALSO
Packit d0f5c2
Packit d0f5c2
L<Encode>, L<Encode::Encoding>
Packit d0f5c2
Packit d0f5c2
=cut
Packit d0f5c2