Blame lib/Encode/Unicode/UTF7.pm

Packit d0f5c2
#
Packit d0f5c2
# $Id: UTF7.pm,v 2.10 2017/06/10 17:23:50 dankogai Exp $
Packit d0f5c2
#
Packit d0f5c2
package Encode::Unicode::UTF7;
Packit d0f5c2
use strict;
Packit d0f5c2
use warnings;
Packit d0f5c2
use parent qw(Encode::Encoding);
Packit d0f5c2
__PACKAGE__->Define('UTF-7');
Packit d0f5c2
our $VERSION = do { my @r = ( q$Revision: 2.10 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
Packit d0f5c2
use MIME::Base64;
Packit d0f5c2
use Encode qw(find_encoding);
Packit d0f5c2
Packit d0f5c2
#
Packit d0f5c2
# Algorithms taken from Unicode::String by Gisle Aas
Packit d0f5c2
#
Packit d0f5c2
Packit d0f5c2
our $OPTIONAL_DIRECT_CHARS = 1;
Packit d0f5c2
my $specials = quotemeta "\'(),-./:?";
Packit d0f5c2
$OPTIONAL_DIRECT_CHARS
Packit d0f5c2
  and $specials .= quotemeta "!\"#$%&*;<=>@[]^_`{|}";
Packit d0f5c2
Packit d0f5c2
# \s will not work because it matches U+3000 DEOGRAPHIC SPACE
Packit d0f5c2
# We use qr/[\n\r\t\ ] instead
Packit d0f5c2
my $re_asis    = qr/(?:[\n\r\t\ A-Za-z0-9$specials])/;
Packit d0f5c2
my $re_encoded = qr/(?:[^\n\r\t\ A-Za-z0-9$specials])/;
Packit d0f5c2
my $e_utf16    = find_encoding("UTF-16BE");
Packit d0f5c2
Packit d0f5c2
sub needs_lines { 1 }
Packit d0f5c2
Packit d0f5c2
sub encode($$;$) {
Packit d0f5c2
    my ( $obj, $str, $chk ) = @_;
Packit d0f5c2
    return undef unless defined $str;
Packit d0f5c2
    my $len = length($str);
Packit d0f5c2
    pos($str) = 0;
Packit d0f5c2
    my $bytes = substr($str, 0, 0); # to propagate taintedness
Packit d0f5c2
    while ( pos($str) < $len ) {
Packit d0f5c2
        if ( $str =~ /\G($re_asis+)/ogc ) {
Packit d0f5c2
	    my $octets = $1;
Packit d0f5c2
	    utf8::downgrade($octets);
Packit d0f5c2
	    $bytes .= $octets;
Packit d0f5c2
        }
Packit d0f5c2
        elsif ( $str =~ /\G($re_encoded+)/ogsc ) {
Packit d0f5c2
            if ( $1 eq "+" ) {
Packit d0f5c2
                $bytes .= "+-";
Packit d0f5c2
            }
Packit d0f5c2
            else {
Packit d0f5c2
                my $s = $1;
Packit d0f5c2
                my $base64 = encode_base64( $e_utf16->encode($s), '' );
Packit d0f5c2
                $base64 =~ s/=+$//;
Packit d0f5c2
                $bytes .= "+$base64-";
Packit d0f5c2
            }
Packit d0f5c2
        }
Packit d0f5c2
        else {
Packit d0f5c2
            die "This should not happen! (pos=" . pos($str) . ")";
Packit d0f5c2
        }
Packit d0f5c2
    }
Packit d0f5c2
    $_[1] = '' if $chk;
Packit d0f5c2
    return $bytes;
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub decode($$;$) {
Packit d0f5c2
    use re 'taint';
Packit d0f5c2
    my ( $obj, $bytes, $chk ) = @_;
Packit d0f5c2
    return undef unless defined $bytes;
Packit d0f5c2
    my $len = length($bytes);
Packit d0f5c2
    my $str = substr($bytes, 0, 0); # to propagate taintedness;
Packit d0f5c2
    pos($bytes) = 0;
Packit d0f5c2
    no warnings 'uninitialized';
Packit d0f5c2
    while ( pos($bytes) < $len ) {
Packit d0f5c2
        if ( $bytes =~ /\G([^+]+)/ogc ) {
Packit d0f5c2
            $str .= $1;
Packit d0f5c2
        }
Packit d0f5c2
        elsif ( $bytes =~ /\G\+-/ogc ) {
Packit d0f5c2
            $str .= "+";
Packit d0f5c2
        }
Packit d0f5c2
        elsif ( $bytes =~ /\G\+([A-Za-z0-9+\/]+)-?/ogsc ) {
Packit d0f5c2
            my $base64 = $1;
Packit d0f5c2
            my $pad    = length($base64) % 4;
Packit d0f5c2
            $base64 .= "=" x ( 4 - $pad ) if $pad;
Packit d0f5c2
            $str .= $e_utf16->decode( decode_base64($base64) );
Packit d0f5c2
        }
Packit d0f5c2
        elsif ( $bytes =~ /\G\+/ogc ) {
Packit d0f5c2
            $^W and warn "Bad UTF7 data escape";
Packit d0f5c2
            $str .= "+";
Packit d0f5c2
        }
Packit d0f5c2
        else {
Packit d0f5c2
            die "This should not happen " . pos($bytes);
Packit d0f5c2
        }
Packit d0f5c2
    }
Packit d0f5c2
    $_[1] = '' if $chk;
Packit d0f5c2
    return $str;
Packit d0f5c2
}
Packit d0f5c2
1;
Packit d0f5c2
__END__
Packit d0f5c2
Packit d0f5c2
=head1 NAME
Packit d0f5c2
Packit d0f5c2
Encode::Unicode::UTF7 -- UTF-7 encoding
Packit d0f5c2
Packit d0f5c2
=head1 SYNOPSIS
Packit d0f5c2
Packit d0f5c2
    use Encode qw/encode decode/; 
Packit d0f5c2
    $utf7 = encode("UTF-7", $utf8);
Packit d0f5c2
    $utf8 = decode("UTF-7", $ucs2);
Packit d0f5c2
Packit d0f5c2
=head1 ABSTRACT
Packit d0f5c2
Packit d0f5c2
This module implements UTF-7 encoding documented in RFC 2152.  UTF-7,
Packit d0f5c2
as its name suggests, is a 7-bit re-encoded version of UTF-16BE.  It
Packit d0f5c2
is designed to be MTA-safe and expected to be a standard way to
Packit d0f5c2
exchange Unicoded mails via mails.  But with the advent of UTF-8 and
Packit d0f5c2
8-bit compliant MTAs, UTF-7 is hardly ever used.
Packit d0f5c2
Packit d0f5c2
UTF-7 was not supported by Encode until version 1.95 because of that.
Packit d0f5c2
But Unicode::String, a module by Gisle Aas which adds Unicode supports
Packit d0f5c2
to non-utf8-savvy perl did support UTF-7, the UTF-7 support was added
Packit d0f5c2
so Encode can supersede Unicode::String 100%.
Packit d0f5c2
Packit d0f5c2
=head1 In Practice
Packit d0f5c2
Packit d0f5c2
When you want to encode Unicode for mails and web pages, however, do
Packit d0f5c2
not use UTF-7 unless you are sure your recipients and readers can
Packit d0f5c2
handle it.  Very few MUAs and WWW Browsers support these days (only
Packit d0f5c2
Mozilla seems to support one).  For general cases, use UTF-8 for
Packit d0f5c2
message body and MIME-Header for header instead.
Packit d0f5c2
Packit d0f5c2
=head1 SEE ALSO
Packit d0f5c2
Packit d0f5c2
L<Encode>, L<Encode::Unicode>, L<Unicode::String>
Packit d0f5c2
Packit d0f5c2
RFC 2781 L<http://www.ietf.org/rfc/rfc2152.txt>
Packit d0f5c2
Packit d0f5c2
=cut