Blame lib/Encode/GSM0338.pm

Packit d0f5c2
#
Packit d0f5c2
# $Id: GSM0338.pm,v 2.7 2017/06/10 17:23:50 dankogai Exp $
Packit d0f5c2
#
Packit d0f5c2
package Encode::GSM0338;
Packit d0f5c2
Packit d0f5c2
use strict;
Packit d0f5c2
use warnings;
Packit d0f5c2
use Carp;
Packit d0f5c2
Packit d0f5c2
use vars qw($VERSION);
Packit d0f5c2
$VERSION = do { my @r = ( q$Revision: 2.7 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
Packit d0f5c2
Packit d0f5c2
use Encode qw(:fallbacks);
Packit d0f5c2
Packit d0f5c2
use parent qw(Encode::Encoding);
Packit d0f5c2
__PACKAGE__->Define('gsm0338');
Packit d0f5c2
Packit d0f5c2
sub needs_lines { 1 }
Packit d0f5c2
sub perlio_ok   { 0 }
Packit d0f5c2
Packit d0f5c2
use utf8;
Packit d0f5c2
our %UNI2GSM = (
Packit d0f5c2
    "\x{0040}" => "\x00",        # COMMERCIAL AT
Packit d0f5c2
    "\x{000A}" => "\x0A",        # LINE FEED
Packit d0f5c2
    "\x{000C}" => "\x1B\x0A",    # FORM FEED
Packit d0f5c2
    "\x{000D}" => "\x0D",        # CARRIAGE RETURN
Packit d0f5c2
    "\x{0020}" => "\x20",        # SPACE
Packit d0f5c2
    "\x{0021}" => "\x21",        # EXCLAMATION MARK
Packit d0f5c2
    "\x{0022}" => "\x22",        # QUOTATION MARK
Packit d0f5c2
    "\x{0023}" => "\x23",        # NUMBER SIGN
Packit d0f5c2
    "\x{0024}" => "\x02",        # DOLLAR SIGN
Packit d0f5c2
    "\x{0025}" => "\x25",        # PERCENT SIGN
Packit d0f5c2
    "\x{0026}" => "\x26",        # AMPERSAND
Packit d0f5c2
    "\x{0027}" => "\x27",        # APOSTROPHE
Packit d0f5c2
    "\x{0028}" => "\x28",        # LEFT PARENTHESIS
Packit d0f5c2
    "\x{0029}" => "\x29",        # RIGHT PARENTHESIS
Packit d0f5c2
    "\x{002A}" => "\x2A",        # ASTERISK
Packit d0f5c2
    "\x{002B}" => "\x2B",        # PLUS SIGN
Packit d0f5c2
    "\x{002C}" => "\x2C",        # COMMA
Packit d0f5c2
    "\x{002D}" => "\x2D",        # HYPHEN-MINUS
Packit d0f5c2
    "\x{002E}" => "\x2E",        # FULL STOP
Packit d0f5c2
    "\x{002F}" => "\x2F",        # SOLIDUS
Packit d0f5c2
    "\x{0030}" => "\x30",        # DIGIT ZERO
Packit d0f5c2
    "\x{0031}" => "\x31",        # DIGIT ONE
Packit d0f5c2
    "\x{0032}" => "\x32",        # DIGIT TWO
Packit d0f5c2
    "\x{0033}" => "\x33",        # DIGIT THREE
Packit d0f5c2
    "\x{0034}" => "\x34",        # DIGIT FOUR
Packit d0f5c2
    "\x{0035}" => "\x35",        # DIGIT FIVE
Packit d0f5c2
    "\x{0036}" => "\x36",        # DIGIT SIX
Packit d0f5c2
    "\x{0037}" => "\x37",        # DIGIT SEVEN
Packit d0f5c2
    "\x{0038}" => "\x38",        # DIGIT EIGHT
Packit d0f5c2
    "\x{0039}" => "\x39",        # DIGIT NINE
Packit d0f5c2
    "\x{003A}" => "\x3A",        # COLON
Packit d0f5c2
    "\x{003B}" => "\x3B",        # SEMICOLON
Packit d0f5c2
    "\x{003C}" => "\x3C",        # LESS-THAN SIGN
Packit d0f5c2
    "\x{003D}" => "\x3D",        # EQUALS SIGN
Packit d0f5c2
    "\x{003E}" => "\x3E",        # GREATER-THAN SIGN
Packit d0f5c2
    "\x{003F}" => "\x3F",        # QUESTION MARK
Packit d0f5c2
    "\x{0041}" => "\x41",        # LATIN CAPITAL LETTER A
Packit d0f5c2
    "\x{0042}" => "\x42",        # LATIN CAPITAL LETTER B
Packit d0f5c2
    "\x{0043}" => "\x43",        # LATIN CAPITAL LETTER C
Packit d0f5c2
    "\x{0044}" => "\x44",        # LATIN CAPITAL LETTER D
Packit d0f5c2
    "\x{0045}" => "\x45",        # LATIN CAPITAL LETTER E
Packit d0f5c2
    "\x{0046}" => "\x46",        # LATIN CAPITAL LETTER F
Packit d0f5c2
    "\x{0047}" => "\x47",        # LATIN CAPITAL LETTER G
Packit d0f5c2
    "\x{0048}" => "\x48",        # LATIN CAPITAL LETTER H
Packit d0f5c2
    "\x{0049}" => "\x49",        # LATIN CAPITAL LETTER I
Packit d0f5c2
    "\x{004A}" => "\x4A",        # LATIN CAPITAL LETTER J
Packit d0f5c2
    "\x{004B}" => "\x4B",        # LATIN CAPITAL LETTER K
Packit d0f5c2
    "\x{004C}" => "\x4C",        # LATIN CAPITAL LETTER L
Packit d0f5c2
    "\x{004D}" => "\x4D",        # LATIN CAPITAL LETTER M
Packit d0f5c2
    "\x{004E}" => "\x4E",        # LATIN CAPITAL LETTER N
Packit d0f5c2
    "\x{004F}" => "\x4F",        # LATIN CAPITAL LETTER O
Packit d0f5c2
    "\x{0050}" => "\x50",        # LATIN CAPITAL LETTER P
Packit d0f5c2
    "\x{0051}" => "\x51",        # LATIN CAPITAL LETTER Q
Packit d0f5c2
    "\x{0052}" => "\x52",        # LATIN CAPITAL LETTER R
Packit d0f5c2
    "\x{0053}" => "\x53",        # LATIN CAPITAL LETTER S
Packit d0f5c2
    "\x{0054}" => "\x54",        # LATIN CAPITAL LETTER T
Packit d0f5c2
    "\x{0055}" => "\x55",        # LATIN CAPITAL LETTER U
Packit d0f5c2
    "\x{0056}" => "\x56",        # LATIN CAPITAL LETTER V
Packit d0f5c2
    "\x{0057}" => "\x57",        # LATIN CAPITAL LETTER W
Packit d0f5c2
    "\x{0058}" => "\x58",        # LATIN CAPITAL LETTER X
Packit d0f5c2
    "\x{0059}" => "\x59",        # LATIN CAPITAL LETTER Y
Packit d0f5c2
    "\x{005A}" => "\x5A",        # LATIN CAPITAL LETTER Z
Packit d0f5c2
    "\x{005F}" => "\x11",        # LOW LINE
Packit d0f5c2
    "\x{0061}" => "\x61",        # LATIN SMALL LETTER A
Packit d0f5c2
    "\x{0062}" => "\x62",        # LATIN SMALL LETTER B
Packit d0f5c2
    "\x{0063}" => "\x63",        # LATIN SMALL LETTER C
Packit d0f5c2
    "\x{0064}" => "\x64",        # LATIN SMALL LETTER D
Packit d0f5c2
    "\x{0065}" => "\x65",        # LATIN SMALL LETTER E
Packit d0f5c2
    "\x{0066}" => "\x66",        # LATIN SMALL LETTER F
Packit d0f5c2
    "\x{0067}" => "\x67",        # LATIN SMALL LETTER G
Packit d0f5c2
    "\x{0068}" => "\x68",        # LATIN SMALL LETTER H
Packit d0f5c2
    "\x{0069}" => "\x69",        # LATIN SMALL LETTER I
Packit d0f5c2
    "\x{006A}" => "\x6A",        # LATIN SMALL LETTER J
Packit d0f5c2
    "\x{006B}" => "\x6B",        # LATIN SMALL LETTER K
Packit d0f5c2
    "\x{006C}" => "\x6C",        # LATIN SMALL LETTER L
Packit d0f5c2
    "\x{006D}" => "\x6D",        # LATIN SMALL LETTER M
Packit d0f5c2
    "\x{006E}" => "\x6E",        # LATIN SMALL LETTER N
Packit d0f5c2
    "\x{006F}" => "\x6F",        # LATIN SMALL LETTER O
Packit d0f5c2
    "\x{0070}" => "\x70",        # LATIN SMALL LETTER P
Packit d0f5c2
    "\x{0071}" => "\x71",        # LATIN SMALL LETTER Q
Packit d0f5c2
    "\x{0072}" => "\x72",        # LATIN SMALL LETTER R
Packit d0f5c2
    "\x{0073}" => "\x73",        # LATIN SMALL LETTER S
Packit d0f5c2
    "\x{0074}" => "\x74",        # LATIN SMALL LETTER T
Packit d0f5c2
    "\x{0075}" => "\x75",        # LATIN SMALL LETTER U
Packit d0f5c2
    "\x{0076}" => "\x76",        # LATIN SMALL LETTER V
Packit d0f5c2
    "\x{0077}" => "\x77",        # LATIN SMALL LETTER W
Packit d0f5c2
    "\x{0078}" => "\x78",        # LATIN SMALL LETTER X
Packit d0f5c2
    "\x{0079}" => "\x79",        # LATIN SMALL LETTER Y
Packit d0f5c2
    "\x{007A}" => "\x7A",        # LATIN SMALL LETTER Z
Packit d0f5c2
    "\x{000C}" => "\x1B\x0A",    # FORM FEED
Packit d0f5c2
    "\x{005B}" => "\x1B\x3C",    # LEFT SQUARE BRACKET
Packit d0f5c2
    "\x{005C}" => "\x1B\x2F",    # REVERSE SOLIDUS
Packit d0f5c2
    "\x{005D}" => "\x1B\x3E",    # RIGHT SQUARE BRACKET
Packit d0f5c2
    "\x{005E}" => "\x1B\x14",    # CIRCUMFLEX ACCENT
Packit d0f5c2
    "\x{007B}" => "\x1B\x28",    # LEFT CURLY BRACKET
Packit d0f5c2
    "\x{007C}" => "\x1B\x40",    # VERTICAL LINE
Packit d0f5c2
    "\x{007D}" => "\x1B\x29",    # RIGHT CURLY BRACKET
Packit d0f5c2
    "\x{007E}" => "\x1B\x3D",    # TILDE
Packit d0f5c2
    "\x{00A0}" => "\x1B",        # NO-BREAK SPACE
Packit d0f5c2
    "\x{00A1}" => "\x40",        # INVERTED EXCLAMATION MARK
Packit d0f5c2
    "\x{00A3}" => "\x01",        # POUND SIGN
Packit d0f5c2
    "\x{00A4}" => "\x24",        # CURRENCY SIGN
Packit d0f5c2
    "\x{00A5}" => "\x03",        # YEN SIGN
Packit d0f5c2
    "\x{00A7}" => "\x5F",        # SECTION SIGN
Packit d0f5c2
    "\x{00BF}" => "\x60",        # INVERTED QUESTION MARK
Packit d0f5c2
    "\x{00C4}" => "\x5B",        # LATIN CAPITAL LETTER A WITH DIAERESIS
Packit d0f5c2
    "\x{00C5}" => "\x0E",        # LATIN CAPITAL LETTER A WITH RING ABOVE
Packit d0f5c2
    "\x{00C6}" => "\x1C",        # LATIN CAPITAL LETTER AE
Packit d0f5c2
    "\x{00C9}" => "\x1F",        # LATIN CAPITAL LETTER E WITH ACUTE
Packit d0f5c2
    "\x{00D1}" => "\x5D",        # LATIN CAPITAL LETTER N WITH TILDE
Packit d0f5c2
    "\x{00D6}" => "\x5C",        # LATIN CAPITAL LETTER O WITH DIAERESIS
Packit d0f5c2
    "\x{00D8}" => "\x0B",        # LATIN CAPITAL LETTER O WITH STROKE
Packit d0f5c2
    "\x{00DC}" => "\x5E",        # LATIN CAPITAL LETTER U WITH DIAERESIS
Packit d0f5c2
    "\x{00DF}" => "\x1E",        # LATIN SMALL LETTER SHARP S
Packit d0f5c2
    "\x{00E0}" => "\x7F",        # LATIN SMALL LETTER A WITH GRAVE
Packit d0f5c2
    "\x{00E4}" => "\x7B",        # LATIN SMALL LETTER A WITH DIAERESIS
Packit d0f5c2
    "\x{00E5}" => "\x0F",        # LATIN SMALL LETTER A WITH RING ABOVE
Packit d0f5c2
    "\x{00E6}" => "\x1D",        # LATIN SMALL LETTER AE
Packit d0f5c2
    #"\x{00E7}" => "\x09",        # LATIN SMALL LETTER C WITH CEDILLA
Packit d0f5c2
    "\x{00C7}" => "\x09",        # LATIN CAPITAL LETTER C WITH CEDILLA
Packit d0f5c2
    "\x{00E8}" => "\x04",        # LATIN SMALL LETTER E WITH GRAVE
Packit d0f5c2
    "\x{00E9}" => "\x05",        # LATIN SMALL LETTER E WITH ACUTE
Packit d0f5c2
    "\x{00EC}" => "\x07",        # LATIN SMALL LETTER I WITH GRAVE
Packit d0f5c2
    "\x{00F1}" => "\x7D",        # LATIN SMALL LETTER N WITH TILDE
Packit d0f5c2
    "\x{00F2}" => "\x08",        # LATIN SMALL LETTER O WITH GRAVE
Packit d0f5c2
    "\x{00F6}" => "\x7C",        # LATIN SMALL LETTER O WITH DIAERESIS
Packit d0f5c2
    "\x{00F8}" => "\x0C",        # LATIN SMALL LETTER O WITH STROKE
Packit d0f5c2
    "\x{00F9}" => "\x06",        # LATIN SMALL LETTER U WITH GRAVE
Packit d0f5c2
    "\x{00FC}" => "\x7E",        # LATIN SMALL LETTER U WITH DIAERESIS
Packit d0f5c2
    "\x{0393}" => "\x13",        # GREEK CAPITAL LETTER GAMMA
Packit d0f5c2
    "\x{0394}" => "\x10",        # GREEK CAPITAL LETTER DELTA
Packit d0f5c2
    "\x{0398}" => "\x19",        # GREEK CAPITAL LETTER THETA
Packit d0f5c2
    "\x{039B}" => "\x14",        # GREEK CAPITAL LETTER LAMDA
Packit d0f5c2
    "\x{039E}" => "\x1A",        # GREEK CAPITAL LETTER XI
Packit d0f5c2
    "\x{03A0}" => "\x16",        # GREEK CAPITAL LETTER PI
Packit d0f5c2
    "\x{03A3}" => "\x18",        # GREEK CAPITAL LETTER SIGMA
Packit d0f5c2
    "\x{03A6}" => "\x12",        # GREEK CAPITAL LETTER PHI
Packit d0f5c2
    "\x{03A8}" => "\x17",        # GREEK CAPITAL LETTER PSI
Packit d0f5c2
    "\x{03A9}" => "\x15",        # GREEK CAPITAL LETTER OMEGA
Packit d0f5c2
    "\x{20AC}" => "\x1B\x65",    # EURO SIGN
Packit d0f5c2
);
Packit d0f5c2
our %GSM2UNI = reverse %UNI2GSM;
Packit d0f5c2
our $ESC    = "\x1b";
Packit d0f5c2
our $ATMARK = "\x40";
Packit d0f5c2
our $FBCHAR = "\x3F";
Packit d0f5c2
our $NBSP   = "\x{00A0}";
Packit d0f5c2
Packit d0f5c2
#define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode"
Packit d0f5c2
Packit d0f5c2
sub decode ($$;$) {
Packit d0f5c2
    my ( $obj, $bytes, $chk ) = @_;
Packit d0f5c2
    return undef unless defined $bytes;
Packit d0f5c2
    my $str = substr($bytes, 0, 0); # to propagate taintedness;
Packit d0f5c2
    while ( length $bytes ) {
Packit d0f5c2
        my $c = substr( $bytes, 0, 1, '' );
Packit d0f5c2
        my $u;
Packit d0f5c2
        if ( $c eq "\x00" ) {
Packit d0f5c2
            my $c2 = substr( $bytes, 0, 1, '' );
Packit d0f5c2
            $u =
Packit d0f5c2
                !length $c2 ? $ATMARK
Packit d0f5c2
              : $c2 eq "\x00" ? "\x{0000}"
Packit d0f5c2
              : exists $GSM2UNI{$c2} ? $ATMARK . $GSM2UNI{$c2}
Packit d0f5c2
              : $chk
Packit d0f5c2
              ? croak sprintf( "\\x%02X\\x%02X does not map to Unicode",
Packit d0f5c2
			       ord($c), ord($c2) )
Packit d0f5c2
              : $ATMARK . $FBCHAR;
Packit d0f5c2
Packit d0f5c2
        }
Packit d0f5c2
        elsif ( $c eq $ESC ) {
Packit d0f5c2
            my $c2 = substr( $bytes, 0, 1, '' );
Packit d0f5c2
            $u =
Packit d0f5c2
                exists $GSM2UNI{ $c . $c2 } ? $GSM2UNI{ $c . $c2 }
Packit d0f5c2
              : exists $GSM2UNI{$c2}        ? $NBSP . $GSM2UNI{$c2}
Packit d0f5c2
              : $chk
Packit d0f5c2
              ? croak sprintf( "\\x%02X\\x%02X does not map to Unicode",
Packit d0f5c2
			       ord($c), ord($c2) )
Packit d0f5c2
              : $NBSP . $FBCHAR;
Packit d0f5c2
        }
Packit d0f5c2
        else {
Packit d0f5c2
            $u =
Packit d0f5c2
              exists $GSM2UNI{$c}
Packit d0f5c2
              ? $GSM2UNI{$c}
Packit d0f5c2
              : $chk ? ref $chk eq 'CODE'
Packit d0f5c2
                  ? $chk->( ord $c )
Packit d0f5c2
                  : croak sprintf( "\\x%02X does not map to Unicode", ord($c) )
Packit d0f5c2
              : $FBCHAR;
Packit d0f5c2
        }
Packit d0f5c2
        $str .= $u;
Packit d0f5c2
    }
Packit d0f5c2
    $_[1] = $bytes if $chk;
Packit d0f5c2
    return $str;
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
#define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s"
Packit d0f5c2
Packit d0f5c2
sub encode($$;$) {
Packit d0f5c2
    my ( $obj, $str, $chk ) = @_;
Packit d0f5c2
    return undef unless defined $str;
Packit d0f5c2
    my $bytes = substr($str, 0, 0); # to propagate taintedness
Packit d0f5c2
    while ( length $str ) {
Packit d0f5c2
        my $u = substr( $str, 0, 1, '' );
Packit d0f5c2
        my $c;
Packit d0f5c2
        $bytes .=
Packit d0f5c2
          exists $UNI2GSM{$u}
Packit d0f5c2
          ? $UNI2GSM{$u}
Packit d0f5c2
          : $chk ? ref $chk eq 'CODE'
Packit d0f5c2
              ? $chk->( ord($u) )
Packit d0f5c2
              : croak sprintf( "\\x{%04x} does not map to %s", 
Packit d0f5c2
			       ord($u), $obj->name )
Packit d0f5c2
          : $FBCHAR;
Packit d0f5c2
    }
Packit d0f5c2
    $_[1] = $str if $chk;
Packit d0f5c2
    return $bytes;
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
1;
Packit d0f5c2
__END__
Packit d0f5c2
Packit d0f5c2
=head1 NAME
Packit d0f5c2
Packit d0f5c2
Encode::GSM0338 -- ESTI GSM 03.38 Encoding
Packit d0f5c2
Packit d0f5c2
=head1 SYNOPSIS
Packit d0f5c2
Packit d0f5c2
  use Encode qw/encode decode/; 
Packit d0f5c2
  $gsm0338 = encode("gsm0338", $utf8);    # loads Encode::GSM0338 implicitly
Packit d0f5c2
  $utf8    = decode("gsm0338", $gsm0338); # ditto
Packit d0f5c2
Packit d0f5c2
=head1 DESCRIPTION
Packit d0f5c2
Packit d0f5c2
GSM0338 is for GSM handsets. Though it shares alphanumerals with ASCII,
Packit d0f5c2
control character ranges and other parts are mapped very differently,
Packit d0f5c2
mainly to store Greek characters.  There are also escape sequences
Packit d0f5c2
(starting with 0x1B) to cover e.g. the Euro sign.
Packit d0f5c2
Packit d0f5c2
This was once handled by L<Encode::Bytes> but because of all those
Packit d0f5c2
unusual specifications, Encode 2.20 has relocated the support to
Packit d0f5c2
this module.
Packit d0f5c2
Packit d0f5c2
=head1 NOTES
Packit d0f5c2
Packit d0f5c2
Unlike most other encodings,  the following always croaks on error
Packit d0f5c2
for any $chk that evaluates to true.
Packit d0f5c2
Packit d0f5c2
  $gsm0338 = encode("gsm0338", $utf8      $chk);
Packit d0f5c2
  $utf8    = decode("gsm0338", $gsm0338,  $chk);
Packit d0f5c2
Packit d0f5c2
So if you want to check the validity of the encoding, surround the
Packit d0f5c2
expression with C<eval {}> block as follows;
Packit d0f5c2
Packit d0f5c2
  eval {
Packit d0f5c2
    $utf8    = decode("gsm0338", $gsm0338,  $chk);
Packit d0f5c2
  } or do {
Packit d0f5c2
    # handle exception here
Packit d0f5c2
  };
Packit d0f5c2
Packit d0f5c2
=head1 BUGS
Packit d0f5c2
Packit d0f5c2
ESTI GSM 03.38 Encoding itself.
Packit d0f5c2
Packit d0f5c2
Mapping \x00 to '@' causes too much pain everywhere.
Packit d0f5c2
Packit d0f5c2
Its use of \x1b (escape) is also very questionable.  
Packit d0f5c2
Packit d0f5c2
Because of those two, the code paging approach used use in ucm-based
Packit d0f5c2
Encoding SOMETIMES fails so this module was written.
Packit d0f5c2
Packit d0f5c2
=head1 SEE ALSO
Packit d0f5c2
Packit d0f5c2
L<Encode>
Packit d0f5c2
Packit d0f5c2
=cut