Blame bin/piconv

Packit d0f5c2
#!./perl
Packit d0f5c2
# $Id: piconv,v 2.8 2016/08/04 03:15:58 dankogai Exp $
Packit d0f5c2
#
Packit d0f5c2
BEGIN { pop @INC if $INC[-1] eq '.' }
Packit d0f5c2
use 5.8.0;
Packit d0f5c2
use strict;
Packit d0f5c2
use Encode ;
Packit d0f5c2
use Encode::Alias;
Packit d0f5c2
my %Scheme =  map {$_ => 1} qw(from_to decode_encode perlio);
Packit d0f5c2
Packit d0f5c2
use File::Basename;
Packit d0f5c2
my $name = basename($0);
Packit d0f5c2
Packit d0f5c2
use Getopt::Long qw(:config no_ignore_case);
Packit d0f5c2
Packit d0f5c2
my %Opt;
Packit d0f5c2
Packit d0f5c2
help()
Packit d0f5c2
    unless
Packit d0f5c2
      GetOptions(\%Opt,
Packit d0f5c2
         'from|f=s',
Packit d0f5c2
         'to|t=s',
Packit d0f5c2
         'list|l',
Packit d0f5c2
         'string|s=s',
Packit d0f5c2
         'check|C=i',
Packit d0f5c2
         'c',
Packit d0f5c2
         'perlqq|p',
Packit d0f5c2
         'htmlcref',
Packit d0f5c2
         'xmlcref',
Packit d0f5c2
         'debug|D',
Packit d0f5c2
         'scheme|S=s',
Packit d0f5c2
         'resolve|r=s',
Packit d0f5c2
         'help',
Packit d0f5c2
         );
Packit d0f5c2
Packit d0f5c2
$Opt{help} and help();
Packit d0f5c2
$Opt{list} and list_encodings();
Packit d0f5c2
my $locale = $ENV{LC_CTYPE} || $ENV{LC_ALL} || $ENV{LANG};
Packit d0f5c2
defined $Opt{resolve} and resolve_encoding($Opt{resolve});
Packit d0f5c2
$Opt{from} || $Opt{to} || help();
Packit d0f5c2
my $from = $Opt{from} || $locale or help("from_encoding unspecified");
Packit d0f5c2
my $to   = $Opt{to}   || $locale or help("to_encoding unspecified");
Packit d0f5c2
$Opt{string} and Encode::from_to($Opt{string}, $from, $to) and print $Opt{string} and exit;
Packit d0f5c2
my $scheme = do {
Packit d0f5c2
    if (defined $Opt{scheme}) {
Packit d0f5c2
	if (!exists $Scheme{$Opt{scheme}}) {
Packit d0f5c2
	    warn "Unknown scheme '$Opt{scheme}', fallback to 'from_to'.\n";
Packit d0f5c2
	    'from_to';
Packit d0f5c2
	} else {
Packit d0f5c2
	    $Opt{scheme};
Packit d0f5c2
	}
Packit d0f5c2
    } else {
Packit d0f5c2
	'from_to';
Packit d0f5c2
    }
Packit d0f5c2
};
Packit d0f5c2
Packit d0f5c2
$Opt{check} ||= $Opt{c};
Packit d0f5c2
$Opt{perlqq}   and $Opt{check} = Encode::PERLQQ;
Packit d0f5c2
$Opt{htmlcref} and $Opt{check} = Encode::HTMLCREF;
Packit d0f5c2
$Opt{xmlcref}  and $Opt{check} = Encode::XMLCREF;
Packit d0f5c2
Packit d0f5c2
my $efrom = Encode->getEncoding($from) || die "Unknown encoding '$from'";
Packit d0f5c2
my $eto   = Encode->getEncoding($to)   || die "Unknown encoding '$to'";
Packit d0f5c2
Packit d0f5c2
my $cfrom = $efrom->name;
Packit d0f5c2
my $cto   = $eto->name;
Packit d0f5c2
Packit d0f5c2
if ($Opt{debug}){
Packit d0f5c2
    print <<"EOT";
Packit d0f5c2
Scheme: $scheme
Packit d0f5c2
From:   $from => $cfrom
Packit d0f5c2
To:     $to => $cto
Packit d0f5c2
EOT
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
my %use_bom =
Packit d0f5c2
  map { $_ => 1 } qw/UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE/;
Packit d0f5c2
Packit d0f5c2
# we do not use <> (or ARGV) for the sake of binmode()
Packit d0f5c2
@ARGV or push @ARGV, \*STDIN;
Packit d0f5c2
Packit d0f5c2
unless ( $scheme eq 'perlio' ) {
Packit d0f5c2
    binmode STDOUT;
Packit d0f5c2
    my $need2slurp = $use_bom{ $eto } || $use_bom{ $efrom };
Packit d0f5c2
    for my $argv (@ARGV) {
Packit d0f5c2
        my $ifh = ref $argv ? $argv : undef;
Packit d0f5c2
	$ifh or open $ifh, "<", $argv or warn "Can't open $argv: $!" and next;
Packit d0f5c2
        $ifh or open $ifh, "<", $argv or next;
Packit d0f5c2
        binmode $ifh;
Packit d0f5c2
        if ( $scheme eq 'from_to' ) {    # default
Packit d0f5c2
	    if ($need2slurp){
Packit d0f5c2
		local $/;
Packit d0f5c2
		$_ = <$ifh>;
Packit d0f5c2
		Encode::from_to( $_, $from, $to, $Opt{check} );
Packit d0f5c2
		print;
Packit d0f5c2
	    }else{
Packit d0f5c2
		while (<$ifh>) {
Packit d0f5c2
		    Encode::from_to( $_, $from, $to, $Opt{check} );
Packit d0f5c2
		    print;
Packit d0f5c2
		}
Packit d0f5c2
	    }
Packit d0f5c2
        }
Packit d0f5c2
        elsif ( $scheme eq 'decode_encode' ) {    # step-by-step
Packit d0f5c2
	    if ($need2slurp){
Packit d0f5c2
		local $/;
Packit d0f5c2
		$_ = <$ifh>;
Packit d0f5c2
                my $decoded = decode( $from, $_, $Opt{check} );
Packit d0f5c2
                my $encoded = encode( $to, $decoded );
Packit d0f5c2
                print $encoded;
Packit d0f5c2
	    }else{
Packit d0f5c2
		while (<$ifh>) {
Packit d0f5c2
		    my $decoded = decode( $from, $_, $Opt{check} );
Packit d0f5c2
		    my $encoded = encode( $to, $decoded );
Packit d0f5c2
		    print $encoded;
Packit d0f5c2
		}
Packit d0f5c2
	    }
Packit d0f5c2
	}
Packit d0f5c2
	else {                                    # won't reach
Packit d0f5c2
            die "$name: unknown scheme: $scheme";
Packit d0f5c2
        }
Packit d0f5c2
    }
Packit d0f5c2
}
Packit d0f5c2
else {
Packit d0f5c2
Packit d0f5c2
    # NI-S favorite
Packit d0f5c2
    binmode STDOUT => "raw:encoding($to)";
Packit d0f5c2
    for my $argv (@ARGV) {
Packit d0f5c2
        my $ifh = ref $argv ? $argv : undef;
Packit d0f5c2
	$ifh or open $ifh, "<", $argv or warn "Can't open $argv: $!" and next;
Packit d0f5c2
        $ifh or open $ifh, "<", $argv or next;
Packit d0f5c2
        binmode $ifh => "raw:encoding($from)";
Packit d0f5c2
        print while (<$ifh>);
Packit d0f5c2
    }
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub list_encodings {
Packit d0f5c2
    print join( "\n", Encode->encodings(":all") ), "\n";
Packit d0f5c2
    exit 0;
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub resolve_encoding {
Packit d0f5c2
    if ( my $alias = Encode::resolve_alias( $_[0] ) ) {
Packit d0f5c2
        print $alias, "\n";
Packit d0f5c2
        exit 0;
Packit d0f5c2
    }
Packit d0f5c2
    else {
Packit d0f5c2
        warn "$name: $_[0] is not known to Encode\n";
Packit d0f5c2
        exit 1;
Packit d0f5c2
    }
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
sub help {
Packit d0f5c2
    my $message = shift;
Packit d0f5c2
    $message and print STDERR "$name error: $message\n";
Packit d0f5c2
    print STDERR <<"EOT";
Packit d0f5c2
$name [-f from_encoding] [-t to_encoding]
Packit d0f5c2
      [-p|--perlqq|--htmlcref|--xmlcref] [-C N|-c] [-D] [-S scheme]
Packit d0f5c2
      [-s string|file...]
Packit d0f5c2
$name -l
Packit d0f5c2
$name -r encoding_alias
Packit d0f5c2
$name -h
Packit d0f5c2
Common options:
Packit d0f5c2
  -l,--list
Packit d0f5c2
     lists all available encodings
Packit d0f5c2
  -r,--resolve encoding_alias
Packit d0f5c2
    resolve encoding to its (Encode) canonical name
Packit d0f5c2
  -f,--from from_encoding  
Packit d0f5c2
     when omitted, the current locale will be used
Packit d0f5c2
  -t,--to to_encoding    
Packit d0f5c2
     when omitted, the current locale will be used
Packit d0f5c2
  -s,--string string         
Packit d0f5c2
     "string" will be the input instead of STDIN or files
Packit d0f5c2
The following are mainly of interest to Encode hackers:
Packit d0f5c2
  -C N | -c           check the validity of the input
Packit d0f5c2
  -D,--debug          show debug information
Packit d0f5c2
  -S,--scheme scheme  use the scheme for conversion
Packit d0f5c2
Those are handy when you can only see ASCII characters:
Packit d0f5c2
  -p,--perlqq         transliterate characters missing in encoding to \\x{HHHH}
Packit d0f5c2
                      where HHHH is the hexadecimal Unicode code point
Packit d0f5c2
  --htmlcref          transliterate characters missing in encoding to &#NNN;
Packit d0f5c2
                      where NNN is the decimal Unicode code point
Packit d0f5c2
  --xmlcref           transliterate characters missing in encoding to &#xHHHH;
Packit d0f5c2
                      where HHHH is the hexadecimal Unicode code point
Packit d0f5c2
Packit d0f5c2
EOT
Packit d0f5c2
    exit;
Packit d0f5c2
}
Packit d0f5c2
Packit d0f5c2
__END__
Packit d0f5c2
Packit d0f5c2
=head1 NAME
Packit d0f5c2
Packit d0f5c2
piconv -- iconv(1), reinvented in perl
Packit d0f5c2
Packit d0f5c2
=head1 SYNOPSIS
Packit d0f5c2
Packit d0f5c2
  piconv [-f from_encoding] [-t to_encoding]
Packit d0f5c2
         [-p|--perlqq|--htmlcref|--xmlcref] [-C N|-c] [-D] [-S scheme]
Packit d0f5c2
         [-s string|file...]
Packit d0f5c2
  piconv -l
Packit d0f5c2
  piconv -r encoding_alias
Packit d0f5c2
  piconv -h
Packit d0f5c2
Packit d0f5c2
=head1 DESCRIPTION
Packit d0f5c2
Packit d0f5c2
B<piconv> is perl version of B<iconv>, a character encoding converter
Packit d0f5c2
widely available for various Unixen today.  This script was primarily
Packit d0f5c2
a technology demonstrator for Perl 5.8.0, but you can use piconv in the
Packit d0f5c2
place of iconv for virtually any case.
Packit d0f5c2
Packit d0f5c2
piconv converts the character encoding of either STDIN or files
Packit d0f5c2
specified in the argument and prints out to STDOUT.
Packit d0f5c2
Packit d0f5c2
Here is the list of options.  Some options can be in short format (-f)
Packit d0f5c2
or long (--from) one.
Packit d0f5c2
Packit d0f5c2
=over 4
Packit d0f5c2
Packit d0f5c2
=item -f,--from I<from_encoding>
Packit d0f5c2
Packit d0f5c2
Specifies the encoding you are converting from.  Unlike B<iconv>,
Packit d0f5c2
this option can be omitted.  In such cases, the current locale is used.
Packit d0f5c2
Packit d0f5c2
=item -t,--to I<to_encoding>
Packit d0f5c2
Packit d0f5c2
Specifies the encoding you are converting to.  Unlike B<iconv>,
Packit d0f5c2
this option can be omitted.  In such cases, the current locale is used.
Packit d0f5c2
Packit d0f5c2
Therefore, when both -f and -t are omitted, B<piconv> just acts
Packit d0f5c2
like B<cat>.
Packit d0f5c2
Packit d0f5c2
=item -s,--string I<string>
Packit d0f5c2
Packit d0f5c2
uses I<string> instead of file for the source of text.
Packit d0f5c2
Packit d0f5c2
=item -l,--list
Packit d0f5c2
Packit d0f5c2
Lists all available encodings, one per line, in case-insensitive
Packit d0f5c2
order.  Note that only the canonical names are listed; many aliases
Packit d0f5c2
exist.  For example, the names are case-insensitive, and many standard
Packit d0f5c2
and common aliases work, such as "latin1" for "ISO-8859-1", or "ibm850"
Packit d0f5c2
instead of "cp850", or "winlatin1" for "cp1252".  See L<Encode::Supported>
Packit d0f5c2
for a full discussion.
Packit d0f5c2
Packit d0f5c2
=item -r,--resolve I<encoding_alias>
Packit d0f5c2
Packit d0f5c2
Resolve I<encoding_alias> to Encode canonical encoding name.
Packit d0f5c2
Packit d0f5c2
=item -C,--check I<N>
Packit d0f5c2
Packit d0f5c2
Check the validity of the stream if I<N> = 1.  When I<N> = -1, something
Packit d0f5c2
interesting happens when it encounters an invalid character.
Packit d0f5c2
Packit d0f5c2
=item -c
Packit d0f5c2
Packit d0f5c2
Same as C<-C 1>.
Packit d0f5c2
Packit d0f5c2
=item -p,--perlqq
Packit d0f5c2
Packit d0f5c2
Transliterate characters missing in encoding to \x{HHHH} where HHHH is the
Packit d0f5c2
hexadecimal Unicode code point.
Packit d0f5c2
Packit d0f5c2
=item --htmlcref
Packit d0f5c2
Packit d0f5c2
Transliterate characters missing in encoding to &#NNN; where NNN is the
Packit d0f5c2
decimal Unicode code point.
Packit d0f5c2
Packit d0f5c2
=item --xmlcref
Packit d0f5c2
Packit d0f5c2
Transliterate characters missing in encoding to &#xHHHH; where HHHH is the
Packit d0f5c2
hexadecimal Unicode code point.
Packit d0f5c2
Packit d0f5c2
=item -h,--help
Packit d0f5c2
Packit d0f5c2
Show usage.
Packit d0f5c2
Packit d0f5c2
=item -D,--debug
Packit d0f5c2
Packit d0f5c2
Invokes debugging mode.  Primarily for Encode hackers.
Packit d0f5c2
Packit d0f5c2
=item -S,--scheme I<scheme>
Packit d0f5c2
Packit d0f5c2
Selects which scheme is to be used for conversion.  Available schemes
Packit d0f5c2
are as follows:
Packit d0f5c2
Packit d0f5c2
=over 4
Packit d0f5c2
Packit d0f5c2
=item from_to
Packit d0f5c2
Packit d0f5c2
Uses Encode::from_to for conversion.  This is the default.
Packit d0f5c2
Packit d0f5c2
=item decode_encode
Packit d0f5c2
Packit d0f5c2
Input strings are decode()d then encode()d.  A straight two-step
Packit d0f5c2
implementation.
Packit d0f5c2
Packit d0f5c2
=item perlio
Packit d0f5c2
Packit d0f5c2
The new perlIO layer is used.  NI-S' favorite.
Packit d0f5c2
Packit d0f5c2
You should use this option if you are using UTF-16 and others which
Packit d0f5c2
linefeed is not $/.
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
Like the I<-D> option, this is also for Encode hackers.
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
=head1 SEE ALSO
Packit d0f5c2
Packit d0f5c2
L<iconv(1)>
Packit d0f5c2
L<locale(3)>
Packit d0f5c2
L<Encode>
Packit d0f5c2
L<Encode::Supported>
Packit d0f5c2
L<Encode::Alias>
Packit d0f5c2
L<PerlIO>
Packit d0f5c2
Packit d0f5c2
=cut