Blame bin/encguess

Packit d0f5c2
#!./perl
Packit d0f5c2
use 5.008001;
Packit d0f5c2
BEGIN { pop @INC if $INC[-1] eq '.' }
Packit d0f5c2
use strict;
Packit d0f5c2
use warnings;
Packit d0f5c2
use Encode;
Packit d0f5c2
use Getopt::Std;
Packit d0f5c2
use Carp;
Packit d0f5c2
use Encode::Guess;
Packit d0f5c2
$Getopt::Std::STANDARD_HELP_VERSION = 1;
Packit d0f5c2
Packit d0f5c2
# Command-line switches, filled in by getopts():
#   -h       show usage and exit
#   -u       do not report files whose encoding could not be guessed
#   -S       list every encoding name accepted by -s and exit
#   -s LIST  suspect encodings to try, separated by ':' or ','
my %opt;
getopts( "huSs:", \%opt );

# Suspect encodings passed on to guess_encoding(); empty means "use the
# defaults of Encode::Guess".
my @suspect_list;
list_valid_suspects() and exit if $opt{S};

# Use a character class so that ':' and ',' each act as a separator on
# their own, matching the documented "-s euc-jp,shiftjis" and
# "-s euc-jp:shiftjis" forms.
@suspect_list = split /[:,]/, $opt{s} if $opt{s};
HELP_MESSAGE() if $opt{h};
HELP_MESSAGE() unless @ARGV;
do_guess($_) for @ARGV;
Packit d0f5c2
Packit d0f5c2
# Return the entire contents of the named file as one string of raw bytes
# (the handle is opened with the :raw layer, so no decoding takes place).
# Croaks with "<filename>:<OS error>" if the file cannot be opened.
sub read_file {
    my ($path) = @_;
    open( my $in, '<:raw', $path ) or croak "$path:$!";

    # Undefine the record separator only for the duration of the read so
    # that a single readline returns the whole file.
    my $bytes = do { local $/; <$in> };
    close $in;
    return $bytes;
}
Packit d0f5c2
Packit d0f5c2
# Guess the encoding of one file and report the result on STDOUT.
#
# The file is read with read_file() and its bytes are handed to
# guess_encoding() together with the suspects collected from -s.  One line
# of the form "<filename>\t<encoding>\n" is printed.  When guess_encoding()
# does not return an object, the encoding is reported as "unknown", or the
# file is skipped silently if -u was given.
#
# Always returns 1.
sub do_guess {
    my $filename = shift;
    my $data     = read_file($filename);
    my $enc      = guess_encoding( $data, @suspect_list );

    my $label = 'unknown';
    if ( ref($enc) ) {

        # Not every encoding has a MIME name; mime_name() yields undef for
        # those.  Fall back to the encoding's canonical name instead of
        # printing undef (an empty field plus an "uninitialized" warning).
        $label = $enc->mime_name();
        $label = $enc->name() unless defined $label;
    }
    elsif ( $opt{u} ) {

        # -u: unidentified files produce no output at all.
        return 1;
    }

    print "$filename\t$label\n";
    return 1;
}
Packit d0f5c2
Packit d0f5c2
# Print the name of every encoding reported by Encode->encodings(":all")
# to STDOUT, one per line.  Always returns 1.
sub list_valid_suspects {
    my @names = Encode->encodings(":all");
    print join( "\n", @names ), "\n";
    return 1;
}
Packit d0f5c2
Packit d0f5c2
# Show the usage text by replacing this process with "pod2usage <script>",
# which renders the POD found at the end of this file.  Dies with
# "pod2usage: <OS error>" only if the exec itself fails; on success this
# sub never returns.
sub HELP_MESSAGE {
    my @usage_cmd = ( 'pod2usage', $0 );
    exec @usage_cmd or die "pod2usage: $!";
}
Packit d0f5c2
__END__
Packit d0f5c2
=head1 NAME
Packit d0f5c2
Packit d0f5c2
encguess - guess character encodings of files
Packit d0f5c2
Packit d0f5c2
=head1 VERSION
Packit d0f5c2
Packit d0f5c2
$Id: encguess,v 0.2 2016/08/04 03:15:58 dankogai Exp $
Packit d0f5c2
Packit d0f5c2
=head1 SYNOPSIS
Packit d0f5c2
Packit d0f5c2
  encguess [switches] filename...
Packit d0f5c2
Packit d0f5c2
=head2 SWITCHES
Packit d0f5c2
Packit d0f5c2
=over 2
Packit d0f5c2
Packit d0f5c2
=item -h
Packit d0f5c2
Packit d0f5c2
show this message and exit.
Packit d0f5c2
Packit d0f5c2
=item -s
Packit d0f5c2
Packit d0f5c2
specify a list of "suspect encoding types" to test, 
Packit d0f5c2
separated by either C<:> or C<,>
Packit d0f5c2
Packit d0f5c2
=item -S
Packit d0f5c2
Packit d0f5c2
output a list of all acceptable encoding types that can be used with
Packit d0f5c2
the -s param
Packit d0f5c2
Packit d0f5c2
=item -u
Packit d0f5c2
Packit d0f5c2
suppress display of unidentified types
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
=head2 EXAMPLES:
Packit d0f5c2
Packit d0f5c2
=over 2
Packit d0f5c2
Packit d0f5c2
=item *
Packit d0f5c2
Packit d0f5c2
Guess encoding of a file named C<test.txt>, using only the default
Packit d0f5c2
suspect types.
Packit d0f5c2
Packit d0f5c2
   encguess test.txt
Packit d0f5c2
Packit d0f5c2
=item *
Packit d0f5c2
Packit d0f5c2
Guess the encoding type of a file named C<test.txt>, using the suspect
Packit d0f5c2
types C<euc-jp,shiftjis,7bit-jis>.
Packit d0f5c2
Packit d0f5c2
   encguess -s euc-jp,shiftjis,7bit-jis test.txt
Packit d0f5c2
   encguess -s euc-jp:shiftjis:7bit-jis test.txt
Packit d0f5c2
Packit d0f5c2
=item *
Packit d0f5c2
Packit d0f5c2
Guess the encoding type of several files, do not display results for
Packit d0f5c2
unidentified files.
Packit d0f5c2
Packit d0f5c2
   encguess -us euc-jp,shiftjis,7bit-jis test*.txt
Packit d0f5c2
Packit d0f5c2
=back
Packit d0f5c2
Packit d0f5c2
=head1 DESCRIPTION
Packit d0f5c2
Packit d0f5c2
The encoding identification is done by checking one encoding type at a
Packit d0f5c2
time until all but the right type are eliminated. The set of encoding
Packit d0f5c2
types to try is defined by the -s parameter and defaults to ascii,
Packit d0f5c2
utf8 and UTF-16/32 with BOM. This can be overridden by passing one or
Packit d0f5c2
more encoding types via the -s parameter. If you need to pass in
Packit d0f5c2
multiple suspect encoding types, pass them as a single argument with
Packit d0f5c2
C<:> or C<,> separating each value, as shown in the examples above.
Packit d0f5c2
Packit d0f5c2
=head1 SEE ALSO
Packit d0f5c2
Packit d0f5c2
L<Encode::Guess>, L<Encode::Detect>
Packit d0f5c2
Packit d0f5c2
=head1 LICENSE AND COPYRIGHT
Packit d0f5c2
Packit d0f5c2
Copyright 2015 Michael LaGrasta and Dan Kogai.
Packit d0f5c2
Packit d0f5c2
This program is free software; you can redistribute it and/or modify it
Packit d0f5c2
under the terms of the Artistic License (2.0). You may obtain a
Packit d0f5c2
copy of the full license at:
Packit d0f5c2
Packit d0f5c2
L<http://www.perlfoundation.org/artistic_license_2_0>
Packit d0f5c2
Packit d0f5c2
=cut