Blame internal/unicode

Packit 95306a
#!/usr/bin/perl
Packit 95306a
Packit 95306a
# http://perlmonks.org/?node_id=1084067
Packit 95306a
# Script by Jim
Packit 95306a
#use v5.10;
Packit 95306a
use strict;
Packit 95306a
use warnings;
Packit 95306a
use utf8;
Packit 95306a
Packit 95306a
use Encode qw( encode_utf8 );
Packit 95306a
use Unicode::UCD qw( charblock );
Packit 95306a
Packit 95306a
binmode STDOUT, ':encoding(UTF-8)';
Packit 95306a
Packit 95306a
while (my $word = <DATA>) {
Packit 95306a
    chomp $word;
Packit 95306a
Packit 95306a
    my $length_in_bytes       = length_in_bytes($word);
Packit 95306a
    my $length_in_code_points = length_in_code_points($word);
Packit 95306a
    my $length_in_graphemes   = length_in_graphemes($word);
Packit 95306a
    my $code_points_in_blocks = code_points_in_blocks($word);
Packit 95306a
Packit 95306a
    printf "%-12s | Bytes: %2d | Code Points: %2d | Graphemes: %2d | Blocks: %s\n",
Packit 95306a
        $word,
Packit 95306a
        $length_in_bytes,
Packit 95306a
        $length_in_code_points,
Packit 95306a
        $length_in_graphemes,
Packit 95306a
        $code_points_in_blocks;
Packit 95306a
}
Packit 95306a
Packit 95306a
exit 0;
Packit 95306a
Packit 95306a
sub length_in_bytes {
Packit 95306a
    my $word = shift;
Packit 95306a
Packit 95306a
    my $length = length encode_utf8($word);
Packit 95306a
Packit 95306a
    return $length;
Packit 95306a
}
Packit 95306a
Packit 95306a
sub length_in_code_points {
Packit 95306a
    my $word = shift;
Packit 95306a
Packit 95306a
    my $length = length $word;
Packit 95306a
Packit 95306a
    return $length;
Packit 95306a
}
Packit 95306a
Packit 95306a
sub length_in_graphemes {
Packit 95306a
    my $word = shift;
Packit 95306a
Packit 95306a
    my $length = () = $word =~ m/\X/g;
Packit 95306a
Packit 95306a
    return $length;
Packit 95306a
}
Packit 95306a
Packit 95306a
sub code_points_in_blocks {
Packit 95306a
    my $word = shift;
Packit 95306a
Packit 95306a
    my %total_code_points_by;
Packit 95306a
    my $blocks = '';
Packit 95306a
Packit 95306a
    for my $character (split m//, $word) {
Packit 95306a
        my $block = charblock(ord $character);
Packit 95306a
Packit 95306a
        $total_code_points_by{$block}++;
Packit 95306a
    }
Packit 95306a
Packit 95306a
    for my $block (sort keys %total_code_points_by) {
Packit 95306a
        my $total = $total_code_points_by{$block};
Packit 95306a
Packit 95306a
        $blocks .= sprintf "%s%s (%d)",
Packit 95306a
                   (length $blocks ? ', ' : ''), $block, $total;
Packit 95306a
    }
Packit 95306a
Packit 95306a
    return $blocks;
Packit 95306a
}
Packit 95306a
Packit 95306a
__DATA__
Packit 95306a
æ
Packit 95306a
æð
Packit 95306a
æða
Packit 95306a
æðaber
Packit 95306a
æðahnútur
Packit 95306a
æðakölkun
Packit 95306a
æðardúnn
Packit 95306a
æðarfugl
Packit 95306a
æðarkolla
Packit 95306a
æðarkóngur
Packit 95306a
æðarvarp
Packit 95306a
æði
Packit 95306a
æðimargur
Packit 95306a
æðisgenginn
Packit 95306a
æðiskast
Packit 95306a
æðislegur
Packit 95306a
æðrast
Packit 95306a
æðri
Packit 95306a
æðrulaus
Packit 95306a
æðruleysi
Packit 95306a
æðruorð
Packit 95306a
æðrutónn
Packit 95306a
æðstur
Packit 95306a
æður
Packit 95306a
æfa