Blame convmv

Packit 244994
#!/usr/bin/perl
Packit 244994
# convmv 2.01 - converts filenames from one encoding to another
Packit 244994
# Copyright © 2003-2017 Bjoern JACKE <bjoern@j3e.de>
Packit 244994
#
Packit 244994
# This program comes with ABSOLUTELY NO WARRANTY; it may be copied or modified
Packit 244994
# under the terms of the GNU General Public License version 2 or 3 as
Packit 244994
# published by the Free Software Foundation.
Packit 244994
Packit 244994
# to get a man page:
Packit 244994
# pod2man --section 1 --center=" " convmv | gzip > convmv.1.gz
Packit 244994
Packit 244994
Packit 244994
=head1 NAME
Packit 244994
Packit 244994
convmv - converts filenames from one encoding to another
Packit 244994
Packit 244994
=head1 SYNOPSIS
Packit 244994
Packit 244994
B<convmv> [B<options>] FILE(S) ... DIRECTORY(S)
Packit 244994
Packit 244994
=head1 OPTIONS
Packit 244994
Packit 244994
=over 4
Packit 244994
Packit 244994
=item B<-f ENCODING>
Packit 244994
Packit 244994
specify the current encoding of the filename(s), i.e. the encoding to convert from
Packit 244994
Packit 244994
=item B<-t ENCODING>
Packit 244994
Packit 244994
specify the encoding to which the filename(s) should be converted
Packit 244994
Packit 244994
=item B<-i>
Packit 244994
Packit 244994
interactive mode (ask y/n for each action)
Packit 244994
Packit 244994
=item B<-r>
Packit 244994
Packit 244994
recursively go through directories
Packit 244994
Packit 244994
=item B<--nfc>
Packit 244994
Packit 244994
target files will be normalization form C for UTF-8 (Linux etc.)
Packit 244994
Packit 244994
=item B<--nfd>
Packit 244994
Packit 244994
target files will be normalization form D for UTF-8 (OS X etc.).
Packit 244994
Packit 244994
=item B<--qfrom> , B<--qto>
Packit 244994
Packit 244994
be more quiet about the "from" or "to" of a rename (if it screws up your
Packit 244994
terminal e.g.). This will in fact do nothing else than replace any non-ASCII
Packit 244994
character (bytewise) with ? and any control character with * on printout, this
Packit 244994
does not affect rename operation itself.
Packit 244994
Packit 244994
=item B<--exec> command
Packit 244994
Packit 244994
execute the given command. You have to quote the command and #1 will be
Packit 244994
substituted by the old, #2 by the new filename. Using this option link 
Packit 244994
targets will stay untouched. Have in mind that #1 and #2 will be quoted
Packit 244994
by convmv already, you must not add extra quotation marks around them.
Packit 244994
Packit 244994
Example:
Packit 244994
Packit 244994
convmv -f latin1 -t utf-8 -r --exec "echo #1 should be renamed to #2" path/to/files
Packit 244994
Packit 244994
=item B<--list>
Packit 244994
Packit 244994
list all available encodings. To get support for more Chinese or Japanese
Packit 244994
encodings install the Perl HanExtra or JIS2K Encode packages.
Packit 244994
Packit 244994
=item B<--lowmem>
Packit 244994
Packit 244994
keep memory footprint low by not creating a hash of all files. This disables
Packit 244994
checking if symlink targets are in subtree. Symlink target pointers will be
Packit 244994
converted regardless. If you convert several hundred thousand or millions of
Packit 244994
files the memory usage of convmv might grow quite high. This option would help
Packit 244994
you out in that case.
Packit 244994
Packit 244994
=item B<--nosmart>
Packit 244994
Packit 244994
by default convmv will detect if a filename is already UTF8 encoded and will
Packit 244994
skip this file if conversion from some charset to UTF8 should be performed.
Packit 244994
C<--nosmart> will also force conversion to UTF-8 for such files, which might
Packit 244994
result in "double encoded UTF-8" (see section below).
Packit 244994
Packit 244994
=item B<--fixdouble>
Packit 244994
Packit 244994
using the C<--fixdouble> option convmv does only convert files which will still
Packit 244994
be UTF-8 encoded after conversion. That's useful for fixing double-encoded
Packit 244994
UTF-8 files. All files which are not UTF-8 or will not result in UTF-8 after
Packit 244994
conversion will not be touched. Also see chapter "How to undo double UTF-8 ..."
Packit 244994
below.
Packit 244994
Packit 244994
=item B<--notest>
Packit 244994
Packit 244994
Needed to actually rename the files. By default convmv will just print what it
Packit 244994
wants to do.
Packit 244994
Packit 244994
=item B<--parsable>
Packit 244994
Packit 244994
This is an advanced option that people who want to write a GUI front end will
Packit 244994
find useful (some others maybe, too). It will make convmv print out what it
Packit 244994
would do in an easy parsable way. The first column contains the action or some
Packit 244994
kind of information, the second column mostly contains the file that is to be
Packit 244994
modified and if appropriate the third column contains the modified value.  Each
Packit 244994
column is separated by \0\n (nullbyte newline). Each row (one action) is
Packit 244994
separated by \0\0\n (nullbyte nullbyte newline).
Packit 244994
Packit 244994
=item B<--no-preserve-mtimes>
Packit 244994
Packit 244994
modifying filenames usually causes the parent directory's mtime being updated.
Packit 244994
Since version 2 convmv by default resets the mtime to the old value. If your
Packit 244994
filesystem supports sub-second resolution the sub-second part of the atime and
Packit 244994
mtime will be lost as Perl does not yet support that. With this option you can
Packit 244994
B<disable> the preservation of the mtimes.
Packit 244994
Packit 244994
=item B<--replace>
Packit 244994
Packit 244994
if the file to which shall be renamed already exists, it will be overwritten if
Packit 244994
the other file content is equal.
Packit 244994
Packit 244994
=item B<--unescape>
Packit 244994
Packit 244994
this option will remove these ugly % hex sequences from filenames and turn them
Packit 244994
into (hopefully) nicer 8-bit characters. After --unescape you might want to do
Packit 244994
a charset conversion. These sequences like %20 etc. are sometimes produced when
Packit 244994
downloading via http or ftp.
Packit 244994
Packit 244994
=item B<--upper> , B<--lower>
Packit 244994
Packit 244994
turn filenames into all upper or all lower case. When the file is not
Packit 244994
ASCII-encoded, convmv expects a charset to be entered via the -f switch.
Packit 244994
Packit 244994
=item B<--map=>some-extra-mapping
Packit 244994
Packit 244994
apply some custom character mappings, currently supported are:
Packit 244994
Packit 244994
ntfs-sfm(-undo), ntfs-sfu(-undo) for the mapping of illegal ntfs characters for
Packit 244994
Linux or Macintosh cifs clients (see MS KB 117258 also mapchars mount option of
Packit 244994
mount.cifs on Linux).
Packit 244994
Packit 244994
ntfs-pretty(-undo) for the mapping of illegal ntfs characters to pretty
Packit 244994
legal Japanese versions of them.
Packit 244994
Packit 244994
See the map_get_newname() function how to easily add own mappings if needed.
Packit 244994
Let me know if you think convmv is missing some useful mapping here.
Packit 244994
Packit 244994
=item B<--dotlessi>
Packit 244994
Packit 244994
care about the dotless i/I issue. A lowercase version of "I" will also be
Packit 244994
dotless while an uppercase version of "i" will also be dotted. This is an
Packit 244994
issue for Turkish and Azeri.
Packit 244994
Packit 244994
By the way: The superscript dot of the letter i was added in the Middle Ages to
Packit 244994
distinguish the letter (in manuscripts) from adjacent vertical strokes in such
Packit 244994
letters as u, m, and n. J is a variant form of i which emerged at this time and
Packit 244994
subsequently became a separate letter.
Packit 244994
Packit 244994
=item B<--help>
Packit 244994
Packit 244994
print a short summary of available options
Packit 244994
Packit 244994
=item B<--dump-options>
Packit 244994
Packit 244994
print a list of all available options
Packit 244994
Packit 244994
=back
Packit 244994
Packit 244994
=head1 DESCRIPTION
Packit 244994
Packit 244994
B<convmv> is meant to help convert a single filename, a directory tree and the
Packit 244994
contained files or a whole filesystem into a different encoding. It just
Packit 244994
converts the filenames, not the content of the files. A special feature of
Packit 244994
convmv is that it also takes care of symlinks, also converts the symlink target
Packit 244994
pointer in case the symlink target is being converted, too.
Packit 244994
Packit 244994
All this comes in very handy when one wants to switch over from old 8-bit
Packit 244994
locales to UTF-8 locales. It is also possible to convert directories to UTF-8
Packit 244994
which are already partly UTF-8 encoded. convmv is able to detect if certain
Packit 244994
files are UTF-8 encoded and will skip them by default. To turn this smartness
Packit 244994
off use the C<--nosmart> switch.
Packit 244994
Packit 244994
=head2 Filesystem issues
Packit 244994
Packit 244994
Almost all POSIX filesystems do not care about how filenames are encoded, here
Packit 244994
are some exceptions:
Packit 244994
Packit 244994
=head3 HFS+ on OS X / Darwin
Packit 244994
Packit 244994
Linux and (most?) other Unix-like operating systems use the so called
Packit 244994
normalization form C (NFC) for its UTF-8 encoding by default but do not enforce
Packit 244994
this.  Darwin, the base of the Macintosh OS enforces normalization form D
Packit 244994
(NFD), where a few characters are encoded in a different way. On OS X it's not
Packit 244994
possible to create NFC UTF-8 filenames because this is prevented at filesystem
Packit 244994
layer.  On HFS+ filenames are internally stored in UTF-16 and when converted
Packit 244994
back to UTF-8 so that the underlying BSD system can handle them, NFD is created.
Packit 244994
See http://developer.apple.com/qa/qa2001/qa1173.html for details. I think it
Packit 244994
was a very bad idea and breaks many things under OS X which expect a normal
Packit 244994
POSIX conforming system. Anywhere else convmv is able to convert files from NFC
Packit 244994
to NFD or vice versa which makes interoperability with such systems a lot
Packit 244994
easier.
Packit 244994
Packit 244994
=head3 JFS
Packit 244994
Packit 244994
If people mount JFS partitions with iocharset=utf8, there is a similar problem,
Packit 244994
because JFS is designed to store filenames internally in UTF-16, too; that is
Packit 244994
because Linux' JFS is really JFS2, which was a rewrite of JFS for OS/2. JFS
Packit 244994
partitions should always be mounted with iocharset=iso8859-1, which is also the
Packit 244994
default with recent 2.6.6 kernels. If this is not done, JFS does not behave
Packit 244994
like a POSIX filesystem and it might happen that certain files cannot be
Packit 244994
created at all, for example filenames in ISO-8859-1 encoding. Only when
Packit 244994
interoperation with OS/2 is needed iocharset should be set according to your
Packit 244994
used locale charmap.
Packit 244994
Packit 244994
=head3 NFS4
Packit 244994
Packit 244994
Unlike other POSIX filesystems, RFC3530 (NFS 4) mandates UTF-8 but also says:
Packit 244994
"The nfs4_cs_prep profile does not specify a normalization form.  A later
Packit 244994
revision of this specification may specify a particular normalization form." In
Packit 244994
other words, if you want to use NFS4 you might find the conversion and
Packit 244994
normalization features of convmv quite useful.
Packit 244994
Packit 244994
=head3 FAT/VFAT and NTFS
Packit 244994
Packit 244994
NTFS and VFAT (for long filenames) use UTF-16 internally to store filenames.
Packit 244994
You should not need to convert filenames if you mount one of those filesystems.
Packit 244994
Use appropriate mount options instead!
Packit 244994
Packit 244994
=head2 How to undo double UTF-8 (or other) encoded filenames
Packit 244994
Packit 244994
Sometimes it might happen that you "double-encoded" certain filenames, for
Packit 244994
example the file names already were UTF-8 encoded and you accidentally did
Packit 244994
another conversion from some charset to UTF-8. You can simply undo that by
Packit 244994
converting that the other way round. The from-charset has to be UTF-8 and the
Packit 244994
to-charset has to be the from-charset you previously accidentally used.  If you
Packit 244994
use the C<--fixdouble> option convmv will make sure that only files will be
Packit 244994
processed that will still be UTF-8 encoded after conversion and it will leave
Packit 244994
non-UTF-8 files untouched. You should check to get the correct results by doing
Packit 244994
the conversion without C<--notest> before, also the C<--qfrom> option might be
Packit 244994
helpful, because the double utf-8 file names might screw up your terminal if
Packit 244994
they are being printed - they often contain control sequences which do funny
Packit 244994
things with your terminal window. If you are not sure about the charset which
Packit 244994
was accidentally converted from, using C<--qfrom> is a good way to figure out the
Packit 244994
required encoding without destroying the file names finally.
Packit 244994
Packit 244994
=head2 How to repair Samba files
Packit 244994
Packit 244994
When in the smb.conf (of Samba 2.x) there hasn't been set a correct "character
Packit 244994
set" variable, files which are created from Win* clients are being created in
Packit 244994
the client's codepage, e.g. cp850 for western european languages. As a result
Packit 244994
of that the files which contain non-ASCII characters are screwed up if you "ls"
Packit 244994
them on the Unix server. If you change the "character set" variable afterwards
Packit 244994
to iso8859-1, newly created files are okay, but the old files are still screwed
Packit 244994
up in the Windows encoding. In this case convmv can also be used to convert the
Packit 244994
old Samba-shared files from cp850 to iso8859-1.
Packit 244994
Packit 244994
By the way: Samba 3.x finally maps to UTF-8 filenames by default, so also when
Packit 244994
you migrate from Samba 2 to Samba 3 you might have to convert your file names.
Packit 244994
Packit 244994
=head2 Netatalk interoperability issues
Packit 244994
Packit 244994
When Netatalk is being switched to UTF-8 which is supported in version 2 then
Packit 244994
it is NOT sufficient to rename the file names. There needs to be done more. See
Packit 244994
http://netatalk.sourceforge.net/2.0/htmldocs/upgrade.html#volumes-and-filenames
Packit 244994
and the uniconv utility of Netatalk for details.
Packit 244994
Packit 244994
=head1 SEE ALSO
Packit 244994
Packit 244994
L<locale(1)> L<utf-8(7)> L<charsets(7)>
Packit 244994
Packit 244994
=head1 BUGS
Packit 244994
Packit 244994
no bugs or fleas known
Packit 244994
Packit 244994
=head1 DONATE
Packit 244994
Packit 244994
You can support convmv by doing a donation, see L<https://www.j3e.de/donate.html>
Packit 244994
Packit 244994
=head1 AUTHOR
Packit 244994
Packit 244994
Bjoern JACKE
Packit 244994
 
Packit 244994
Send mail to bjoern [at] j3e.de for bug reports and suggestions.
Packit 244994
Packit 244994
=cut
Packit 244994
Packit 244994
require 5.008;
Packit 244994
use Getopt::Long;
Packit 244994
use File::Find;
Packit 244994
use File::Basename;
Packit 244994
use Cwd;
Packit 244994
use Encode 'from_to','encode_utf8','decode_utf8','_utf8_on','_utf8_off';
Packit 244994
#use Encode 'is_utf8';
Packit 244994
use Unicode::Normalize;
Packit 244994
use utf8;
Packit 244994
use bytes;
Packit 244994
Packit 244994
# Allow single-letter switches to be bundled (e.g. -ri).
Getopt::Long::Configure("bundling");
# No encoding layer on the output streams.
binmode(STDOUT, ":bytes");
binmode(STDERR, ":bytes");

# default 1 since convmv 2.0; --no-preserve-mtimes switches it off
my $opt_mtimes = 1;

# Getopt::Long specification => variable that receives the parsed value.
my %opts = (
		# charset conversion and normalization
		'f=s'              => \$opt_f,
		't=s'              => \$opt_t,
		'nfc'              => \$opt_nfc,
		'nfd'              => \$opt_nfd,
		'nosmart'          => \$opt_nosmart,
		'fixdouble'        => \$opt_fixdouble,
		# alternative renaming modes
		'unescape'         => \$opt_unescape,
		'upper'            => \$opt_upper,
		'lower'            => \$opt_lower,
		'dotlessi'         => \$opt_dotlessi,
		'map=s'            => \$opt_map,
		# how to walk and how to act
		'r'                => \$opt_r,
		'i'                => \$opt_i,
		'notest'           => \$opt_notest,
		'replace'          => \$opt_replace,
		'exec=s'           => \$opt_exec,
		'lowmem'           => \$opt_lowmem,
		'preserve-mtimes!' => \$opt_mtimes,
		'undo-script=s'    => \$opt_undo_script,
		# output and information
		'qfrom'            => \$opt_qfrom,
		'qto'              => \$opt_qto,
		'parsable'         => \$opt_parsable,
		'list'             => \$opt_list,
		'help'             => \$opt_help,
		'dump-options'     => \$opt_dumpoptions,
		);
GetOptions(%opts) or exit 1;
use File::Compare;

# run-wide state
$errors_occurred   = 0;
$warnings_occurred = 0;
$ops               = 0;
$mytime            = time();
$maxfilenamelength = 255;
# $maxpathlength=4096; # this might be used somehow, somewhere?

%dir_time_hash = ();
my $this_is_valid_utf8;

# delimiter and final delimiter for parsable mode:
$del     = "\0\n";
$fin_del = "\0\0\n";

# informational modes: do their job and leave
if ($opt_list) {
	&listvalidencodings and exit 0;
}
if ($opt_dumpoptions) {
	&dumpoptions and exit 0;
}
unless (@ARGV and not $opt_help) {
	&printusage and exit 1;
}
Packit 244994
Packit 244994
&check_for_broken_perl_release();

# --parsable cannot be combined with the options that act or prompt.
if ($opt_parsable and ($opt_notest or $opt_exec or $opt_i)) {
	die "--parsable mode cannot be used with --notest, --exec or -i\n";
}

if ($opt_replace and $opt_undo_script) {
	die "--replace and --undo-script can't work together!\n";
}

# --map excludes every other conversion mode. This has to be tested before the
# mode chain below: there the --map branch is only reached when --unescape,
# --upper and --lower are all unset, so a conflicting --map would otherwise be
# ignored silently instead of being rejected.
if ($opt_map and ($opt_t or $opt_f or $opt_upper or $opt_lower or $opt_unescape)) {
	die "--map parameter not allowed with other character conversion parameters\n";
}

# Pick the pair of callbacks for the selected mode:
# $checkenc is run on every name in the scan pass, $get_newname computes the new name.
if ($opt_unescape) {
	die "No charset conversion when unescaping!\n" if ($opt_f or $opt_t);
	$checkenc=\&unescape_checkenc;
	$get_newname=\&unescape_get_newname;
} elsif ($opt_upper or $opt_lower) {
	die "No charset conversion when uppering/lowering!\n" if ($opt_t);
	die "Not possible to --upper and --lower at the same time!\n" if ($opt_upper and $opt_lower);
	$checkenc=\&upperlower_checkenc;
	$get_newname=\&upperlower_get_newname;
	$opt_f="ascii" unless ($opt_f);
} elsif ($opt_map) {
	$checkenc=\&dummy;
	$get_newname=\&map_get_newname;
} else {
	# plain charset conversion: both encodings must be known to Encode;
	# the options are replaced by their resolved names
	if (not ($opt_f and $opt_f=Encode::resolve_alias($opt_f))) {
		die "wrong/unknown \"from\" encoding!\n";
	}
	if (not ($opt_t and $opt_t=Encode::resolve_alias($opt_t))) {
		die "wrong/unknown \"to\" encoding!\n";
	}
	if ($opt_fixdouble) {
		$checkenc=\&fixdouble_checkenc;
	} else {
		$checkenc=\&char_checkenc;
	}
	$get_newname=\&char_get_newname;
}
Packit 244994
# true if the target / source encoding name starts with "utf8" or "utf-8"
$to_is_utf8 = lc($opt_t) =~ m/^utf-?8/;
$from_is_utf8 = lc($opt_f) =~ m/^utf-?8/;

# --qfrom / --qto route the printed old / new name through to_ascii
if ($opt_qfrom) {
	$from_print=\&to_ascii;
} else {
	$from_print=\&dummy;
}

if ($opt_qto) {
	$to_print=\&to_ascii;
} else {
	$to_print=\&dummy;
}

# $norm is applied to the decoded new name when the target is UTF-8
if ($opt_nfc) {
	$norm=\&NFC;
	die "NFC requires UTF-8 as target charset\n" unless ($to_is_utf8);
} elsif ($opt_nfd) {
	$norm=\&NFD;
	die "NFD requires UTF-8 as target charset\n" unless ($to_is_utf8);
} else {
	$norm=\&dummy;
}

if ($opt_fixdouble) {
	die "--fixdouble requires UTF-8 as source and non-UTF-8 as target charset\n" unless ($from_is_utf8 and $opt_t and not $to_is_utf8);
}

# --exec implies --lowmem
$opt_lowmem=1 if ($opt_exec);
Packit 244994
Packit 244994
# remember the start directory; the callbacks chdir back to it after each entry
$pwd=cwd();
@args=@ARGV;
undef @ARGV;

for (@args) {
	1 while s/\/\.\//\//; # normalize "/./" to "/" (repeated, so "/././" is covered too)
	s/\/[\/]+/\//g;  # normalize "//" to "/"
	die "file or directory not found: $_\n" unless (-e or -l);
}
# handle that receives the human-readable messages; NUL is never opened in the
# code shown here, so in parsable mode these messages are dropped
if ($opt_parsable) {
	$outerr=NUL;
} else {
	$outerr=STDERR;
}
if ($opt_undo_script) {
	die "undo-script file already exists, exiting.\n" if (-e $opt_undo_script);
	open(UNDOLOG, ">", $opt_undo_script) or die "couldn't open undo-script for writing. Aborting.\n";
	print UNDOLOG "# this is a perl undo script generated by convmv.\n",
			"# Please check if this looks reasonable before running!\n";
	print UNDOLOG "# Example: perl $opt_undo_script\n";
	# The directory has to be written as a quoted Perl string, with the
	# characters that are special inside double quotes escaped.
	(my $quoted_pwd = $pwd) =~ s/([\\"\$\@])/\\$1/g;
	print UNDOLOG "chdir \"$quoted_pwd\" or die \"chdir failed: \$!\\n\";\n";
}

## do {print ord($_)."_" for (split(//,$_));print "\n"; } for (@args); # debug print

print $outerr "Starting a dry run without changes...\n" unless ($opt_notest);
Packit 244994
Packit 244994
# recursive mode walks the tree with File::Find::find, otherwise only the
# given arguments themselves are visited
if ($opt_r) {
	$myfind=\&find;
} else {
	$myfind=\&find0depth;
}

# pass 1: check every name; nothing is changed here
&$myfind({wanted=>\&scan,bydepth=>1,no_chdir=>1}, @args);
if (not $errors_occurred and $warnings_occurred) {
	$errors_occurred=1 if (not &print_ask ("WARNINGS occurred. Do you really want to continue?",1));
}

die "To prevent damage to your files, we won't continue.\nFirst fix this or correct options!\n" if ($errors_occurred);
# pass 2: symlink targets (skipped with --exec)
unless ($opt_exec) {
	&$myfind({wanted=>\&process_symlink_targets,bydepth=>1,no_chdir=>1}, @args);
}
# pass 3: the files, directories and links themselves
&$myfind({wanted=>\&process_main,bydepth=>1,no_chdir=>1}, @args);

# check for unintentionally left files
#for (keys %dir_time_hash) {
#	print $outerr "error: left in %dir_time_hash: $_\n";
#}

$mytime = time() - $mytime;
if ($opt_notest) {
	print $outerr "Ready! I converted $ops files in $mytime seconds.\n";
} else {
	print $outerr "No changes to your files done. Would have converted $ops files in $mytime seconds.\nUse --notest to finally rename the files.\n";
}
Packit 244994
Packit 244994
#####
Packit 244994
## subs
Packit 244994
###
Packit 244994
Packit 244994
# find-like function but without any depth search for not recursive mode.
# Takes the same kind of arguments the callers pass to find(): an options hash
# reference followed by the paths. Only the 'wanted' entry of the hash is
# used; it is called once per path with $_ set to that path and the path also
# passed as first argument.
sub find0depth {
	my $opts = shift;
	for (@_) {
		# The callbacks are shared with File::Find::find and read
		# $File::Find::name; provide it here as well so that it is not
		# undefined in non-recursive mode.
		local $File::Find::name = $_;
		$$opts{'wanted'}($_);
	}
}
Packit 244994
Packit 244994
# scan for real files and check charset first.
# Callback for the first pass: takes the path from $_, runs the selected
# $checkenc on its basename and sets $errors_occurred when that returns undef.
# Unless --lowmem is active, files and directories are also recorded in
# %inod_fullname (inode number => path) for the symlink pass.
sub scan {
	$arg = $_;
	&get_dir_base_change; # splits $arg into $dir and basename, chdirs into $dir
	if (-l $arg) {
#		print "link: $arg in $dir\n";
		my $checked = &$checkenc($arg);
		$errors_occurred=1 if (not defined $checked);
	} elsif (-d $arg) {
#		print "dir: $arg in $dir\n";
		$inod_fullname{(stat $arg)[1]}=$dir."/".$arg if (!$opt_lowmem);
		my $checked = &$checkenc($arg);
		$errors_occurred=1 if (not defined $checked);
		# Listing the entries needs read permission and reaching them needs
		# search (x) permission, so warn when either one is missing.
		if ($opt_r and not (-x $arg and -r $arg)) {
			print $outerr "WARNING: cannot traverse ",&$from_print($dir."/".$arg),"\n";
			$warnings_occurred=1;
		}
	} elsif (-f $arg) {
#		print "file: $arg in $dir\n";
		$inod_fullname{(stat $arg)[1]}=$dir."/".$arg if (!$opt_lowmem);
		my $checked = &$checkenc($arg);
		$errors_occurred=1 if (not defined $checked);
	}
	chdir $pwd;
}
Packit 244994
Packit 244994
# move symlink targets:
# Callback for the second pass. When $_ is a symlink whose target exists as a
# file or directory and $get_newname yields a different name for that target,
# the link is re-pointed to the new target name (with --notest), or the
# unlink/symlink pair is printed (with --parsable). Without --lowmem this is
# only done for targets recorded in %inod_fullname.
sub process_symlink_targets {
	$arg=$_;
	&get_dir_base_change;
	# escapes a name for use inside a double-quoted string of the undo script
	my $perl_quote = sub { (my $name = shift) =~ s/([\\"\$\@])/\\$1/g; return $name; };
	if (-l $arg) {
		$oldlink=readlink $arg;
		if ((-f $oldlink or -d $oldlink) and $newname=&$get_newname($oldlink)) {
			if ( $newname ne $oldlink ) {
				if ( $inod_fullname{(stat $oldlink)[1]} or $opt_lowmem) { # = if (symlink target scanned before)
					#print is_utf8($oldlink) ? 1 : 0;
					#print is_utf8($newname) ? 1 : 0;
					print $outerr "symlink \"".&$from_print($File::Find::name)."\": \"";
					print $outerr "".&$from_print($oldlink)."\" >> \"";
					if (not &print_ask (&$to_print($newname)."\"",$opt_i)) {
						# go back to the start directory before leaving,
						# as every other path through this sub does
						chdir $pwd;
						return;
					}
					&save_parent_mtime($dir) if ($opt_mtimes);
					if ($opt_notest) {
						if (not unlink $arg) {
							print $outerr "Error: $!\n";
						} elsif (not symlink ($newname, $arg)) {
							print $outerr "Error: $!\n";
							# the old link is already gone: try to put it back rather than lose it
							symlink ($oldlink, $arg) or print $outerr "Error: $!\n";
						} else {
							# only a change that really happened gets an undo entry
							print UNDOLOG "unlink \"".&$perl_quote($File::Find::name)."\";\n";
							print UNDOLOG "symlink (\"".&$perl_quote($oldlink)."\", \"".&$perl_quote($File::Find::name)."\");\n";
						}
					} elsif ($opt_parsable) {
						print "unlink".$del.$File::Find::name.$fin_del;
						print "symlink".$del.$newname.$del.$File::Find::name.$fin_del;
					}
					$ops++;
				} else {
					print $outerr "link target \"",&$from_print($oldlink),"\" of \"",&$from_print($dir."/".$arg),"\" not in subtree, left untouched!\n";
				}
			} # else { print "no need to convert link target: $oldlink to $newname\n"; }
		}
	}
	chdir $pwd;
}
Packit 244994
Packit 244994
# do the changes to all the real files/dirs/links:
# Callback for the last pass. Symlinks, directories and plain files are all
# renamed the same way when $get_newname yields a different, non-empty name;
# for directories restore_times_if_any is called afterwards if mtimes are
# to be preserved. Anything else is left alone.
sub process_main {
	$arg=$_;
	&get_dir_base_change;

	# classify first (symlink before directory before file), then act once
	my $kind;
	if (-l $arg) {
		$kind = "symlink";
	} elsif (-d $arg) {
		$kind = "directory";
	} elsif (-f $arg) {
		$kind = "file";
	}

	if (defined $kind) {
		$newname = &$get_newname($arg);
		&renameit($arg,$newname) if ($newname and $newname ne $arg);
		# only in -d case needed
		&restore_times_if_any($dir,$arg,$newname) if ($kind eq "directory" and $opt_mtimes);
	}

	chdir $pwd;
}
Packit 244994
Packit 244994
# Computes the converted name for $oldfile in charset conversion mode
# (from $opt_f to $opt_t).
# Returns the unchanged name when the file is skipped (already UTF-8 in smart
# mode, or not a --fixdouble candidate); otherwise the converted name, which
# is a decoded and normalized string when the target is UTF-8.
# Dies when the conversion itself fails.
sub char_get_newname {
	my $oldfile=shift;
	my $newname;
	my $lets_die = 0;
	if (!$from_is_utf8 and $to_is_utf8 and !$opt_nosmart and &looks_like_utf8($oldfile)) {
		if ($opt_parsable) {
			# the tag is written in plain ASCII so that front ends can match on it
			print "infomsg".$del."skipalreadyutf8".$del.$dir."/".$oldfile.$fin_del;
		} else {
			print $outerr "Skipping, already UTF-8: ",&$from_print($dir."/".$oldfile),"\n";
		}
		return $oldfile;
	} else {
		if ($opt_fixdouble and not &looks_like_utf8($oldfile)) {
			# this is legacy encoding which we ignore in fixdouble mode
			return $oldfile;
		}
		if ($from_is_utf8 and ! $to_is_utf8) {
			# from_to can't convert from NFD to non-UTF-8!
			$newname=encode_utf8(NFC(decode_utf8($oldfile)));
		} else {
			$newname=$oldfile;
		}
		# converts $newname in place; the failure is only acted upon below,
		# after the --fixdouble check had its chance to skip the file
		from_to($newname, $opt_f, $opt_t, Encode::FB_QUIET) or $lets_die = 1;
		if ($opt_fixdouble and not &looks_like_utf8($newname)) {
			return $oldfile;
		}
		if ($lets_die) {
			die "SHOULD NOT HAPPEN HERE: conversion error, no suitable charset used?: \"$oldfile\"\nTo prevent damage to your files, we won't continue. First fix this!\n";
		}
		$newname=&$norm(decode_utf8($newname)) if ($to_is_utf8);
		return $newname;
	}
}
Packit 244994
Packit 244994
# Works on the global $arg: strips trailing slashes, stores its directory part
# in $dir and its basename back in $arg, then changes into $dir.
# Returns the result of chdir. A failed chdir is reported and flagged in
# $errors_occurred, because the callers go on to test and rename the basename
# relative to the current directory.
sub get_dir_base_change() {
	$arg =~ s/\/*$//;
	$dir=dirname($arg);
	$arg=basename($arg);
	my $changed = chdir $dir;
	if (not $changed) {
		print $outerr "Error: cannot change into directory \"",&$from_print($dir),"\": $!\n";
		$errors_occurred=1;
	}
	return $changed;
}
Packit 244994
Packit 244994
# Renames $oldfile to $newname inside the current directory (the caller has
# already changed into $dir), or reports what would be done.
#   $oldfile - current basename
#   $newname - new basename; it is encoded to UTF-8 bytes here when the
#              target charset is UTF-8
# With --exec the user's command is printed (and run with --notest) instead.
# With -i the user is asked first; declining returns without doing anything.
# An existing target is only replaced with --replace and equal content.
sub renameit {
	my $oldfile=shift;
	my $newname=shift;
	my $cmd;
	my $ci_old_new_same_inode = 0;
	# escapes a name for use inside a double-quoted string of the undo script
	my $perl_quote = sub { (my $name = shift) =~ s/([\\"\$\@])/\\$1/g; return $name; };
	$newname=encode_utf8($newname) if ($to_is_utf8);
	if ($opt_exec) {
		$cmd = $opt_exec;
		$cmd =~ s/\#2/\000f8d9hqoäd\#2/g; # make the #2 unique so that file names may contain "#2"
		$cmd =~ s/\#1/\Q$oldfile\E/g;
		$cmd =~ s/\000f8d9hqoäd\#2/\Q$newname\E/g;
		print "$cmd\n";
	} else {
		#print is_utf8($oldfile) ? 1 : 0;
		#print is_utf8($newname) ? 1 : 0;
		&print_ask ("mv \"". &$from_print($dir."/".$oldfile)."\"\t\"".&$from_print($dir)."/".&$to_print($newname)."\"",$opt_i) or return;
	}
	&save_parent_mtime($dir) if ($opt_mtimes);
	# the following is to handle case-insensitive filesystems:
	# old and new name resolving to the same inode means a direct rename would
	# look like a clash, so the file is first moved to a temporary name
	if ($opt_lower or $opt_upper) {
		if ((stat $oldfile)[1] == (stat $newname)[1]) {
			$ci_old_new_same_inode = 1;
			#print $outerr "found case-insensitive filesystem...\n";
			if ($opt_notest) {
				my $try;
				for (1 .. 10000) {
					$try = "convmvtmp".$_;
					if (! -e $try) {
						#print $outerr "(via temp rename to \"$try\")\n";
						rename ($oldfile, $try);
						print UNDOLOG "rename (\"".&$perl_quote("$dir/$try")."\", \"".&$perl_quote("$dir/$oldfile")."\");\n";
						$oldfile=$try;
						last;
					}
				}
			}
		}
	}
	if (-e $newname and !$opt_exec and !$ci_old_new_same_inode) {
		# compare() returns 0 for files with equal content
		if ($opt_replace and !&compare($oldfile,$newname)) {
			if ($opt_notest) {
				unlink $newname or print $outerr "Error: $!\n";
				rename ($oldfile, $newname) or print $outerr "Error: $!\n";
			} elsif ($opt_parsable) {
				# report the file that is really removed: the existing target
				print "unlink".$del.$dir."/".$newname.$fin_del;
				print "rename".$del.$dir."/".$oldfile.$del.$dir."/".$newname.$fin_del;
			}
		} else {
			if ($opt_parsable) {
				print "errormsg".$del."fileexists".$del.$newname.$fin_del;
			} else {
				print $outerr "".&$to_print($newname)," exists and differs or --replace option missing - skipped\n";
			}
		}
	} else {
		if ($opt_notest) {
			if ($opt_exec) {
				system($cmd);
			} elsif (rename ($oldfile, $newname)) {
				# only a rename that really happened gets an undo entry
				print UNDOLOG "rename (\"".&$perl_quote("$dir/$newname")."\", \"".&$perl_quote("$dir/$oldfile")."\");\n";
			} else {
				print $outerr "Error: $!\n";
			}
		} elsif ($opt_parsable) {
			print "rename".$del.$dir."/".$oldfile.$del.$dir."/".$newname.$fin_del;
		}
	}
	$ops++;
}
Packit 244994
Packit 244994
# Remember the timestamps of the current working directory under a key derived
# from $dir, unless an entry for that key already exists.
#
# Argument: $dir - directory path; a leading "./" and a trailing "./" are
#                  trimmed before it is used as key into %dir_time_hash.
# Side effect: stores fields 8..10 of stat(".") (atime, mtime, ctime according
#              to perlfunc) as an array in %dir_time_hash.
# NOTE(review): the values are taken from stat("."), not stat($dir) - this
# presumably relies on the caller having changed into that directory; confirm.
sub save_parent_mtime() {
	my ($key) = @_;
	$key =~ s/^\.\///;	# leading "./"
	$key =~ s/\.\/$//;	# trailing "./"
	# (an early return for $key eq "." was tried here once and marked as broken)
	unless (exists $dir_time_hash{$key}) {
		#print $outerr "Putting \"$key\" in %dir_time_hash\n"; # debug print
		@{$dir_time_hash{$key}} = (stat("."))[8..10];
	}
}
Packit 244994
Packit 244994
# Re-apply the timestamps remembered in %dir_time_hash (filled by
# save_parent_mtime) to an entry after it has been processed, then forget them.
#
# Arguments:
#   $dir - directory part of the remembered path ("." means no prefix)
#   $old - name the entry had when its times were saved
#   $new - path handed to utime() in --notest mode
# The lookup key is "$dir/$old" (or just $old when $dir is ".").
# Nothing happens when no times were saved for that key.
#
# Modes:
#   $opt_notest   - run utime(atime, mtime) on $new and append a matching
#                   statement to UNDOLOG
#   $opt_parsable - print a "utime" record with key, atime, mtime and ctime
sub restore_times_if_any() {
	my ($dir, $old, $new) = @_;
	my $key = ($dir eq "." ? "" : $dir."/") . $old;
	# print $outerr "Trying to delete \"$key\" now \"$new\" from %dir_time_hash\n"; # debug print
	return unless exists $dir_time_hash{$key};

	my ($atime, $mtime, $ctime) = @{$dir_time_hash{$key}};
	if ($opt_notest) {
		utime $atime, $mtime, $new or print $outerr "Could not run utime() on $new: $!\n";
		# Emit a complete Perl statement, formatted like the rename() undo
		# entries: quoted file name, escaped "\n" inside the generated string
		# literal and a terminating ";" plus newline. The previous output left
		# the name unquoted and the statement unterminated, so the following
		# log entry was glued onto it.
		print UNDOLOG "utime $atime, $mtime, \"$new\" or print \"Could not run utime() on $new: \$!\\n\";\n";
	} elsif ($opt_parsable) {
		print "utime".$del.$key.$del.$atime.$del.$mtime.$del.$ctime.$fin_del;
	}
	delete $dir_time_hash{$key};
	#print $outerr "done\n"; # debug print
}
Packit 244994
Packit 244994
# Print the name of every encoding Encode knows about (":all"), one per line,
# to STDOUT. Always returns 1.
sub listvalidencodings() {
	foreach my $encoding (Encode->encodings(":all")) {
		print $encoding."\n";
	}
	return 1;
}
Packit 244994
Packit 244994
# Print every option known to %opts in command-line form, one per line.
# Everything from the first "=" of a key on is cut off; names longer than one
# character get a "--" prefix, single-character names a "-" prefix.
# Always returns 1.
sub dumpoptions() {
	foreach my $spec (keys %opts) {
		(my $name = $spec) =~ s/=.*//;
		my $prefix = (length($name) > 1) ? "--" : "-";
		print $prefix.$name."\n";
	}
	return 1;
}
Packit 244994
Packit 244994
# Encoding check that accepts every name: ignores its arguments and
# always returns 1.
sub fixdouble_checkenc() {
	my $always_ok = 1;
	return $always_ok;
}
Packit 244994
Packit 244994
# Check whether a file name can be converted from the source encoding to the
# target encoding without performing the rename.
#
# Argument: $oldfile - file name as found on disk (without directory).
# Returns 1 when the conversion is possible, undef otherwise. Problems are
# reported on STDOUT as "errormsg" records in --parsable mode, otherwise as
# text on $outerr. The converted name is also passed to posix_check().
sub char_checkenc() {
	my ($oldfile) = @_;
	my $new = $oldfile;
	# printable form of the full path; only evaluated when a message is written
	my $shown = sub { &$from_print($dir."/".$oldfile) };

	# Step 1: the name has to be valid in the source encoding; afterwards
	# $new holds it as UTF-8.
	if ($from_is_utf8) {
		unless (&$this_is_valid_utf8($new)) {
			if ($opt_parsable) {
				print "errormsg".$del."filenotutf8".$del.$oldfile.$fin_del;
			} else {
				print $outerr "this file was not validly encoded in UTF-8: \"". &$shown() ."\"\n";
			}
			return undef;
		}
	} elsif ($to_is_utf8 and !$opt_nosmart and &looks_like_utf8($oldfile)) {
		# Deliberately nothing: the name already looks like UTF-8 although the
		# source encoding is something else (e.g. shift_jis). With an UTF-8
		# target this is to be "smart-skipped" and must not raise an error.
	} elsif (! from_to($new,$opt_f, "utf8", Encode::FB_QUIET) ) {
		if ($opt_parsable) {
			print "errormsg".$del."fileencodedinvalid".$del.$dir."/".$oldfile.$fin_del;
		} else {
			print $outerr "this file was not validly encoded in $opt_f: \"". &$shown() ."\"\n";
		}
		return undef;
	}

	# Step 2: produce the target form and measure it.
	my $filenamelength;
	if (!$to_is_utf8) {
		$new = encode_utf8(NFC(decode_utf8($new)));
		# from_to() yields the length of the converted string, or undef on failure
		$filenamelength = from_to($new, "utf8", $opt_t, Encode::FB_QUIET);
	} else {
		$new = &$norm(decode_utf8($new));
		$filenamelength = length($new);
	}

	# Step 3: a false length means the conversion failed, a too large one
	# exceeds $maxfilenamelength.
	unless ($filenamelength) {
		if ($opt_parsable) {
			print "errormsg".$del."charsetdoesntcoverneededcharacters".$del.$dir."/".$oldfile.$fin_del;
		} else {
			print $outerr "$opt_t doesn't cover all needed characters for: \"". &$shown() ."\"\n";
		}
		return undef;
	}
	if ($filenamelength > $maxfilenamelength) {
		print $outerr "".&$shown().": resulting filename is $filenamelength bytes long (max: $maxfilenamelength)\n";
		return undef;
	}

	&posix_check($new);
	return 1;
}
Packit 244994
Packit 244994
# Print the version banner and the short usage summary to STDOUT.
# check_for_perl_bugs() runs first, so any message it prints precedes the
# usage text.
sub printusage {
	&check_for_perl_bugs();
	# Interpolating heredoc (hence the escaped "@" in the mail address);
	# the text ends at the line consisting of END.
	print <<END;
convmv 2.01 - converts filenames from one encoding to another
Copyright (C) 2003-2017 Bjoern JACKE <bjoern\@j3e.de>

This program comes with ABSOLUTELY NO WARRANTY; it may be copied or modified
under the terms of the GNU General Public License version 2 or 3 as published
by the Free Software Foundation.

 USAGE: convmv [options] FILE(S)
-f enc     encoding *from* which should be converted
-t enc     encoding *to* which should be converted
-r         recursively go through directories
-i         interactive mode (ask for each action)
--nfc      target files will be normalization form C for UTF-8 (Linux etc.)
--nfd      target files will be normalization form D for UTF-8 (OS X etc.)
--qfrom    be quiet about the "from" of a rename (if it screws up your terminal e.g.)
--qto      be quiet about the "to" of a rename (if it screws up your terminal e.g.)
--exec c   execute command instead of rename (use #1 and #2 and see man page)
--list     list all available encodings
--lowmem   keep memory footprint low (see man page)
--map m    apply an additional character mapping
--nosmart  ignore if files already seem to be UTF-8 and convert if posible
--notest   actually do rename the files
--replace  will replace files if they are equal
--unescape convert%20ugly%20escape%20sequences
--upper    turn to upper case
--lower    turn to lower case
--parsable write a parsable todo list (see man page)
--help     print this help
END
#--dotlessi care about the dotless i issue of certain locales (use with care)
}
Packit 244994
Packit 244994
# Heuristic: does $string look like UTF-8 text?
# Returns 1 when it contains at least one non-ASCII character and the
# validator referenced by $this_is_valid_utf8 accepts it, undef otherwise.
# Pure ASCII therefore never counts, and the validator is not called for it.
sub looks_like_utf8() {
	my ($candidate) = @_;
	return 1 if ($candidate =~ m/[^[:ascii:]]/ and &$this_is_valid_utf8($candidate));
	return undef;
}
Packit 244994
Packit 244994
# UTF-8 validity check built on plain decode_utf8(): returns 1 when
# decode_utf8($string) yields a defined value, undef otherwise.
# check_for_broken_perl_release() selects this variant only when decode_utf8()
# returns undef for its non-UTF-8 probe string.
sub this_is_valid_utf8_decode {
	my ($string) = @_;
	return defined(decode_utf8($string)) ? 1 : undef;
}
Packit 244994
Packit 244994
# UTF-8 validity check built on strict decoding.
#
# History of this check, kept from earlier revisions:
#  - until 1.08 plain decode_utf8() was used, but see perl bug #37757
#    (perl 5.8.7/8)
#  - utf8::decode() is experimental and might disappear says utf8(3pm)
#  - Encode::decode(utf8, $string, Encode::FB_QUIET) does not work as one
#    might expect
#  - from_to(utf8 => utf8) worked until Perl 5.10 broke it, see perl bug
#    #49830 (convmv 1.10 with Perl 5.10 only worked with --nosmart)
#  - perluniintro suggests decode_utf8() with Encode::FB_CROAK, used here
#
# Argument: $string - octets to check; only a private copy is decoded.
# Returns 1 when decoding succeeds, undef when decode_utf8() dies.
sub this_is_valid_utf8_decode_CROAK() {
	my ($string) = @_;
	# Block eval instead of string eval: same exception trapping, but the code
	# is compiled once with the program instead of on every call.
	my $decoded_ok = eval { decode_utf8($string, Encode::FB_CROAK); 1 };
	return $decoded_ok ? 1 : undef;
}
Packit 244994
Packit 244994
# Make a string safe for terminal output: every non-ASCII character becomes
# "?", every control character becomes "*". Returns the cleaned copy.
sub to_ascii() {
	my ($text) = @_;
	for ($text) {
		s/[^[:ascii:]]/?/g;
		s/[[:cntrl:]]/*/g;
	}
	return $text;
}
Packit 244994
Packit 244994
# Identity function: returns its first argument unchanged.
sub dummy() {
	my ($value) = @_;
	return $value;
}
Packit 244994
Packit 244994
# Print an action and optionally ask the user whether to perform it.
#
# Arguments:
#   1. the text to print
#   2. askornot - when true, keep prompting " (y/n) " on STDOUT and reading
#      STDIN until a line consisting of "y" or "n" (any case) is entered
# Returns 1 to go ahead, undef when the answer was "n".
# In --parsable mode nothing is printed or asked and 1 is returned.
# When STDIN reaches end-of-file while asking, this counts as "n"; before,
# the undefined read result kept the prompt loop running forever.
sub print_ask() { # takes 2 arguments, string and askornot
	return 1 if ($opt_parsable);
	my ($message, $ask) = @_;
	my $answer = "";
	print $message;
	while ($ask and $answer !~ m/^[yn]$/i) {
		print " (y/n) ";
		$answer = <STDIN>;
		unless (defined $answer) {
			# EOF: no answer will ever arrive, decline the action
			print "\n";
			return undef;
		}
	}
	print "\n";
	return ($answer =~ m/^n$/i) ? undef : 1;
}
Packit 244994
Packit 244994
# Check whether $name qualifies for --unescape processing.
# Only names consisting of ASCII characters are accepted; for those the
# unescaped form is passed to posix_check() and 1 is returned. Otherwise the
# problem is reported ("errormsg" record in --parsable mode, text on $outerr
# else) and undef is returned.
sub unescape_checkenc() {
	my ($name) = @_;
	unless ($name =~ m/^[[:ascii:]]*$/) { # should we be more strict ?
		if ($opt_parsable) {
			print "errormsg".$del."notanescapedfile".$del.$name.$fin_del;
		} else {
			print $outerr "\"",&$from_print($name),"\" not ASCII - this does not seem to be an escaped filename.\n";
		}
		return undef;
	}
	&posix_check(&unescape_get_newname($name));
	return 1;
}
Packit 244994
Packit 244994
# Apply the character mapping selected with --map ($opt_map) to a file name
# and return the mapped name. "." and ".." are returned untouched.
#
# Supported values of $opt_map:
#   ntfs-sfm / ntfs-sfm-undo       - control characters and "*:<>?\| to/from
#                                    U+F001..U+F027, trailing space/period
#                                    to/from U+F028/U+F029
#   ntfs-sfu / ntfs-sfu-undo       - control characters and "*/<>?\| to/from
#                                    their code point + 0xF000
#   ntfs-pretty / ntfs-pretty-undo - "*?:<>|\ to/from similar looking
#                                    Unicode characters
# Dies with the list of supported names for any other value.
#
# The full-width characters of the ntfs-pretty tables (U+FF1F, U+FF1C, U+FF1E,
# U+FF3C) are written as the code points named in the trailing comments; with
# plain ASCII in their place the backslash lines do not even parse.
#
# NOTE: works on the global $_ (assigned, not localized), so the caller's $_
# is overwritten.
sub map_get_newname() {
	$_ = shift;
	return $_ if ($_ eq "." or $_ eq "..");
	_utf8_on($_); # this is needed for tr/multibyte/non-multibyte/ to work! Otherwise we would
	              # have to make a s/// for each character, grrr...
	if ($opt_map eq "ntfs-sfm") { # see MS KB 117258 (but map : instead of /
		tr/\x01-\x1f\"\*\:\<\>\?\\\|/\x{f001}-\x{f027}/;
		s/ $/\x{f028}/;  # Space, only if occurring as the last character of the name
		s/\.$/\x{f029}/; # period, only if occurring as the last character of the name
	} elsif ($opt_map eq "ntfs-sfm-undo") {
		tr/\x{f001}-\x{f027}/\x01-\x1f"*:<>?\\| /;
		s/\x{f028}$/ /;  # Space, only if occurring as the last character of the name
		s/\x{f029}$/./;  # period, only if occurring as the last character of the name
	} elsif ($opt_map eq "ntfs-sfu") { # +0xF000, see MS KB ???? anyone knows a link or has archived an old one?
		tr/\x01-\x1f\"\*\/\<\>\?\\\|/\x{f001}-\x{f01f}\x{f022}\x{f02a}\x{f02f}\x{f03c}\x{f03e}\x{f03f}\x{f05c}\x{f07c}/;
		#??? s/ $/space/;  # Space, only if occurring as the last character of the name
		#??? s/\.$/period/; # period, only if occurring as the last character of the name
	} elsif ($opt_map eq "ntfs-sfu-undo") {
		tr/\x{f001}-\x{f01f}\x{f022}\x{f02a}\x{f02f}\x{f03c}\x{f03e}\x{f03f}\x{f05c}\x{f07c}/\x01-\x1f"*\/<>?\\|/;
		#??? s/space$/ /;  # Space, only if occurring as the last character of the name
		#??? s/period$/./; # period, only if occurring as the last character of the name
	} elsif ($opt_map eq "ntfs-pretty") {
		# NOTE(review): the asterisk literal looks like U+2217 while its
		# comment names U+2731 - confirm which one is intended.
		s/\"/”/g;  # U+201D
		s/\*/∗/g;  # U+2731
		s/\?/？/g; # U+FF1F
		s/\:/꞉/g;  # U+A789
		s/\</＜/g; # U+FF1C
		s/\>/＞/g; # U+FF1E
		s/\|/❘/g;  # U+2758
		s/\\/＼/g; # U+FF3C
	} elsif ($opt_map eq "ntfs-pretty-undo") {
		s/”/"/g;   # U+201D
		s/∗/*/g;   # U+2731
		s/？/?/g;  # U+FF1F
		s/꞉/:/g;   # U+A789
		s/＜/</g;  # U+FF1C
		s/＞/>/g;  # U+FF1E
		s/❘/|/g;   # U+2758
		s/＼/\\/g; # U+FF3C
	} else {
		die "map parameter \"$opt_map\" not supported. Use one of ",
		    "ntfs-sfm, ntfs-sfm-undo, ",
		    "ntfs-sfu, ntfs-sfu-undo, ",
		    "ntfs-pretty, ntfs-pretty-undo\n";
	}
	return $_;
}
Packit 244994
Packit 244994
# Decode %XX escape sequences (two hex digits, either case) in a file name
# into the characters they stand for and return the result. Anything that is
# not a complete %XX sequence is left alone. The code below always returns a
# string.
sub unescape_get_newname() { # return undef on error, string otherwise
	my ($escaped) = @_;
	# the opposite direction used to be done like this:
	#	s/([^a-zA-Z0-9_.-])/uc sprintf("%%%02x",ord($1))/eg
	$escaped =~ s/%([0-9a-fA-F]{2})/chr(hex($1))/eg;
	return $escaped;
}
Packit 244994
Packit 244994
Packit 244994
# Check whether the case conversion of $oldname is possible.
# Returns undef when upperlower_get_newname() fails (it reports the reason
# itself); otherwise the converted name is passed to posix_check() and 1 is
# returned.
sub upperlower_checkenc() {
	my ($oldname) = @_;
	my $converted = &upperlower_get_newname($oldname);
	return undef unless defined($converted);
	&posix_check($converted);
	return 1;
}
Packit 244994
Packit 244994
# Return the upper case ($opt_upper set) or lower case (otherwise) form of a
# file name that is encoded in $opt_f.
#
# The name is converted to UTF-8, flagged as character string, case-mapped
# with character semantics ("no bytes"), turned back into octets and converted
# back to $opt_f.
# Returns undef (after a message on $outerr) when the name is not valid in
# $opt_f or the result cannot be expressed in $opt_f; the new name otherwise.
sub upperlower_get_newname() {
# return undef on error, string otherwise
	my ($oldname) = @_;
	my $work = $oldname;
	unless (from_to($work, $opt_f, "utf8", Encode::FB_QUIET)) { # should also leave NFD as it is ...
		print $outerr "\"",&$from_print($oldname),"\" not encoded in $opt_f ? Supply the correct encoding via -f option!\n";
		return undef;
	}
	_utf8_on($work);	# Unicode in Perl can be a real pain ...
	no bytes;
	if (!$opt_upper) {
		if ($opt_dotlessi) {
			$work =~ s/I/ı/g;
			$work =~ s/İ/i/g;
		}
		$work = lc($work);
	} else {
		if ($opt_dotlessi) {
			$work =~ s/ı/I/g;
			$work =~ s/i/İ/g;
		}
		# uc() must not turn ß into SS: hide it behind NUL+DWSLQH (NUL may not
		# be part of a filename) and bring it back after uc().
		# Unicode 5.1(draft) news: Uppercasing U+00DF (ß) LATIN SMALL LETTER
		# SHARP S to the new U+1E9E LATIN CAPITAL LETTER SHARP S - but until
		# now there is no use for this in filenames ...
		$work =~ s/ß/\000DWSLQH/g;
		$work = uc($work);
		$work =~ s/\000DWSLQH/ß/g;
	}
	use bytes;
	_utf8_off($work);
	# we should also do special treatment for UTF-8 NFD of "I with dot above" in byte mode now, otherwise we get "i̇", which is a double-single dotted i ;-)
	# the problems that arise with this letter are endless ...
#	$work =~ s/i\314\207/i/g if ($from_is_utf8);
	unless (from_to($work, "utf8", $opt_f, Encode::FB_QUIET)) {
		print $outerr $opt_upper?"Upper":"Lower","case of \"",&$from_print($oldname),"\" not possible in $opt_f ! Maybe supply different encoding via -f option.\n";
		return undef;
	}
	return $work;
}
Packit 244994
Packit 244994
# Warn on $outerr when a new file name contains a NUL byte or a "/" and
# remember that fact in $warnings_occurred. The name itself is not changed.
sub posix_check() {
	my ($name) = @_;
	my $offending = ($name =~ m/[\000\/]/);
	if ($offending) {
		print $outerr "WARNING: new filename \"",&$to_print($name),"\" contains characters, which are not POSIX filesystem conform! This may result in data loss.\n";
		$warnings_occurred=1;
	}
}
Packit 244994
Packit 244994
# still unused, but might be used for Netatalk CAP encoding:
# Replace every ":xx" sequence (two lower case hex digits) in a name by the
# character with that code and return the result. Names starting with
# ":2eDS_Store" are returned unchanged.
sub cap2utf8() {
	my ($name) = @_;
	return $name if ($name =~ m/^:2eDS_Store/);
	$name =~ s/:([0-9a-f]{2})/chr(hex($1))/eg;
	return $name;
}
Packit 244994
Packit 244994
# Check that the most basic Encode features we use work reliably and decide
# which implementation $this_is_valid_utf8 points to:
#   - decode_utf8() returns undef for a non-UTF-8 string
#       -> this_is_valid_utf8_decode
#   - otherwise, decode_utf8(..., Encode::FB_CROAK) dies for the non-UTF-8
#     string and succeeds for "ö"
#       -> this_is_valid_utf8_decode_CROAK
# Returns 0 once a working variant has been selected. When neither works the
# failed checks are printed and the program exits with status 1.
sub check_for_broken_perl_release() {
	# NB: the eval strings below refer to $test by name
	my $test = ""."\366";
	my $problems = "";

	unless (defined(decode_utf8($test))) {
		$this_is_valid_utf8=\&this_is_valid_utf8_decode;
		return 0;
	}
	$problems .= "decode_utf8(\$test) check failed\n";

	eval 'decode_utf8($test, Encode::FB_CROAK);';
	if ($@) {
		# strict decoding rejects garbage - it must accept real UTF-8 as well
		$test = ""."ö";
		eval 'decode_utf8($test, Encode::FB_CROAK);';
		unless ($@) {
			$this_is_valid_utf8=\&this_is_valid_utf8_decode_CROAK;
			return 0;
		}
		$problems .= "eval 'decode_utf8(\$utf8, Encode::FB_CROAK);'; check failed.\n";
	} else {
		$problems .= "eval 'decode_utf8(\$non-utf8, Encode::FB_CROAK);'; check failed.\n";
	}
	print "Your Perl release is too broken to make convmv work reliably:\n",$problems;
	exit 1;
}
Packit 244994
Packit 244994
# Check for certain Perl fleas that we more or less have to work around and
# print "Your Perl version has fleas ..." with the numbers of the bugs found.
# Nothing is printed when none of the probes triggers.
#
# Probes:
#   #22111 - from_to() does not convert decomposed (NFD) UTF-8 "ö" to the
#            single iso-8859-1 character \366
#   #37757 - decode_utf8() returns a true value for a non-UTF-8 string
#            (until 1.08 "not defined(decode_utf8($string))" was used as
#            validity check, see perl 5.8.7/8)
#   #49830 - from_to(utf8 => utf8) reports success for a non-UTF-8 string
sub check_for_perl_bugs() {
	my $bugs = "";
	my $test = "\366";

	my $u8test = NFD(""."ö"); 	# "". is intended as only so we have _utf8_off set
					# otherwise from_to doesn't convert the $data to
					# something else.
	# print "DEBUG: string is UTF-8 flagged: ",is_utf8($u8test) ? "yes" : "no","\n";
	# Block eval: a double-quoted eval string would interpolate the *value* of
	# $u8test into the code, so the conversion would never see the variable.
	eval { from_to($u8test, 'utf8', 'iso-8859-1'); };
	if ($u8test ne "\366") {
		# Perl::Encode guys think that conversion from decomposed UTF-8
		# to any other charset does not have to be supported by from_to.
		# Why, when NFC or NFD this is both perfectly valid UTF-8?
		$bugs .= "#22111 ";
	}
	if (decode_utf8($test)) {
		$bugs .= "#37757 ";
		# Convmv 1.08 and below would not work here!
		# Perl documentation up to 5.8.8 said that
		# decode_utf8($data_that_is_not_utf_8) should return undef
	}
	# a true return value means from_to() accepted the broken input
	if (from_to($test, "utf8", "utf8", Encode::FB_QUIET)) {
		$bugs .= "#49830 ";
		# convmv 1.10-1.11 would not work here!
		# broken UTF-8 is silently being converted to sane UTF-8 without throwing
		# an error.
	}
	if ($bugs) {
		print "Your Perl version has fleas $bugs\n";
	}

}