Blame ligature/rmligs.skel

Packit e20377
#!/usr/bin/perl
Packit e20377
#
Packit e20377
# rmligs 0.87 - ligature corrector for German LaTeX documents
Packit e20377
# Copyright (C) 1999-2012 Björn Jacke <bjoern@j3e.de>
Packit e20377
#
Packit e20377
# This program comes with ABSOLUTELY NO WARRANTY; it may be copied or modified
Packit e20377
# under the terms of the GNU General Public License version 2 or 3 as
Packit e20377
# published by the Free Software Foundation.
Packit e20377
#
Packit e20377
# Please send bug reports, missing words and suggestions to bjoern@j3e.de
Packit e20377
###############################################################################
Packit e20377
#
Packit e20377
# rmligs is a program for removing incorrectly used ligatures from
Packit e20377
# LaTeX documents. This version is intended for German language texts only.
Packit e20377
# The wordlist data is taken from the igerman98 dictionary available on
Packit e20377
# https://www.j3e.de/ispell/igerman98/ .
Packit e20377
# Up-to-date versions of rmligs can be found on CTAN
Packit e20377
#
Packit e20377
######################################################################
Packit e20377
#
Packit e20377
#  Usage:
Packit e20377
#       rmligs [options] FILE(s) ...
Packit e20377
#
Packit e20377
#  -f, --filter       filter mode for streaming
Packit e20377
#  -t, --test         test mode (read-only)
Packit e20377
#  -i, --interactive  prompt before applying changes
Packit e20377
#  -q, --quiet        be quite quiet
Packit e20377
#
Packit e20377
##############################################################
Packit e20377
Packit e20377
use Getopt::Long;
Packit e20377
Getopt::Long::Configure ("bundling");
Packit e20377
Packit e20377
$debug = 0;
Packit e20377
$outerr = STDERR;
Packit e20377
$outmesg = STDOUT;
Packit e20377
Packit e20377
GetOptions('f'=>\$opt_f, 'filter'=>\$opt_f, '' => \$opt_f, 't'=>\$opt_t, 'test'=>\$opt_t, 'q'=>\$opt_q, 'quiet'=>\$opt_q, 'i'=>\$opt_i, 'interactive'=>\$opt_i) or printusage();
Packit e20377
if ((($opt_f) and ((@ARGV) or ($opt_t) or ($opt_i)))
Packit e20377
		or (($opt_t) and ($opt_q)) ) {
Packit e20377
	print $outerr "\nIllegal combination of options!\n";
Packit e20377
	exit 1;
Packit e20377
}
Packit e20377
Packit e20377
if ($opt_q) {$outmesg = NUL;}
Packit e20377
Packit e20377
Packit e20377
if (@ARGV) {
Packit e20377
	printcopyright();
Packit e20377
	buildlighash();
Packit e20377
	
Packit e20377
	for (@ARGV) {
Packit e20377
		$oldfile = $_;
Packit e20377
		open(OLDFILE, "< $oldfile") or print $outerr "Can't open $oldfile: $!\n\n" and next;
Packit e20377
		$infile = OLDFILE;
Packit e20377
		unless ($opt_t) {
Packit e20377
			$newfile = "$_.tmp";
Packit e20377
			$bakfile = "$_.bak";
Packit e20377
			open(NEWFILE, "> $newfile") or die "Can't open $newfile: $!\n\n";
Packit e20377
			$outfile = NEWFILE;
Packit e20377
		}
Packit e20377
Packit e20377
		$corrlines = $linenumber = $jobs = 0;
Packit e20377
		processfile();
Packit e20377
		print $outmesg " >>> $jobs corrections in $corrlines of $linenumber lines in $oldfile\n\n";
Packit e20377
		
Packit e20377
		close(OLDFILE) or print $outerr "Can't close $oldfile: $!\n\n";
Packit e20377
		
Packit e20377
		unless ($opt_t) {
Packit e20377
			unless ($jobs == 0) {
Packit e20377
				close(NEWFILE) or die "Can't close $newfile $!\n";
Packit e20377
				rename($oldfile,$bakfile) or die "Can't rename $oldfile to $bakfile: $!\n" and next;
Packit e20377
				rename($newfile,$oldfile) or die "Can't rename $newfile to $oldfile: $!\n" and next;
Packit e20377
			}
Packit e20377
			else {unlink($newfile);}
Packit e20377
		}
Packit e20377
		else {print $outmesg "NO file changes (running read-only mode)\n";}
Packit e20377
	}
Packit e20377
}
Packit e20377
elsif ($opt_f) {
Packit e20377
	buildlighash();
Packit e20377
	$outmesg = NUL;
Packit e20377
	$infile = STDIN;
Packit e20377
	$outfile = STDOUT;
Packit e20377
	processfile();
Packit e20377
}
Packit e20377
else { printusage(); }
Packit e20377
Packit e20377
Packit e20377
Packit e20377
###########################
Packit e20377
Packit e20377
Packit e20377
sub printcopyright {
Packit e20377
	print $outmesg <
Packit e20377
Packit e20377
rmligs 0.87 - ligature corrector for German LaTeX documents
Packit e20377
Copyright (C) 1999-2012 Bjoern Jacke <bjoern\@j3e.de>
Packit e20377
Packit e20377
This program comes with ABSOLUTELY NO WARRANTY; it may be copied or modified
Packit e20377
under the terms of the GNU General Public License version 2 or 3 as
Packit e20377
published by the Free Software Foundation.
Packit e20377
Packit e20377
Please send bug reports, missing words and suggestions to bjoern\@j3e.de
Packit e20377
Packit e20377
END
Packit e20377
}
Packit e20377
Packit e20377
####
Packit e20377
sub printusage {
Packit e20377
	printcopyright();
Packit e20377
	print $outmesg <
Packit e20377
USAGE:  rmligs [options] FILE(S) ...
Packit e20377
Packit e20377
  -f, --filter       filter mode for streaming
Packit e20377
  -t, --test         test mode (read-only)
Packit e20377
  -i, --interactive  prompt before applying changes
Packit e20377
  -q, --quiet        be quite quiet
Packit e20377
Packit e20377
END
Packit e20377
	exit 1
Packit e20377
}
Packit e20377
Packit e20377
####
Packit e20377
sub buildlighash {
Packit e20377
	while (<DATA>) {
Packit e20377
		chomp;
Packit e20377
		s/ß/ss/g; # we store "ss" in spite of "ß" -- also good for Swiss language
Packit e20377
		s/ä/ae/g;
Packit e20377
		s/ö/oe/g;
Packit e20377
		s/ü/ue/g;
Packit e20377
		
Packit e20377
		my $key=$_;
Packit e20377
		$key =~ s/\|//g;
Packit e20377
		$lighash{$key} = $_;
Packit e20377
	}
Packit e20377
}
Packit e20377
Packit e20377
####
Packit e20377
sub processfile {
Packit e20377
	while (<$infile>) {
Packit e20377
		++$linenumber;
Packit e20377
		$linecorrected = 0;
Packit e20377
		for $word (m/[\\]*[\"a-zäöüÄÖÜß\303\244\204\266\274\234\237]*f[fil][\"a-zäöüÄÖÜß\303\244\204\266\274\234\237]*/gi) {
Packit e20377
			getlig();    # get our ligword ... if existing
Packit e20377
			if (defined $ligword and &check_interactive) {
Packit e20377
				changeword();
Packit e20377
				s/([^a-zA-ZäöüÄÖÜ\303\244\204\266\274\234\237\\]|^)\Q$word/$1$newword/g;
Packit e20377
				print $outmesg "line $linenumber: $word => $newword\n";  # let's be verbose!
Packit e20377
				++$jobs;
Packit e20377
				$linecorrected = 1;
Packit e20377
			}
Packit e20377
		}
Packit e20377
		print $outfile $_ unless $opt_t;
Packit e20377
		$corrlines += $linecorrected;
Packit e20377
	}
Packit e20377
}
Packit e20377
Packit e20377
####
Packit e20377
sub getlig {
Packit e20377
	print "XXX checking $word\n" if ($debug);
Packit e20377
	my $compword = $word;
Packit e20377
	$compword =~ s/^(\\\\)*//;  # \\ as newline is allowed; \bla is a command!
Packit e20377
	                            # We remove all 2*n \'s from beginning
Packit e20377
	$compword =~ s/(\303\244|\303\204)/ae/g;
Packit e20377
	$compword =~ s/(\303\266|\303\226)/oe/g;
Packit e20377
	$compword =~ s/(\303\274|\303\234)/ue/g;
Packit e20377
	$compword =~ s/(\303\237)/ss/g;
Packit e20377
	$compword =~ tr [A-ZÄÖÜ] [a-zäöü];
Packit e20377
	$compword =~ s/(\"a|ä)/ae/g;
Packit e20377
	$compword =~ s/(\"o|ö)/oe/g;
Packit e20377
	$compword =~ s/(\"u|ü)/ue/g;
Packit e20377
	$compword =~ s/(\"[sz]|ß)/ss/g;
Packit e20377
	$compword =~ s/\"$//;  # we want a trailing "' to be possible!
Packit e20377
	$ligword = $lighash{$compword};
Packit e20377
}
Packit e20377
Packit e20377
####
Packit e20377
sub changeword {
Packit e20377
	my $pos = index($ligword,'|');
Packit e20377
	my $runs = 0;
Packit e20377
	my $cut_pos = 0;
Packit e20377
	$newword = $word;
Packit e20377
	# to make all characters as long as the ones of the lighash (2 - only chars count!):
Packit e20377
	$newword =~ s/ß/qq/g;    # "qq" as a placeholder for iso-8859-1/15 ß
Packit e20377
	$newword =~ s/"/"z/g;    # umlauts encoded like "o need to be 2 chars long, too!
Packit e20377
	$newword =~ s/ä/\"ae/g;  #
Packit e20377
	$newword =~ s/Ä/\"AE/g;  #
Packit e20377
	$newword =~ s/ö/\"oe/g;  # "\"AE" etc. as placeholders für iso8859-1/15 umlauts
Packit e20377
	$newword =~ s/Ö/\"OE/g;  #
Packit e20377
	$newword =~ s/ü/\"ue/g;  #
Packit e20377
	$newword =~ s/Ü/\"UE/g;  #
Packit e20377
	# equivalent UTF-8 characters already are 2 bytes long ... nothing more to do.
Packit e20377
	print "XXX changeword: $newword\n" if ($debug);
Packit e20377
	do {
Packit e20377
		my $left = my $right = $newword;
Packit e20377
		  # delete all up to the first occurence of | in the ligword:
Packit e20377
		$cutpos = $pos - $runs++;
Packit e20377
		$right =~ s/^\\*([\|\"\ ]*[a-zäöüÄÖÜ\303\244\204\266\274\234\237]){$cutpos}//i;
Packit e20377
		  # remember the rest of the word (left side):
Packit e20377
		$left =~ s/($right)$//i;
Packit e20377
		  # glue left & right with "| :
Packit e20377
		$newword = "$left\"\|$right";
Packit e20377
		  # get the next(?) non-ligature:
Packit e20377
		$pos = index($ligword,'|',++$pos);
Packit e20377
	} until ($pos == -1);
Packit e20377
	$newword =~ s/qq/ß/g;    # to get back our "ß"
Packit e20377
	$newword =~ s/"z/"/g;    #
Packit e20377
	$newword =~ s/\"ae/ä/g;  #
Packit e20377
	$newword =~ s/\"AE/Ä/g;  #
Packit e20377
	$newword =~ s/\"oe/ö/g;  # ... and other umlauts
Packit e20377
	$newword =~ s/\"OE/Ö/g;  #
Packit e20377
	$newword =~ s/\"ue/ü/g;  #
Packit e20377
	$newword =~ s/\"UE/Ü/g;  #
Packit e20377
}
Packit e20377
Packit e20377
sub check_interactive {
Packit e20377
	if (not $opt_i) { return 1; }
Packit e20377
	my $a="";
Packit e20377
	do {
Packit e20377
		print STDOUT "Correct $word (l. $linenumber)? [y/n]";
Packit e20377
		read STDIN, $a, 1;
Packit e20377
		print "\n";
Packit e20377
	} until ($a =~ m/[yn]/);
Packit e20377
	return 1 if ($a eq "y");
Packit e20377
}
Packit e20377
Packit e20377
__END__