Blob Blame History Raw
#!/usr/bin/perl -- # -*- Perl -*-

# this needs some cleanup...

my $PSTOTEXT = "pstotext";

my $pdf = shift @ARGV;

my $index = "";
my $inindex = 0;
open (F, "$PSTOTEXT $pdf |");
while (<F>) {
    if (/^<\/index/) {
	$index .= $_;
	$inindex = 0;
    }
    $inindex = 1 if /^<index/;

    if ($inindex) {
	$index .= $_ if /^\s*</;
    }
}

my $cindex = "";
while ($index =~ /^(.*?)((<phrase role=\"pageno\">.*?<\/phrase>\s*)+)/s) {
    $cindex .= $1;
    $_ = $2;
    $index = $'; # '

    my @pages = m/<phrase role=\"pageno\">.*?<\/phrase>\s*/sg;

    # Expand ranges
    if ($#pages >= 0) {
	my @mpages = ();
	foreach my $page (@pages) {
	    my $pageno = &pageno($page);
	    if ($pageno =~ /^([0-9]+)[^0-9]([0-9]+)$/) { # funky -
		for (my $count = $1; $count <= $2; $count++) {
		    push (@mpages, "<phrase role=\"$pageno\">$count</phrase>");
		}
	    } else {
		push (@mpages, $page);
	    }
	}
	@pages = sort rangesort @mpages;
    }

    # Remove duplicates...
    if ($#pages > 0) {
	my @mpages = ();
	my $current = "";
	foreach my $page (@pages) {
	    my $pageno = &pageno($page);
	    if ($pageno ne $current) {
		push (@mpages, $page);
		$current = $pageno;
	    }
	}
	@pages = @mpages;
    }

    # Collapse ranges...
    if ($#pages > 1) {
	my @cpages = ();
	while (@pages) {
	    my $count = 0;
	    my $len = &rangelen($count, @pages);
	    if ($len <= 2) {
		my $page = shift @pages;
		push (@cpages, $page);
	    } else {
		my $fpage = shift @pages;
		my $lpage = "";
		while ($len > 1) {
		    $lpage = shift @pages;
		    $len--;
		}
		my $fpno = &pageno($fpage);
		my $lpno = &pageno($lpage);
		$fpage =~ s/>$fpno</>${fpno}-$lpno</s;
		push (@cpages, $fpage);
	    }
	}
	@pages = @cpages;
    }

    my $page = shift @pages;
    $page =~ s/\s*$//s;
    $cindex .= $page;
    while (@pages) {
	$page = shift @pages;
	$page =~ s/\s*$//s;
	$cindex .= ", $page";
    }
}
$cindex .= $index;

print "$cindex\n";

sub pageno {
    my $page = shift;

    $page =~ s/^<phrase.*?>//;
    $page =~ s/^<link.*?>//;

    return $1 if $page =~ /^([^<>]+)/;
    return "?";
}

sub rangesort {
    my $apno = &pageno($a);
    my $bpno = &pageno($b);

    # Make sure roman pages come before arabic ones, otherwise sort them in order
    return -1 if ($apno !~ /^\d+/ && $bpno =~ /^\d+/);
    return  1 if ($apno =~ /^\d+/ && $bpno !~ /^\d+/);
    return $apno <=> $bpno;
}

sub rangelen {
    my $count = shift;
    my @pages = @_;
    my $len = 1;
    my $inrange = 1;

    my $current = &pageno($pages[$count]);
    while ($count < $#pages && $inrange) {
	$count++;
	my $next = &pageno($pages[$count]);
	if ($current + 1 eq $next) {
	    $current = $next;
	    $inrange = 1;
	    $len++;
	} else {
	    $inrange = 0;
	}
    }

    return $len;
}