#! /bin/bash
#
# scramble -- split up files into entries assuming each entry
# starts with a given separator
# optionally, classify the entries
# produce, on stdout, a random list of entries and
# their locations and classes, or a stream of
# entries in random order, with classifiers if given
# needs: POSIX compliant sh, basename rm grep awk wc perl dd
# usage: see function usage() below
# version: 0.12 (Greg Louis <glouis@dynamicro.on.ca>)
# Note: on Solaris, use /usr/xpg4/bin/sh -- /bin/sh will not work.
# Process id of this shell; it is the suffix of the per-run work files
# (list.<pid> and shuf.<pid>) created in the current directory.
pid="$$"
# Values stored in $mode to select how the inputs are read.
mbox=1 # files holding many separator-delimited entries
mdir=2 # directories holding one entry per file
#######################################
# Print the help text on stdout and terminate the script.
# Globals:   iam (written) - basename of the running script
# Arguments: none
# Outputs:   the usage message, on stdout
# Returns:   never returns; exits with status 1
#######################################
usage() {
  # Quote $0 so a script path containing blanks is still reduced correctly.
  iam=$(basename "$0")
  cat <<EOF
Usage: $iam separator [-l] [-c classID] filename/directory [...]
   or: $iam -d [-l] [-c classID] directory [...]
 Files contain entries, each of which begins with
 text matching the separator. Entries are listed or
 output in random order.
Parameters:
 separator is a regex used by grep that matches the
 start of each entry.
 -d replaces the separator and selects maildir mode:
 every file in each named directory is one entry.
 -l indicates that the output is to be a list of
 entries. If this option is not given, the output
 consists of the entries themselves.
 classID is text used to identify the class of a
 given entry. If no classID values are specified,
 this field consists of a single . character. If
 classID values are given and the -l option is not
 used, each entry in the output stream is preceded
 by a single line of the form %%-CLASS-ClassID-%%.
 No one file may contain entries of more than one
 class.
 File and directory names may not contain blanks.
EOF
  # rm -f list.$pid
  exit 1
}
#######################################
# Main driver: pick the mode, collect the entry list, shuffle it, print it.
# Globals:   pid, mbox, mdir (read); sep, mode, stream, classID (written)
# Arguments: $1   - separator regex, or -d to select maildir mode
#            $2.. - options and file/directory names, handed unchanged to
#                   create_mbox_entries or create_maildir_entries
# Outputs:   the lines produced by output(), in random order, on stdout
# Files:     list.$pid and shuf.$pid are created in the current directory
#            and removed again on normal completion
#######################################
doit()
{
  # the first param is the separator (or -d); without it there is nothing to do
  if [ -z "$1" ]; then
    usage
  fi
  if [ "$1" != "-d" ] ; then
    sep="$1"
    mode="$mbox"
  else
    mode="$mdir"
  fi
  shift
  stream=1
  classID="."
  # The collectors only ever append, so start from an empty list: this drops
  # leftovers from an earlier run that reused this pid and guarantees the
  # file exists even when no file names were given.
  : >"list.$pid"
  # "$@" (not $*) keeps every argument as one word and stops the shell from
  # glob-expanding values such as a classID of '*'.
  if [ "$mode" = "$mbox" ] ; then # mbox
    create_mbox_entries "$@"
  else # maildir
    create_maildir_entries "$@"
  fi
  # Shuffle: every input line becomes a hash key with a random weight and
  # the keys are printed sorted by weight. Identical lines share one key,
  # so they are printed only once.
  output | perl \
    -e' srand ( time() ^ ($$ + ($$ << 15)) );' \
    -e' foreach $key (<>) {' \
    -e' $shuf{$key} = rand;' \
    -e' }' \
    -e' foreach $key (sort { $shuf{$b} <=> $shuf{$a} } keys %shuf ) {' \
    -e' print $key;' \
    -e' }' >"shuf.$pid"
  cat "shuf.$pid"
  # next line can be commented out for debugging
  rm -f "list.$pid" "shuf.$pid"
}
#######################################
# Record the byte offset of every entry start in each named file.
# For each file, one "classID file offset" line per separator match is
# appended to list.$pid, followed by one line holding the file's size.
# Globals:   sep, pid (read); stream, classID, file (written)
# Arguments: any mix of
#              -l          set stream=0 (list output requested)
#              -c classID  class recorded for the files that follow
#              filename    file to scan
# Outputs:   complaints about bad arguments go to stdout, then usage()
#######################################
create_mbox_entries()
{
  # get all the byte offsets in all the files, in one list
  while [ $# -gt 0 ]; do
    case "$1" in
      -l)
        stream=0
        shift
        continue
        ;;
      -c)
        # "shift 2" fails without shifting when -c is the last word, which
        # would leave this loop spinning forever; reject that case up front.
        if [ $# -lt 2 ]; then echo "-c requires a classID"; usage; fi
        classID=$2
        shift 2
        continue
        ;;
    esac
    file=$1 ; shift
    if [ ! -r "$file" ]; then echo "$file not found"; usage; fi
    # Use the caller's separator regex; fall back to the mbox "From " line
    # when none was set. grep -b prefixes each match with "offset:", so the
    # first colon-separated field is the entry's starting byte.
    # The label is handed to awk with -v instead of being pasted into the
    # program text, so quotes in a name cannot break the awk program.
    grep -a -b -e "${sep:-^From }" "$file" \
      | awk -F: -v label="$classID $file" '{print label " " $1}' >>"list.$pid"
    # The file size closes the last entry of this file.
    wc -c <"$file" \
      | awk -v label="$classID $file" '{print label " " $1}' >>"list.$pid"
  done
}
#######################################
# Record every file found directly inside each named directory.
# One "classID path" line per file is appended to list.$pid.
# Globals:   pid (read); stream, classID, file (written)
# Arguments: any mix of
#              -l          set stream=0 (list output requested)
#              -c classID  class recorded for the directories that follow
#              directory   directory whose files are the entries
# Outputs:   complaints about bad arguments go to stdout, then usage()
#######################################
create_maildir_entries()
{
  # list every file of every directory, in one list
  while [ $# -gt 0 ]; do
    case "$1" in
      -l)
        stream=0
        shift
        continue
        ;;
      -c)
        # "shift 2" fails without shifting when -c is the last word, which
        # would leave this loop spinning forever; reject that case up front.
        if [ $# -lt 2 ]; then echo "-c requires a classID"; usage; fi
        classID=$2
        shift 2
        continue
        ;;
    esac
    file=$1 ; shift
    if [ ! -r "$file" ]; then echo "$file not found"; usage; fi
    for fnam in "$file"/* ; do
      # In an empty directory the glob stays unexpanded; do not record the
      # literal pattern as if it were an entry. Dangling symlinks are kept.
      [ -e "$fnam" ] || [ -L "$fnam" ] || continue
      # printf with quoted operands: a classID such as '*' is written as is
      # instead of being glob-expanded by the shell.
      printf '%s %s\n' "$classID" "$fnam"
    done >>"list.$pid"
  done
}
#######################################
# Turn the offset list in list.$pid into one line per entry.
# Input lines are "classID file offset"; consecutive lines naming the same
# file delimit an entry, printed as "classID file start length".
# The first line seen for a file only opens that file: its entry is taken
# to start at byte 0. Zero-length entries are not printed.
# Globals:   pid (read); classID, fnam, offset, file, oldoff, length (written)
# Arguments: none
# Outputs:   entry lines on stdout
#######################################
output_mbox_entries()
{
  file=""
  # read -r keeps backslashes in file names intact
  while read -r classID fnam offset; do
    if [ "$fnam" = "$file" ]; then
      # POSIX arithmetic expansion instead of the non-POSIX "let"
      length=$((offset - oldoff))
      if [ "$length" -gt 0 ] ; then
        printf '%s %s %s %s\n' "$classID" "$fnam" "$oldoff" "$length"
      fi
      oldoff=$offset
    else
      # a new file starts here
      file=$fnam
      oldoff=0
    fi
  done < "list.$pid"
}
# Maildir mode: list.$pid already holds one line per entry, so it is
# copied to stdout unchanged.
output_maildir_entries()
{
  cat "list.${pid}"
}
# Write the entry list on stdout using the formatter that matches $mode:
# the mbox formatter when $mode equals $mbox, the maildir one otherwise.
output()
{
  case "$mode" in
    "$mbox") # mbox
      output_mbox_entries
      ;;
    *) # maildir
      output_maildir_entries
      ;;
  esac
}
# Entry point: hand the command-line arguments, word for word, to doit.
doit "$@"