#! /bin/sh

. ${srcdir:=.}/t.frame

#
# Test common encoding-related capabilities, e.g.
# registering messages and database maintenance.
#

# initialize
# Empty the scratch directory (reset TXN detection).
# ${TMPDIR:?...} aborts the script when TMPDIR is unset or empty, so the glob
# can never expand to "/*"; -f keeps an already-empty directory (unmatched
# glob) from being reported as an error.
rm -rf "${TMPDIR:?TMPDIR must be set}"/*
# Translate the DB_TXN flag into the matching command-line option.
# Quoted so that an unset or empty DB_TXN selects the "else" branch instead
# of producing a test(1) syntax error.
if [ "$DB_TXN" = true ] ; then TXN=--db-transaction=yes ; else TXN= ; fi

# Base name of the sample message, which is written to "$TMPDIR"/$MSG.txt.
MSG="msg.accents"

cat <<_EOF > "$TMPDIR"/$MSG.txt
Subject: UTF-8 testing
Content-Type: text/html; charset=iso-8859-1

Accented characters:
    â château
    é Cédric
    í Cecília
    ó escribió
    ü Düsseldorf

Special Characters:
    0x92-’-windows-apostrophe
    0x93-“-windows-left-quote
    0x94-”-windows-right-quote
    0xA0- -no-break-space
    0xA1-¡-inverted-exclamation-mark
    0xA2-¢-cent-sign
    0xA3-£-pound-sign
    0xA4-¤-yen-sign
    0xA7-§-section-sign
    0xA9-©-copyright-sign
    0xAA-ª-feminine-ordinal-indicator
    0xAB-«-left-pointing-double-angle-quotation-mark
    0xAC-¬-not-sign
    0xAD-­-soft-hyphen
    0xAE-®-registered-sign
    0xAF-¯-macron
    0xB0-°-degree-sign
    0xB1-±-plus-minus-sign
    0xB2-²-superscript-two
    0xB3-³-superscript-three
    0xB5-µ-micro-sign
    0xB6-¶-pilcrow-sign
    0xB7-·-middle-dot
    0xB9-¹-superscript-one
    0xBA-º-masculine-ordinal-indicator
    0xBB-»-right-pointing-double-angle-quotation-mark
    0xBC-¼-inverted-question-mark
    0xD7-×-multiplication-sign
    0xF7-÷-division-sign
_EOF

# Today's date as YYYYMMDD; used further down when the checksum reference
# file is generated.
DATE=$(date +%Y%m%d)
# Directory that holds the word list, and the database file inside it.
BOGODIR="$TMPDIR/wordlist"
WORDLIST="$BOGODIR/wordlist.$DB_EXT"

# Create the word list directory unless it is already there.
[ -d "$BOGODIR" ] || mkdir "$BOGODIR"

# Register the sample message twice and dump the resulting word list each
# time, sorted, to wordlist.yes.txt / wordlist.no.txt:
#   --unicode=yes  uses utf-8
#   --unicode=no   uses iso-8859-1
# The directory is emptied before every pass, so after the loop it holds the
# word list of the last pass (--unicode=no).
for YN in yes no
do
    rm -fr "$BOGODIR"/*
    $BOGOFILTER -C -y 0 -n -d "$BOGODIR" --unicode=$YN < "$TMPDIR/$MSG.txt"
    $BOGOUTIL -C -y 0 -d "$WORDLIST" | sort > "$TMPDIR/wordlist.$YN.txt"
done

# Dump the current word list, sorted, to "$TMPDIR"/wordlist.<tag>.txt.
#   $1 - tag used in the output file name
_dump_wordlist() {
    $BOGOUTIL -C -y 0 -d "$WORDLIST" | sort > "$TMPDIR/wordlist.$1.txt"
}

# Snapshot of the word list as it currently is.
_dump_wordlist raw

# Re-encode that snapshot with iconv and rewrite the ".ENCODING 1" marker to
# ".ENCODING 2"; the reference checksums expect the result to equal the
# unicode dumps.
iconv -f iso-8859-1 -t utf-8 < "$TMPDIR/wordlist.raw.txt" |
    sed "s/.ENCODING 1/.ENCODING 2/" |
    sort > "$TMPDIR/wordlist.iconv.txt"

# convert from iso-8859-1 to unicode, then dump the converted list
$BOGOUTIL -C -y 0 -m "$WORDLIST" --unicode=yes
_dump_wordlist new

# convert from unicode back to iso-8859-1, then dump again
$BOGOUTIL -C -y 0 -m "$WORDLIST" --unicode=no
_dump_wordlist old

# Sanity-check the cksum utility before relying on it: for a known 16-byte
# file it must report the expected checksum and size.
echo "This is a test." > "$TMPDIR/empty.txt"
cksum "$TMPDIR/empty.txt" > "$TMPDIR/empty.sum" || {
    echo >&2 'cksum utility not found'
    # NOTE(review): 77 is presumably the harness's "skipped" status - confirm.
    exit 77
}
# Turn tabs into spaces and keep only the first two fields (checksum, size).
tr '\t' ' ' < "$TMPDIR/empty.sum" | cut -d ' ' -f 1-2 > "$TMPDIR/empty.sumonly"
echo "2711662207 16" | cmp -s - "$TMPDIR/empty.sumonly" || {
    echo >&2 'cksum utility not POSIX compliant!'
    exit 1
}

# Reference "checksum size name" lines for the six word list dumps. The
# here-document is fed straight to sed, which strips the first " <today's
# date>" occurrence from each line before the result is stored.
sed "s/ $DATE//" > "$TMPDIR/cksum.ref" <<EOF
1059676362 909 wordlist.iconv.txt
1059676362 909 wordlist.new.txt
3303802408 880 wordlist.no.txt
3303802408 880 wordlist.old.txt
3303802408 880 wordlist.raw.txt
1059676362 909 wordlist.yes.txt
EOF

# Record "checksum size name" for every dumped word list, one line per file.
# The whole loop writes through a single truncating redirection, so a
# cksum.out left over from an earlier run can never be appended to.
for FILE in "$TMPDIR"/wordlist.*.txt ; do
    # Keep the first two fields of the cksum output (checksum and size) and
    # follow them with the bare file name. $AWK is deliberately unquoted so
    # that it may carry arguments.
    cksum "$FILE" | $AWK '{ printf "%s %s ", $1, $2 }'
    basename "$FILE"
done > "$TMPDIR"/cksum.out

# Compare the computed checksums with the reference: cmp when verbose is 0,
# diff -s otherwise. An unset or empty verbose is treated as 0 instead of
# producing a test(1) syntax error.
if [ "${verbose:-0}" -eq 0 ] ; then
    cmp "$TMPDIR"/cksum.ref "$TMPDIR"/cksum.out
else
    diff -s "$TMPDIR"/cksum.ref "$TMPDIR"/cksum.out
fi