#!/bin/sh
#
# Prints statistics about the word-list source files and the compiled
# dictionary found in the current directory.

# Clear the locale variables so the text tools used below do not depend on
# the caller's locale settings.
unset LANG LC_CTYPE LC_ALL LC_COLLATE

# Run the default make target first. Everything below reads files from this
# directory, so stop on failure instead of reporting numbers for missing or
# stale files.
make || exit 1
echo

echo "Statistics on input files:"
echo "--------------------------"

# printf is used instead of "echo -n": the behaviour of "echo -n" is not
# guaranteed for /bin/sh, and some shells print the "-n" literally.

# For each .dat file, count the lines that contain the given tag letter
# (preceded by a space) and do not start with '#'.
printf 'wolig.dat: %s noun lines, %s adjective lines.\n' \
  "$(grep " ע" wolig.dat | grep -vc '^#')" \
  "$(grep " ת" wolig.dat | grep -vc '^#')"
printf 'woo.dat: %s verb lines.\n' \
  "$(grep " פ" woo.dat | grep -vc '^#')"
printf 'shemp.dat: %s auto-generated gerunds.\n' \
  "$(grep " ע" shemp.dat | grep -vc '^#')"

# For the .hif files, count the lines that do not start with '-' or '#'
# (extrawords.hif additionally skips empty lines).
#
# NOTE(review): the biza-nouns.hif count has no -v, unlike the three counts
# before it, so it counts the lines that DO start with '-' or '#'. Kept as it
# was; confirm whether that is intended.
printf 'misc data lines: %s extrawords, %s milot, %s bizaverbs, %s bizanouns. \n' \
  "$(grep -E -hcv '^[-#]|^$' extrawords.hif)" \
  "$(grep -hcv '^[-#]' milot.hif)" \
  "$(grep -hcv '^[-#]' biza-verbs.hif)" \
  "$(grep -hc '^[-#]' biza-nouns.hif)"
echo

echo "Unique baseword counts:"
echo "-----------------------"

# Prints the distinct " ע" entries of wolig.dat and shemp.dat, one per line:
# lines starting with '#' are dropped and a trailing "# ..." comment is cut
# off before duplicates are removed.
noun_entries() {
  grep -h " ע" wolig.dat shemp.dat | sed '/^#/d;s/ *#.*$//' | sort -u
}

# Some wc implementations pad their output with blanks; passing the result
# through $(( )) strips the padding so the value can be quoted safely.
NN=$(( $(noun_entries | wc -l) ))
# Entries that do not end right after the tag letter.
NN1=$(noun_entries | grep -vc 'ע$')
# wolig.dat lines (comments removed) that carry one of the two gender words.
NN2=$(sed 's/#.*$//' < wolig.dat | grep -E ',(זכר|נקבה)' | grep -c 'ע,')
# Entries that carry one of the listed plural markers.
NN3=$(noun_entries | grep -Ec ',(יחיד|רבים|ות|ים|יים|אות)')
printf 'Nouns: %s (of them, %s need plural hints, %s need inflection hints, %s explicit gender).\n' \
  "$NN" "$NN3" "$NN1" "$NN2"

# Count the lines of verbs.hif that contain "----".
VV=$(grep -c -- ---- verbs.hif)
printf 'Verbs: %s\n' "$VV"

AA=$(( $(grep " ת" wolig.dat | grep -v '^#' | sed 's/ *#.*$//' \
  | sort -u | wc -l) ))
printf 'Adjectives: %s\n' "$AA"

# Distinct non-comment, non-separator lines of the .hif files, with trailing
# comments and every '-' character removed.
EE=$(( $(grep -hv '^[-#]' extrawords.hif milot.hif biza-verbs.hif biza-nouns.hif \
  | sed 's/ *#.*$//' | tr -d - | sort -u | wc -l) ))
printf 'Other words: %s\n' "$EE"
echo
# Shell arithmetic replaces expr; VV falls back to 0 if grep printed nothing
# (e.g. verbs.hif is missing), which made the expr call a syntax error.
printf 'Total number of base words - %s\n' "$((NN + ${VV:-0} + AA + EE))"
echo
echo "Final word count:"
echo "-----------------"

# we can count words in hebrew.wgz even without compiling wunzip :)
# Every digit in the decompressed stream is turned into a line break and the
# non-empty lines are counted.
# The tr set is quoted so the shell cannot expand [0-9] against a file whose
# name is a single digit. gzip -dc is used rather than zcat because some zcat
# implementations only handle .Z files; it also matches the call further down.
WW=$(gzip -dc hebrew.wgz | tr '[0-9]' '\012' | grep -vc '^$')
printf 'Unique words in hebrew.wgz: %s\n' "$WW"
echo "Dictionary file sizes (in bytes):"
# The glob is intentional: it lists hebrew.wgz and every file whose name
# starts with it.
wc -c hebrew.wgz*
echo "Memory use (spell-checker only):"
# Standard output of find_sizes is discarded; presumably it reports on
# standard error (confirm against find_sizes itself).
gzip -dc hebrew.wgz | ./find_sizes >/dev/null
# NOTE: to find duplicates in wolig.dat:
# grep " ע" wolig.dat | grep -v "^#"| sed "s/ *#.*$//"|sort |uniq -c | sort -n | less