Tree - source-git/hunspell-qu - CentOS Git server

source-git / hunspell-qu

Files

Blob Blame History Raw
#!/bin/bash
# ----------------------------------------------------------------------------------
#
#    Shukllachiska Kichwa del Ecuador, hunspell format dictionary generator
#    Copyright (C) 2009 Arno Teigseth, Henry David Lara
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU Affero General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU Affero General Public License for more details.
#
#    You should have received a copy of the GNU Affero General Public License
#    along with this program. See LICENSE.txt. If not, see <http://www.gnu.org/licenses/>.
#
#    The author(s) can be contacted at arno at teigseth dot no
#
# ----------------------------------------------------------------------------------
# 
# This is *NOT* the .dic file, but a script to generate such a file, taking the
# words from qu_EC.dic.MASTER. See the file qu_EC.dic.MASTER for dictionary creation rules.
#
# Generate qu_EC.dic file

# Work files (left in /tmp after the run) and input/output file names
TMPFILE="/tmp/quiwork"            # scratch: cleaned master list, later the merged word list
VFILE="/tmp/quiworkVerbs"         # verb entries, including the generated -yana verbs
AFILE="/tmp/quiworkADJs"          # adjective entries taken from the master list
OFILE="/tmp/quiworkOther"         # entries that are neither verbs nor adjectives
OUTFILE="qu_EC.dic"               # generated dictionary
MASTERFILE="qu_EC.dic.MASTER"     # hand-maintained word list the dictionary is built from

#Functions
 
 # fixtilde ENTRY
 #
 # Print the dictionary line(s) for one entry of the form WORD[/FLAGS].
 #   - If ENTRY contains one of the accented vowels á ó é ú í, two lines are
 #     printed: the accented word followed by "/" and the unaccented word
 #     followed by "//x". Everything from the first "/" of ENTRY onwards is
 #     dropped in both lines.
 #   - Otherwise ENTRY is printed unchanged.
 # Arguments: $1 - the entry
 # Outputs:   one or two lines on stdout
 fixtilde ()
 {
   local entry=$1
   local word plain

   # One literal alternative per vowel instead of a bracket expression: this
   # compares bytes, so it neither depends on the locale nor can be expanded
   # as a filename pattern.
   case $entry in
     *á*|*ó*|*é*|*ú*|*í*)
       # Keep only the word itself (cut at the first slash)
       word=${entry%%/*}

       # Same word with the accents removed
       plain=${word//á/a}
       plain=${plain//ó/o}
       plain=${plain//é/e}
       plain=${plain//ú/u}
       plain=${plain//í/i}

       # Print non-compound-version
       printf '%s/\n' "$word"
       # Print compound-version
       printf '%s//x\n' "$plain"
       ;;
     *)
       # No accents, just print the entry
       printf '%s\n' "$entry"
       ;;
   esac
 }

 # fixname NAME
 #
 # Print the dictionary line(s) for one name.
 #   - If NAME contains one of the accented vowels á ó é ú í, two lines are
 #     printed: NAME followed by "/" and the unaccented NAME followed by "//x".
 #   - Otherwise NAME followed by "//" is printed.
 # Arguments: $1 - the name
 # Outputs:   one or two lines on stdout
 fixname ()
 {
   local name=$1
   local plain

   # One literal alternative per vowel instead of a bracket expression: this
   # compares bytes, so it neither depends on the locale nor can be expanded
   # as a filename pattern.
   case $name in
     *á*|*ó*|*é*|*ú*|*í*)
       # Same name with the accents removed
       plain=${name//á/a}
       plain=${plain//ó/o}
       plain=${plain//é/e}
       plain=${plain//ú/u}
       plain=${plain//í/i}

       # Print non-compound-version
       printf '%s/\n' "$name"
       # Print compound-version
       printf '%s//x\n' "$plain"
       ;;
     *)
       # No accents, just print the entry
       printf '%s//\n' "$name"
       ;;
   esac
 }


 # variant ENTRY
 #
 # Create infix variants of one verb.
 #
 # ENTRY has the form VERB//RULES. The base verb is always printed as
 # "VERB//v". The stem is VERB without a trailing "na"; for every rule
 # character found in RULES the matching infixed forms "STEM<infix>na//v"
 # are printed as well:
 #
 #   r  -ri-            ,  -mu-          h  -chi-
 #   >  -kri-           -  -ku-          w  -wa-
 #   +  -ra- / -riya-
 #
 # Combined infixes (-rimu-, -kumuri-, -chiku-, ...) are printed only when
 # all rule characters involved are present.
 #
 # Arguments: $1 - the verb entry
 # Outputs:   one "...//v" line per generated form on stdout
 variant ()
 {
   local verb stem rules

   verb=${1%%//*}     # Remove the first // and whatever follows.
   stem=${verb%na}    # Stem without -na
   rules=${1##*//}    # Remove everything up to and including the last //

   # OUT Base verb -na
   printf '%s\n' "${verb}//v"

   # OUT -ri- infix and variants
   if [[ $rules == *r* ]]; then
     printf '%s\n' "${stem}rina//v"

     # OUT -rimu- infix
     if [[ $rules == *,* ]]; then
       printf '%s\n' "${stem}rimuna//v"
     fi

     # OUT -richi- infix
     if [[ $rules == *h* ]]; then
       printf '%s\n' "${stem}richina//v"
     fi

     # OUT -rikri- infix
     # 2sam 10:10 "tuparikrichun"
     if [[ $rules == *\>* ]]; then
       printf '%s\n' "${stem}rikrina//v"
     fi

     # OUT -riku- infix and variants
     if [[ $rules == *-* ]]; then
       printf '%s\n' "${stem}rikuna//v"
     fi
   fi

   # OUT -kri- infix and variants
   if [[ $rules == *\>* ]]; then
     printf '%s\n' "${stem}krina//v"
   fi

   # OUT -ra- infix and variants (-ra- and -riya- are equivalent)
   if [[ $rules == *+* ]]; then
     printf '%s\n' "${stem}rana//v" "${stem}riyana//v"
   fi

   # OUT -ku- infix and variants
   if [[ $rules == *-* ]]; then
     printf '%s\n' "${stem}kuna//v" "${stem}nakuna//v"

     # OUT -kumu- infix and variants
     if [[ $rules == *,* ]]; then
       printf '%s\n' "${stem}kumuna//v"

       # OUT -kumuri- infix and variants
       # (my lesson 77: "uraykumurirkakunachu")
       if [[ $rules == *r* ]]; then
         printf '%s\n' "${stem}kumurina//v"
       fi
     fi

     # OUT -kuchi- infix
     if [[ $rules == *h* ]]; then
       printf '%s\n' "${stem}kuchina//v"
     fi
   fi

   # OUT -mu- infix and variants
   if [[ $rules == *,* ]]; then
     printf '%s\n' "${stem}muna//v"

     # OUT -muwa- infix
     if [[ $rules == *w* ]]; then
       printf '%s\n' "${stem}muwana//v"
     fi

     # OUT -muku- infix
     if [[ $rules == *-* ]]; then
       printf '%s\n' "${stem}mukuna//v" "${stem}namukuna//v"
     fi
   fi

   # OUT -wa- infix and variants
   if [[ $rules == *w* ]]; then
     printf '%s\n' "${stem}wana//v"
   fi

   # OUT -chi- infix and variants
   if [[ $rules == *h* ]]; then
     printf '%s\n' "${stem}china//v"

     # OUT -chiwa- infix
     if [[ $rules == *w* ]]; then
       printf '%s\n' "${stem}chiwana//v"
     fi

     # OUT -chiri- infix
     # 2sam 10:8 "alli|chiri|nakurka"
     if [[ $rules == *r* ]]; then
       printf '%s\n' "${stem}chirina//v"
     fi

     # OUT -chiku- infix
     if [[ $rules == *-* ]]; then
       printf '%s\n' "${stem}chikuna//v" "${stem}chinakuna//v"
     fi

     # OUT -chikri- infix and variants
     if [[ $rules == *\>* ]]; then
       printf '%s\n' "${stem}chikrina//v"
     fi
   fi
 }

## END OF FUNCTIONS


#
# Without the master list every later step would run on empty input and
# $OUTFILE would be overwritten with a near-empty dictionary, so stop early.
if [[ ! -r $MASTERFILE ]]; then
  echo "Cannot read $MASTERFILE" >&2
  exit 1
fi

# The loops below split their input into words on purpose (one entry per
# word). Entries are data, never filename patterns, so pathname expansion is
# switched off while the dictionary is generated.
set -f

# Start with an empty "other words" file. Truncating instead of removing it
# works on the first run too and guarantees the file exists for the merge.
: > "$OFILE"

#  Prep: remove comments and rtrim whitespace
sed -e 's/#.*$//' -e 's/ *$//' "$MASTERFILE" | grep -v '^$' > "$TMPFILE"


# Find verbs
echo Forking verbs
grep 'v$' "$TMPFILE" > "$VFILE"

# Find adjectives
echo Forking adjectives
# "ungido" is an adjective, but the corresponding verb is
# "ungina", NOT "ungidoyana"
#
# NOTE using MASTERFILE not TMPFILE here: the '#NOCONJ' marker lives in a
# comment, and comments are already stripped from TMPFILE.
grep '/a' "$MASTERFILE" \
  | grep -v '#NOCONJ' \
  | sed -e 's/#.*$//' -e 's/ *$//' \
  | grep -v '^$' \
  | grep 'a$' > "$AFILE"

echo Adding adjectives -yana form to verbs file
# Replace everything from the first slash with "yana" plus the rule string
sed 's|/.*$|yana//r>+-,whv|' "$AFILE" >> "$VFILE"

# Add alli->alliYANA
#cat $AFILE |sed 's/\/.*$//'|grep -v k$ |sed 's/$/yana\/\/r\>+-,whv/' >> $VFILE

# achik->achiYANA (not achiKyana)
#cat $AFILE |sed 's/\/.*$//'|grep k$ |sed 's/k$/yana\/\/r\>+-,whv/' >> $VFILE


#echo Adding adjectives -naya form to dic file
#cat $AFILE |sed 's/\/.*$/naya\/\//' > $OFILE
#
# -naya is really for nouns:
# yakunaya = thirsty


# Find other words
echo 'Writing non-[verb/adjective]s'

for n in $(grep -v 'v$' "$TMPFILE" | grep -v 'a$'); do
  # The words from MASTER ending in /x should really go out here

  #echo Adding word $n
  printf '.'
  fixtilde "$n" >> "$OFILE"
done


# used to be just
# cat $TMPFILE | grep -v v$ >> $OFILE
# but cárcel should be added as two:
# cárcel/
# carcel//x



# Prep outfile: from here on TMPFILE collects the generated word list
: > "$TMPFILE"

# Put infixes onto verbs, into outfile.
for i in $(cat "$VFILE"); do
  echo "Creating variants of $i"
  variant "$i" >> "$TMPFILE" # Create variants of verb, according to the rules
done

# Append names
# NOTE(review): names go through fixtilde although a fixname function
# exists - confirm which one is intended.
for n in $(cat names.txt); do
  echo "Adding name $n"
  fixtilde "$n" >> "$TMPFILE"
done

# Create outfile
cat "$OFILE" >> "$TMPFILE"

# Count it: the first line of the output is the number of unique entries
sort -u "$TMPFILE" | grep -c '$' > "$OUTFILE"

# Sort it, removing duplicates
sort -u "$TMPFILE" >> "$OUTFILE"

# Generation done, restore normal pathname expansion
set +f


#Clean up
#rm $TMPFILE
#rm $VFILE
#rm $AFILE
#rm $OFILE
source-git / hunspell-qu

Source Code

Files