diff --git a/NEWS b/NEWS new file mode 100644 index 0000000..391681b --- /dev/null +++ b/NEWS @@ -0,0 +1,482 @@ +Changes in the Norwegian spell checking package + +Release 2.0.10 (2008-03-XX) + + * Added . after every synonym and put them all into synonymer-nb.txt and + not in norsk.words, from where they are removed. + --Moved: bm, bnr, cand, dvs, etc, ev, mag, osv, pga + --Changed: 'eg * 21 B. to 'eg *' (to only use it in nynorsk) + 'mm *' to 'mm * 0 B' (to use it also in bokm�l) + --Removed mao and phil + --Removed from forkort-nb.txt: + Ph.D., S.u., a.a., a.a.C.a., a.m., c.c., cand.occon., cand.rer.polit., + f.o., fr.o.m., f�r., f�re., h.o.h., h.t., i.l., k.o., m.a., mu.h., + o.dyl., pr.pr., r.p., res.kap., s.st., s.�., sq.in., stud.rer.polit., + v.hj.a., z.B., + --Added in forkort-nb.txt: + A.C., KrF., Pb., Ph.D., S.s.v., S.u., a.a.C.n., adm., adr., am., bill., + bl., bm., cand.hort., cand.oecon., cand.polit. cand.san., cand.scient., + cand.sociol, dir., disp., div., dr., dr.art., dr.oecon., dr.scient., + dvs., eg., el., etc., ev., fil., fm., forb., forf., fork., forl., forr., + forts., fr., gl., gram., hoh., ill., jr., kl., kr., lat., laud., lev., + lign., likn., litt., l�., ma., mag., maks., mat., mat.nat., mfl., muh., + mus., mva., nat., nr., n�dv., obj., obl., obs., omarb., omg., omtr., on., + oppr., org., orig., osv., pga., poet., pol., pr., priv., prod., prof., + pron., prot., psyk., pt., q.v., rel., s.�., siviling., sms., sos., + sos.dem., sovj., sst., st., stip., stud., stud.san., subj., subst., s�k., + s�., tekn., teol., ti., tlf., to., ub., ubest., ug., univ., utg., utt., + uttr., v.v., vgs., vha., vs., �rg., �rh., �rl., �v., �.l., �kol., �kon ., + + * Add aspell-phonect.dat from Olaf Havnes. Created initial install rules + for it. The install rules need to be checked. 
+ * Added forms rykker-brevs, rykker-brevene, rykker-brevenes, and marked + all forms of this word as K (conservative) (nb) + * Added word separator for all words containing Inter-nett. Removed + duplicate entry for 'Internett'. Did the same for av-montere, + av-montering, frem-over, føre-setnad, inne-held, inne-halde. + * Removed duplicate entries for av-leie, ignorerande, nettverks-tenesta, + nettverks-teneste, nettverks-tenestene, nettverks-tenester, + rose-maler, skole-medisiner, stor-spiller, tjukkas. + * Imported thesaurus for bokmål from synonymer.merg.net. + * Add rules to build thesaurus for OOo v2. + * Updated dictionary server used by the bokmaal script from + www.dokpro-test.uio.no to www.dokpro.uio.no. + * Added 623500 new words from no.speling.org, most of them imported + to no.speling.org from the Norwegian project ordbanken. + +Release 2.0.9 (2007-02-19) + + * Updated nb frequency information for words with zero as their + frequency value based on info received from Kevin Patrick Scannell + and the An Crúbadán project. + * Add two make targets speling-new.nb and speling-new.nn to make + lists of new words based on the no.speling.org database. + * Updated the nb and nn thesaurus from the no.speling.org system. + * Correct handling of " when making aspell dictionary. + * Add combined word marker (-) for words including sær-kull. + * Rename forkort.txt to forkort-nb.txt to make it more obvious how to + make a similar file for nn. Created forkort-nn.txt and adjusted + build rules to use it. 
+ * New words from the no.speling.org database: + - CD (nb,nn) + - CDane (nn) + - CDar (nn) + - DHCP (nb,nn) + - DHCP-vertsnamn (nn) + - Internett (nb,nn) + - LP (nb,nn) + - angrast (nn) + - annen (nn) + - arv-tager (nb) + - automat-pistol (nb) + - avleia (nn) + - avleie (nn) + - avmontere (nb,nn) + - avmonterer (nb,nn) + - avmontering (nb,nn) + - avslutt (nn) + - bilde-samling (nb) + - bygge (nn) + - byt (nn) + - defekte (nb) + - dela (nn) + - drifts-huset (nb) + - drivar (nn) + - drivar-diskett (nn) + - drivar-diskettar (nn) + - drivar-disketten (nn) + - drivarane (nn) + - drivarar (nn) + - drivaren (nn) + - dyre-elskere (nb) + - d�mer (nn) + - empati (nb) + - est (nb) + - etanol (nb) + - fag-felle-vurdert (nb,nn), fag-felle-vurdering (nb,nn) + - ferie-bilen (nb) + - fil-nedlasting (nb,nn) + - fil-nummer (nn) + - fil-nummeret (nn) + - fil-omr�da (nn) + - fil-omr�de (nn) + - fil-omr�der (nn) + - fil-omr�det (nn) + - fil-system (nn) + - fil-system-blokk (nb,nn) + - fil-system-blokka (nn) + - fil-system-blokkene (nb,nn) + - fil-system-parameter (nb,nn) + - fil-system-tre (nb,nn) + - fil-system-treet (nb,nn) + - fil-system-type (nb,nn) + - fil-system-typen (nb,nn) + - fil-systema (nn) + - fil-systemer (nn) + - fil-systemet (nn) + - fil-tilgang (nn) + - fil-tilgangen (nn) + - finans-departement (nb), finans-departementet (nb) + - finna (nn) + - flikk-arbeid (nb) + - formatera (nn) + - forsknings-departementet (nb) + - fritere (nb) + - fungera (nn) + - f�rehands-oppsetts-fila (nn) + - f�resetnad (nn) + - f�resetnadane (nn) + - f�resetnadar (nn) + - f�resetnaden (nn) + - gjentakande (nn) + - gjentake (nn) + - greit (nn) + - halda (nn) + - haldar (nn) + - haldarar (nn) + - haldaren (nn) + - halo (nb) + - handterar (nn) + - handterarane (nn) + - handterarar (nn) + - handteraren (nn) + - hopp-mesterskap (nb), hopp-mesterskapet (nb) + - hovud-distribusjon (nn) + - hovud-distribusjonar (nn) + - hovud-distribusjonen (nn) + - hovud-oppsett (nn) + - hovud-oppsetta (nn) + - 
hovud-oppsettet (nn) + - hovud-oppsetts-fil (nn) + - hovud-oppsetts-fila (nn) + - hovud-oppsetts-filene (nn) + - hovud-oppsetts-filer (nn) + - hovud-oppstarts-spor (nn) + - hovud-oppstarts-spora (nn) + - hovud-oppstarts-sporet (nn) + - hus-bil (nb), hus-bilen (nb), hus-bilene (nb) + - hukre (nb) + - hvile-stedene (nb) + - idretts-skyttere (nb) + - ignorer (nn) + - ignorerande (nn) + - inert-gass (nb) + - inert-gassen (nb) + - inert-gassene (nb) + - inert-gassenes (nb) + - inert-gassens (nb) + - inert-gasser (nb) + - innehalda (nn) + - innehalde (nn) + - innehaldt (nn) + - inneheld (nn) + - innkj�ps-ansvarlig (nb) + - innovativt (nb) + - innskudds-kontoer (nb) + - installasjons-CD (nb,nn) + - installasjons-CDar (nn) + - installasjons-CDen (nb,nn) + - installasjons-CDer (nb) + - installera (nn) + - installerbar (nb,nn) + - installerbare (nb,nn) + - investerings-foretaka (nb), investerings-foretakas (nb) + - ip-adresse (nb,nn) + - jemtlending (nb) + - j�vel (nb) + - kammer-musikk-arrangementet (nb) + - kjerna (nn) + - kjerne-modul-pakka (nb,nn) + - kjerne-modul-pakkar (nn) + - kjerne-modul-pakke (nb,nn) + - kjerne-modul-pakker (nb,nn) + - kj�pe-videoene (nb), kj�pe-videoenes (nb), kj�pe-videoer (nb), kj�pe-videoers (nb), kj�pe-videos (nb) + - kontroll-tids-punkt (nb), kontroll-tids-punktet (nb) + - kontrollera (nn) + - koordinators (nb) + - kopier (nn) + - kopiera (nn) + - kring-kastings-nettene (nb) + - kvart�rs (nn) + - leggja (nn) + - leggjast (nn) + - leggje (nn) + - lesa (nn) + - ligge (nn) + - linux (nb,nn) + - lisens-kostnad (nb), lisens-kostnaden (nb), lisens-kostnadene (nb), lisens-kostnader (nb) + - lyn-melding (nb), lyn-meldingen (nb), lyn-meldingene (nb), lyn-meldinger (nb) + - l�s-fart (nb) + - l�ysa (nn) + - mega (nn) + - mega-byte (nn) + - mega-hertz (nn) + - mega-konsert (nn) + - mega-konserten (nn) + - mellom-lager (nn) + - mellom-lageret (nn) + - meter-bylgja (nn) + - meter-bylgje (nn) + - meter-bylgjene (nn) + - meter-bylgjer (nn) + - mikrokode-niv� 
(nb,nn) + - mikrokode-niv�a (nb,nn) + - mikrokode-niv�ene (nb) + - mikrokode-niv�et (nb,nn) + - milit�r-drakt (nb) + - montera (nn) + - multidisk-eining (nn), multidisk-eininga (nn), multidisk-einingane (nn), multidisk-einingar (nn) + - m�lbarhet (nb) + - ned-rustnings-meldinga (nb) + - nede-tid (nb), nede-tiden (nb) + - nettenar (nn), nettenarane (nn), nettenarar (nn), nettenaren (nn) + - nettverks-konsoll (nb,nn) + - nettverks-konsollane (nn) + - nettverks-konsollar (nn) + - nettverks-konsollen (nb,nn) + - nettverks-konsollene (nb) + - nettverks-konsoller (nb) + - nettverks-tenesta (nn) + - nettverks-teneste (nn) + - nettverks-tenestene (nn) + - nettverks-tenester (nn) + - notasjon (nn), notasjonane (nn), notasjonar (nn), notasjonen (nn) + - n�kkel-ID (nb,nn) + - n�kkel-IDane (nn) + - n�kkel-IDar (nn) + - n�kkel-IDen (nb,nn) + - n�kkel-IDene (nb) + - n�kkel-IDer (nb) + - oppdater (nn) + - opprinnelses-land (nb) + - oppsto (nn) + - parti-sammenhengen (nb), parti-sammenhengene (nb), parti-sammenhengenes (nb), parti-sammenhengens (nb), parti-sammenhenger (nb), parti-sammenhengers (nb), parti-sammenhengs (nb) + - pass� (nb) + - penge-sjefen (nb) + - penge-tap (nb) + - pris-redusert (nb), pris-reduserte (nb) + - produkt-ansvarlig (nb) + - propriet�re (nb) + - p�-kravd (nn), p�-kravde (nn) + - redusera (nn) + - reise-bilen (nb) + - respons-tider (nb) + - sammen-filtring (nb) + - senda (nn) + - setja (nn) + - siffera (nn) + - sjekksum-fil (nb,nn), sjekksum-fila (nb,nn) + - ski-anlegg (nb), ski-anlegget (nb) + - slettast (nn) + - sokalla (nn) + - steg-vis (nb) + - str�m-kundenes (nb), str�m-kunders (nb), str�m-kundes (nb) + - suge-kamrene (nb) + - skjedenes (nb), skjeders (nb), skjedes (nb) + - t.d. 
(nn) + - test-program (nb) + - trenga (nn) + - tross (nn) + - trykkja (nn) + - ulvinne (nb) + - uoppretta (nn) + - ut-rulling (nb) + - utdata (nn) + - utkommentere (nb,nn), utkommenterer (nb,nn), utkommenterter (nb,nn) + - utl�ns-virksomheta (nb), utl�ns-virksomhetas (nb), utl�ns-virksomhetene (nb), utl�ns-virksomhetenes (nb), utl�ns-virksomheter (nb), utl�ns-virksomheters (nb), utl�ns-virksomhets (nb) + - utstr�la (nn) + - variera (nn) + - vei-viserne (nb) + - ven (nb) + - verdens-makta (nb) + - verdens-systemet (nb) + - verifisera (nn) + - versjons-kontroll (nb) + - vinsjenes (nb), vinsjers (nb), vinsjs (nb) + - �dsleriet (nb) + - �ydeleggja (nn) + + * Removed words + - leggte (nn) + - leggtja (nn) + - leggtjst (nn) + +Release 2.0.8 (2006-06-30) + + * Found out how to make the OOo-wizard in OOo 1.x find the thesaurus + files for nynorsk and bokm�l and install them properly. + OOo 2.0 will still only find the bokm�l package, because this is a + separate file in another format. + * Made some minor improvements on the documentation files. + +Release 2.0.7 (2006-06-30) + + * Added a beta version of a bokm�l thesarus for OOo 1.x and 2.x, + downloaded from . + * Translated the README-files for the thesarus-packages to norwegian. + * Added a very small nn thesaurus generated with data from the + no.speling.org system (just a alfa version, mostly for testing :-). + * Removed filter in Makefile for the source file from no.speling.org, + as the errors are fixed upstream. + * Updated the nb thesaurus from the no.speling.org system. + * Lower the nb frequency cutoff point from >0 to >=0, to get even + more words included in the spell check systems for nb. This + increases the number of nb from 352055 to 545714. I believe this + will include all nb words. + + * New words: + - m�le-l�p (nb,nn), m�le-l�pa (nn), m�le-l�pene (nb), m�le-l�penes (nb), + m�le-l�pet (nn), m�le-l�pets (nb, nn). 
Frequency 0 as none of + these words are listed in the word frequency lists we have access + to. Thanks to Jens Blix for reporting this missing word. + - i (nb,nn). Frequency set to 31 as it is a very common word. + Thanks to Lars Oftedal for discovering this missing word. + - økse-morder (nb), økse-mordere (nb), økse-morderen (nb), + økse-morderens (nb), økse-morderes (nb), økse-morderne (nb), + økse-mordernes (nb), økse-morders (nb), økse-mordene (nb), + økse-mordenes (nb). Freq 0 until updated with real info. Thanks + to Lars Oftedal for discovering that øksemorder was missing. + + * Changed words: + - Allow økse-mord and økse-mordet for nb as well. Freq 0 for now. + - Made 'å' into a valid nb word (as well as the existing nn + setting), and set the frequency info to 31 as it is a common + word. + +Release 2.0.6 (2006-02-24) + + * Update the nb thesaurus from the no.speling.org system. + * Rewrite myspell and OOo install rule for thesaurus to use 'th_' + prefix on the files. + +Release 2.0.5 (2006-02-19) + + * Drop radical samnorsk 'S' class words from the nb dictionary. + * Update nohyphb.tex with the improved version nohyphbx.tex from + . + * Update the new nohyphb.tex to handle 8bit chars consistently, and + make it easier to transform automatically. + * Add build rule to generate OOo-formatted hyphenation format from + nohyphb.tex. Include this generated file in the OOo-package + instead of the files in ooo-hyph/. + * Update the thesaurus from the no.speling.org system. + +Release 2.0.4 (2006-02-16) + + * Generate the thesaurus from the no.speling.org system, instead + of maintaining the list manually. Only include non-controversial + words. All the previous synonyms are now available from there. + * Rewrite myspell build to use the munched words from the ispell + build, to reduce the build time. + * Added build rule 'ooo-dist' to make an OpenOffice.org spellcheck + package. 
+ * New words: + - sol-ur (nb,nn), sol-urene (nb,nn), sol-uret (nb,nn), sol-urets (nb,nn) + +Release 2.0.3 (2006-01-15) + + * Update character frequency information used by myspell, and copy a + few AFF lines from the OOo spell-checking package. + * Updated nb frequency information for words with zero as their + frequency value based on info received from Kevin Patrick Scannell + and the An Cr�bad�n project. This added 583 words to the nb spell + checker. + * Rewrite myspell build rules to use the tools from the myspell + package. Now uses 'munch' the full list of nb and nn words to + generate the dictionary files and 'ispellaff2myspell' from the + myspell package instead of the home grown 'iaff2myaff.pl' to + convert the affix file to myspell format. + * Correct aspell build rules to make sure the {nb,nn}.dat files are + available. + * Try to optimize the ispell build rules by moving more filtering + into sed. + * Use makefile variables for most word separator filtering, to make + it easier to switch separator character in the future. + * Added several new words in the nb thesaurus extracted from the free + nb word database. being compiled on . + * Comment out altstringchar rules for iso246 in nb.aff.in and + nn.aff.in, as it confuses munchfile in ispell 3.3.02. + + * New words + - Alexandras (nb,nn). + - regional-departement (nb), regional-departementet (nb) and + regional-departementets (nb). + - fast-lege (nb,nn), fast-legen (nb,nn), fast-legens (nb). + +Release 2.0.2 (2006-01-04) + + * Corrected myspell dict file count line. + * Made it easier to replace 'echo -e' for platforms where -e is not a + valid option to echo. + * Added 'install-doc' target to install documentation files. + * Install ispell dictionaries using 'nb' and 'nn' names, and make + symlinks to these from the old names. + * Add script and make rule 'freq-update' to update the frequency + information based on data from NTA, . + + * New words: + - fremover (nb). + - Internet (nb,nn). 
+ - internettet (nb). + - s�r-emne (nb). + - vassdrags-tiltak (nb). + + * Changed words: + - Updated lots of words with freq 0 to the freq value provided from + NTA. This added 9787 words to the nb list. + - Update frequency information for all new words in 2.0.1. Set to + '1' for words not available from NTA, to make sure they are + included in the nb dictionary. + +Release 2.0.1 (2005-12-31) + + * Now being group maintained on Alioth. + * Updated package to use new email address for Rune Kleveland. + * Rewrote build rules based on Debian patches, to make it easier to + make binary packages based on this source. + * Rewrite build rules to use the language codes 'nb' and 'nn' + instead of 'norsk' and 'nynorsk'. + * Added build rules for aspell and myspell, based on the rules + in the debian package. + * Started on myspell (OOo) thesaurus files for bokm�l (nb). + * Added new script 'bokmaal', capable of looking up words on the web + service available from . + * Lower the nb frequency cutoff point from >9 to >0, to get more + words included in the spell check systems for nb. + + * New words: + - DVD (nb,nn). + - fil-rettighet (nb), fil-rettigheten (nb), fil-rettigheter (nb) + - ignoranse (nb). + - Internett (nb,nn), internett-* (nb,nn). + - internett-leverand�r (nb,nn), internett-leverand�ren (nb), + internett-leverand�rer (nb). + - kontrakts-forslag (nb), kontrakts-forslaget (nb). + - krypto (nb). + - navne-tjener (nb), navne-tjenere (nb), navne-tjeneren (nb). + - Reinholdtsen (nb,nn). + - sikkerhets-oppdatering (nb), sikkerhets-oppdateringen (nb), + sikkerhets-oppdateringene (nb). + - Skolelinux (nb), Skulelinux (nn). + - Skolelinuxprosjektet (nb), Skulelinuxprosjektet (nn). + * Changed words: + - Internett-adressene: freq ""->2, to make it visible as a nb word. 
+ - Linux: freq 0->2 + - Linux-*: freq 0->2 + - rekursiv: req 0->2 + +Release 2.0 (2000-09-12) + + [ NEWS file did not exist then ] + +Release 1.1a (1998-07-06) + + [ NEWS file did not exist then ] + +Release 1.1 (1998-06-03) + + [ NEWS file did not exist then ] + +Release 1.0 (1998-05-20) + + [ NEWS file did not exist then ] diff --git a/README b/README new file mode 100644 index 0000000..e1db9c2 --- /dev/null +++ b/README @@ -0,0 +1,569 @@ +README-file for the distribution of the Norwegian dictionaries for ISPELL. + +DESCRIPTION + +This distribution contains a big collection of Norwegian words (both +bokm�l and nynorsk) and support files to make useful things from it. + +The main file norsk.source contains 747500 words from the Norwegian +language. Each word has a commonness indicator, and it is hyphenated +at compound points. + +There is also a Makefile to assist in building dictionaries for Ispell +and other word processors, using a sensible subset of the available +words. There is also a Makefile in the patterns directory which makes +hyphenation patterns for TeX based on the dictionary and a simple set +of hyphenation patterns that works on non-compound words. + +The latest version is available at + +http://spell-norwegian.alioth.debian.org/ + +Comments, suggestions and bug-reports to i18n-no@lister.ping.uio.no. + +There is also a slashdot project with a similar goal. We should try to +join forces with them. + + + +BUILDING A NORWEGIAN ISPELL DICTIONARY + +* Get the ispell sources and unpack it. + + cd /source + tar -zxvf ispell-3.1.20.tar.gz + + You can also unpack the sources for the Norwegian dictionary now: + + cd ispell-3.1/languages + tar -zxvf ispell-norsk-2.0.tar.gz + +* Patch Ispell + + I have made a patch for ispell based mainly on other patches found + on the net. If you think you have found a bug in ispell, please + make sure that it has nothing to do with this patch before + reporting it to the ispell manager! 
+ + The following things are done: + + 1. An attempt is made to fix the backslash bug. The patch for this + was found at Ken Stevens ispell.el site. + + 2. Ispell can now parse html files thanks to a patch by Gerry + Tierney. Basically this means that a patched copy of ispell will + ignore any mark-up tags or html entities in a html document when + spell checking that document. Any text inside an 'alt' attribute + will however be checked. + + Examples: ispell index.html # html tags will be ignored + ispell -h README # html tags will be ignored + ispell -n index.html # html tags will be spell-checked + + I have not been able to make the html mode work well when using + ispell from emacs. That doesn't matter too much, since ispell.el + has its own skipping mechanism. + + 3. Buildhash now accepts all characters between A and z as flags, + not only the alphanumeric ones when MASKBITS=64. This is needed + by the Norwegian affix file. + + 4. The AMS and breqn math environments are now skipped by ispell. + + 5. Ispell gets the ability to suggest "- as a separation character + in addition to - and space. This only happens if such support is + compiled in, e.g. the COMPOUNDBABEL flag must be defined, and it + only happens in TeX mode and if the language is norsk. It is + useful to mark compound points in words to ensure good + hyphenation when using LaTeX with Babel. The Norwegian + hyphenation patterns distributed in this package hyphenate almost + every word in the Ispell dictionary correctly, but no guaranty is + offered for other compound words. + + 6. Added an -r switch, which is almost like the -a switch, but the + suggestions are printed even if the word is found in the + dictionary. This is useful for hyphenating words and for + eliminating rare words close to very common words. There has to + be some german out there wanting to make TeX hyphenate only + compound words. + + 7. Added a patch from the Redhat rpm to avoid compilation error in + ijoin.c. 
+ + So if you are feeling a little brave: + + cd ispell-3.1 + patch < languages/norsk/ispell-3.1.20.no.patch + + Additional patches might be needed on various systems. The Redhat + source RPM is a good place to look if something fails. + +* CONFIGURE ISPELL The file Config.X in the ispell-3.1 distribution + contains configuration information for ispell (no ./configure yet). + The definitions are overridden by those in the file local.h, for + which there is a local.h.samp. The following local.h works for me + on my Redhat-6.0 system. You have to adapt the file to those + languages you have dictionaries for. + +----------------------------------------------------------------------- +#define MINIMENU /* Display a mini-menu at the bottom of the screen */ +#define USG /* Define this on System V */ + +#define BINDIR "/usr/bin" +#define LIBDIR "/usr/lib" +#define MAN1DIR "/usr/man/man1" +#define MAN4DIR "/usr/man/man4" + +#define LANGUAGES "{american,MASTERDICTS=american.med+,HASHFILES=americanmed+.ha +sh,EXTRADICT=/usr/dict/words} {norsk}" +#define MASKBITS 64 +#define LOOK "look" +#define CFLAGS "-O3" /* Mostly to speed up my batch operations */ +#define LDFLAGS "-s" +#define COMPOUNDBABEL +----------------------------------------------------------------------- + + It might be wise to try to build ispell only for English, to test that + everything works, and add new languages afterwards. + + cd ispell-3.1 + make all + + This takes some time, but almost nothing compared to building the + Norwegian dictionary. + +* ADD LANGUAGES + + Get dictionaries for the languages you want to install from the + ispell home page. Unpack them in the appropriate directories. + Update the LANGUAGES variable in local.h and remake. + + Make sure that there is enough free space to build the dictionary. + If there isn't, the build process will fail miserably. About 120 MB is + needed! + + The Norwegian dictionary can be configured. 
You can choose which + categories of words to include, and how common a word has to be to + be included. This is documented in the Makefile in languages/norsk. + This flexibility has its price; it takes a very long time and a lot + of disk space to build the dictionary, up to 120 MB. + + You can also customize the affix file to remove or add some forms of + words. For example you could choose to allow or disallow the + spelling `komitéen'. To do this you can make the file norsk.aff, + edit it according to your needs, and make norsk.hash afterwards. + Look for the word `valgfritt' in the file. Bear in mind that + norsk.aff is dependent on norsk.aff.in, so if you touch that + file your version will be overwritten. It will not work as expected + to change norsk.aff.in. + +* INSTALL + + Before you install, you might want to test if ispell works. + + cd languages/norsk + echo vurderingskriterier | ../../ispell -a -d norsk.hash + + should find vurderingskriterium. Then + + make install + + +USING THE DICTIONARY + +CHARACTER SETS + +By default ispell assumes you use latin-1 encoding in your Norwegian +files. To spell-check such a file you just say + +ispell -d norsk mythesis.tex + +In TeX you can use `{\aa}', `{\ae}', `{\o}', `\'e', `\'o' and `\^o' to +represent the special Norwegian characters. If you do this, you have +to say + +ispell -T plaintex -d norsk mythesis.tex + +to spell-check a file. The characters ������ will not be recognized +then, so unfortunately you have to choose one standard. If you use +`\aa{}' etc. instead, you should change the affix file or add a +similar entry in the affix file. + +In a plain ASCII file `æ ø å' are sometimes represented as `ae oe aa'. +Use + +ispell -T ascii -d norsk mythesis.tex + +to spell-check such a file. + +The iso246 encoding puts æøå after z in the collating sequence. +If you use this encoding, say + +ispell -T iso246 -d norsk mythesis.tex + +Does anybody use this?? 
+ + +COMPOUND WORDS + +The use of compound words is what makes it both fun and difficult to +produce a good and secure ispell dictionary and to make hyphenation +patterns for TeX. + +Ispell has two very important switches, -B and -C, controlling whether +ispell accepts words formed by a root and another word as correct. If +the -C flag is given, ispell will accept words such as +`avdelingsbestyrerstilling', which is right, but also words such as +`premierene' (premie-rene), which is wrong. It is *not recommended* +to use the -C option with the Norwegian dictionary, since far too many +incorrect spellings will be accepted. + +If you don't give the -B or -C flag, ispell will accept compound words +formed by a small subset of the words in the dictionary. The subset +depends on the configuration variables in the Makefile. This is called +controlled compoundwords mode. It is even safer to give the -B +option, so that only words in the dictionary are regarded as +correct. I would do that if I had written something important. + +The hyphenation patterns for TeX are only tested on words in the +dictionary, so these patterns might fail on compound words accepted in +controlled compoundwords mode. If you want to be absolutely certain +that there will be no bad hyphens in your document, you have to use +the -B switch. See `The hyphenation problem' below. + + +FIGHTING `ORD DELINGS SYNDROMET' + +Most spell checkers, including ispell, suggest splitting compound words +they don't find in their dictionary. If people follow these suggestions +blindly, the result is a disaster: they get spelling errors in the +actual document and, even worse, they think they have learned the +correct spelling! (arkitekt tegnet hus i Holmenkoll åsen...) + +I have done two things to fight this. Ispell suggests `"-' in +addition to `-' and ` ' for compound words, which tells TeX that here +is a compound point and makes the spell-check skip the word next time. + +The second thing is more important. 
The script inorsk-maybecompound +searches a document (or standard input) for two and three words +following each other that can be written in one word, hyphenates them +using TeX and prints the compound words to standard output. By +hyphenating one avoids words like sommer (som mer), forlenge (for +lenge) etc. Use it! + + +EMACS + +The version of `ispell.el' distributed with emacs-19.34 does not +support Norwegian. I suggest you get the latest ispell.el from +ftp://kdstevens.com/pub/stevens/ispell.el.gz. Good versions are also +found in emacs-20.[4567]. + +So make sure that your version of ispell.el uses the variable +ispell-local-dictionary-alist, and put a suitable subset of the +following in your .emacs file: + +(setq + ispell-local-dictionary-alist + '(("norsk" ; 8 bit Norwegian mode + "[A-Za-z\305\306\307\310\311\322\323\324\330\345\346\347\350\351\362\363\364\370]" + "[^A-Za-z\305\306\307\310\311\322\323\324\330\345\346\347\350\351\362\363\364\370]" + "[\".,;:]" t ("-B" "-S" "-d" "norsk") "~list" iso-8859-1) + ("norsk7-tex" ; 7 bit Norwegian TeX mode + "[A-Za-z{}\\'^`@]" "[^A-Za-z{}\\'^`@]" + "[\".,;:]" t ("-B" "-S" "-d" "norsk" "-T" "plaintex") "~plaintex" nil) + ("norsk7-html" ; 7 bit Norwegian html mode + "[A-Za-z\&;]" "[^A-Za-z\&;]" ; Don't use ispell's html-parser + "[.,:]" t ("-B" "-S" "-n" "-d" "norsk") "~html" iso-8859-1) + ("norsk7-ascii" ; 7 bit Norwegian (aa, ae, oe) + "[A-Za-z]" "[^A-Za-z]" + "[\".,;:]" t ("-B" "-S" "-d" "norsk") "~ascii" iso-8859-1) + ("norsk7-iso246" "[][A-Za-z{}|\\]" "[^][A-Za-z{}|\\]" + "[\".,;:]" nil ("-B" "-S" "-d" "norsk") "~iso246" iso-8859-1) + ("norsk-comp" ; 8 bit Norwegian mode + "[A-Za-z\305\306\307\310\311\322\323\324\330\345\346\347\350\351\362\363\364\370]" + "[^A-Za-z\305\306\307\310\311\322\323\324\330\345\346\347\350\351\362\363\364\370]" + "[\".,;:]" t ("-S" "-d" "norsk") "~list" iso-8859-1) + ("norsk7-tex-comp" ; 7 bit Norwegian TeX mode + "[A-Za-z{}\\'^`@]" "[^A-Za-z{}\\'^`@]" + "[\".,;:]" t ("-S" "-d" 
"norsk" "-T" "plaintex") "~plaintex" nil) + ("norsk7-html-comp" ; 7 bit Norwegian html mode + "[A-Za-z\&;]" "[^A-Za-z\&;]" ; Don't use ispell's html-parser + "[.,:]" t ("-S" "-n" "-d" "norsk") "~html" iso-8859-1) + ("norsk7-ascii-comp" ; 7 bit Norwegian (aa, ae, oe) + "[A-Za-z]" "[^A-Za-z]" + "[\".,;:]" t ("-S" "-d" "norsk") "~ascii" iso-8859-1) + ("norsk7-iso246" "[][A-Za-z{}|\\]" "[^][A-Za-z{}|\\]" + "[\".,;:]" nil ("-B" "-S" "-d" "norsk") "~iso246" iso-8859-1) +("nynorsk" ; 8 bit Norwegian mode + "[A-Za-z\305\306\307\310\311\322\323\324\330\345\346\347\350\351\362\363\364\370]" + "[^A-Za-z\305\306\307\310\311\322\323\324\330\345\346\347\350\351\362\363\364\370]" + "[\".,;:]" t ("-B" "-S" "-d" "nynorsk") "~list" iso-8859-1) + ("nynorsk7-tex" ; 7 bit Norwegian TeX mode + "[A-Za-z{}\\'^`@]" "[^A-Za-z{}\\'^`@]" + "[\".,;:]" t ("-B" "-S" "-d" "nynorsk" "-T" "plaintex") "~plaintex" nil) + ("nynorsk7-html" ; 7 bit Norwegian html mode + "[A-Za-z\&;]" "[^A-Za-z\&;]" ; Don't use ispell's html-parser + "[.,:]" t ("-B" "-S" "-n" "-d" "nynorsk") "~html" iso-8859-1) + ("nynorsk7-ascii" ; 7 bit Norwegian (aa, ae, oe) + "[A-Za-z]" "[^A-Za-z]" + "[\".,;:]" t ("-B" "-S" "-d" "nynorsk") "~ascii" iso-8859-1) + ("nynorsk7-iso246" "[][A-Za-z{}|\\]" "[^][A-Za-z{}|\\]" + "[\".,;:]" nil ("-B" "-S" "-d" "nynorsk") "~iso246" iso-8859-1) + ("nynorsk-comp" ; 8 bit Norwegian mode + "[A-Za-z\305\306\307\310\311\322\323\324\330\345\346\347\350\351\362\363\364\370]" + "[^A-Za-z\305\306\307\310\311\322\323\324\330\345\346\347\350\351\362\363\364\370]" + "[\".,;:]" t ("-S" "-d" "nynorsk") "~list" iso-8859-1) + ("nynorsk7-tex-comp" ; 7 bit Norwegian TeX mode + "[A-Za-z{}\\'^`@]" "[^A-Za-z{}\\'^`@]" + "[\".,;:]" t ("-S" "-d" "nynorsk" "-T" "plaintex") "~plaintex" nil) + ("nynorsk7-html-comp" ; 7 bit Norwegian html mode + "[A-Za-z\&;]" "[^A-Za-z\&;]" ; Don't use ispell's html-parser + "[.,:]" t ("-S" "-n" "-d" "nynorsk") "~html" iso-8859-1) + ("nynorsk7-ascii-comp" ; 7 bit Norwegian (aa, ae, oe) + 
"[A-Za-z]" "[^A-Za-z]" + "[\".,;:]" t ("-S" "-d" "nynorsk") "~ascii" iso-8859-1) + ("nynorsk7-iso246" "[][A-Za-z{}|\\]" "[^][A-Za-z{}|\\]" + "[\".,;:]" nil ("-B" "-S" "-d" "nynorsk") "~iso246" iso-8859-1) + )) + +(load-library "ispell") + +The above is very unpretty indeed. It is basically four copies of the +same list. If you come up with something better, please let me know. +I am a terrible lisp programmer! + +As you see there are a lot of entries. The -comp entries put ispell +in controlled compoundwords mode. Nice to do for a quick spell-check. +I recommend deleting the entries you don't plan to use. I like +to use the -S switch, i.e. not to sort the suggestions made by ispell. +Then it is more likely that the correct suggestion will be early in +the list. + +In the future I hope that ispell will be able to sort the suggestions +it makes by commonness, at least for the most common words. That +should not be too difficult to implement. Just load the most common +words and their frequency indicator into memory, and do the necessary +lookups. Or use the external look program. Suggestions and +implementations are most welcome! + +There is also a file flyspell.el around. This also offers +spell-checking on the fly, and the interface is more like m$-word. +Flyspell-mode highlights incorrect words, and you can even click on +them to get suggestions for correct spelling. Being able to sort on +commonness would make flyspell's auto-correction mode much more +useful! + + +USING ISPELL IN BATCH MODE + +I find ispell's batch mode very useful. The command + +cat myfile.tex | ispell -l -d norsk | sort | uniq -c | sort -n -r -s + +prints all words in myfile.tex that are not in the Norwegian +dictionary, where the most common words come first. Nice to spot +errors, or as a starting point for a local dictionary. + + +HYPHENATION IN TEX + +Two sets of hyphenation patterns for the Norwegian language are +provided. 
The file nohyphb.tex hyphenates almost as TeX used to, and +the file nohyphbc.tex only splits compound words. + +It is fairly easy to install the nohyphb.tex file. Just put it where +TeX can find it, edit the file language.dat to point to the correct +file, and remake the formats. If you use teTeX you just say texconfig +init. + +If you want to install both sets of patterns, you have a TeX capacity +problem. The variable ssup_trie_size needs to be bigger than 65535 +and trie_op_size bigger than 1501. I use 262143 and 3501. So you +need to change tex.ch (and omega.ch) and recompile TeX. If you are +using teTeX that should be quite easy. Here is a patch: + +*** tex.ch~ Fri Jan 21 23:13:24 2000 +--- tex.ch Mon Jul 10 18:46:15 2000 +*************** +*** 196 **** +! @d ssup_trie_size == 65535 +--- 196 ---- +! @d ssup_trie_size == 262143 +*************** +*** 215 **** +! @!trie_op_size=1501; {space for ``opcodes'' in the hyphenation patterns; +--- 215 ---- +! @!trie_op_size=3501; {space for ``opcodes'' in the hyphenation patterns; +*************** +*** 217 **** +! @!neg_trie_op_size=-1501; {for lower |trie_op_hash| array bound; +--- 217 ---- +! @!neg_trie_op_size=-3501; {for lower |trie_op_hash| array bound; + +*** omega.ch~ Thu Jul 13 11:37:08 2000 +--- omega.ch Sun Jul 23 20:38:03 2000 +*************** +*** 125,127 **** + @d ssup_trie_opcode == 65535 +! @d ssup_trie_size == 100000 + +--- 125,127 ---- + @d ssup_trie_opcode == 65535 +! @d ssup_trie_size == 262143 + +*************** +*** 139,143 **** + {Use |hash_offset=0| for compilers which cannot decrement pointers.} +! @!trie_op_size=1501; {space for ``opcodes'' in the hyphenation patterns; + best if relatively prime to 313, 361, and 1009.} +! @!neg_trie_op_size=-1501; {for lower |trie_op_hash| array bound; + must be equal to |-trie_op_size|.} +--- 139,143 ---- + {Use |hash_offset=0| for compilers which cannot decrement pointers.} +! 
@!trie_op_size=3501; {space for ``opcodes'' in the hyphenation patterns; + best if relatively prime to 313, 361, and 1009.} +! @!neg_trie_op_size=-3501; {for lower |trie_op_hash| array bound; + must be equal to |-trie_op_size|.} + + +The easiest way to use the norskbc patterns is to define the macros + +\def\goodhyphens{\lefthyphenmin2\righthyphenmin2\language=\l@norskc} +\def\allhyphens{\lefthyphenmin1\righthyphenmin2\language=\l@norsk} + +and change whenever you want to. A better solution might be to define +norskc as another language in the Babel system and use the Babel +language switching system. + + +MAKING IT PERFECT + +So you have installed these great new patterns. But TeX still might +fail on Norwegian words not in the dictionary, so if you don't feel +particularly lucky you will have to do something about that too. + +There are two strategies. I tend to prefer the second one. + +1. Mark the compound point in the compound word with "-, e.g. + administrasjons"-sjef"-stillings"-søker. If you have patched + ispell, you can do this during spell-checking most of the time. + +2. Use the script inorsk-hyphenmaybe to print every word in your + document not in the dictionary (nynorsk and bokmål) hyphenated by + TeX. Then you can easily browse through this list and put the + badly hyphenated words in a \hyphenation command. The next time + you run the script it should produce correct hyphenation. + + For example if inorsk-hyphenmaybe outputs `kon-flik-t-akse' and + `kon-flik-t-ak-sen' you have to say \hyphenation{kon-flikt-akse + kon-flikt-ak-sen} in your TeX document. + +But we are not done with hyphenation yet. Have you ever considered +the problem of hyphenating the word `villede' in TeX? Of course you +have. The hyphenation should be `vill-lede', thus an extra `l' should +be added. + +Most languages which have such hyphenation (in particular German, with +ss) support this in Babel. The convention is that you code villede as +vi"llede. 
Of course the Norwegian dictionary supports this. Babel-3.7 +will also support this for Norwegian. Till then you can use the file +norsk.cfg to get this functionality (and some special hyphen points in +addition). The file itself offers more information. + + +THE FUTURE OF HYPHENATION IN TEX + +In standard TeX today it is not possible to say that one hyphen point +is better than another, e.g. I like barnehage-assistent better than +barne-hageassistent. In the future TeX will be able to handle +multiple classes of hyphens and different penalties can be assigned to +each class. Mathias Clasen has implemented this as a change file, +but it has not made it into the standard distributions yet. The stuff +at the end of the patterns/Makefile is about generating hyphenation +patterns for such a TeX. + + +LET'S MAKE THE DICTIONARY EVEN BETTER! + +In the future I would like to add more word categories to the +dictionary. If you have a lot of text from within one field of +knowledge, and would like to help, you can start by saying + +cat allmytextfiles | inorsk-hyphenmaybe -e -p norskbc > mywords + +You should install the hyphenation patterns norskbc for Norwegian to +get hyphenation only at compound points, and of course the full +dictionary with no words filtered out. + +You will probably spot some new words, some of your own spelling +errors and some hyphenation errors. Fix that file, add flags defined +in the affix file etc. + +Next you have to learn to use the munchlist program. Suppose you have +the words in the file mywords + +gjennom-strømnings-mekanisme +gjennom-strømnings-mekanismen +gjennom-strømnings-mekanismens +gjennom-strømnings-mekanismer +gjennom-strømnings-mekanismene + +cat mywords \ + | tr '-' '�' \ + | munchlist -v -l norsk.aff.munch \ + | tr '�' '-' + +the output should be + +gjennom-strømnings-mekanisme/AEG + +which represents these five words. (Of course this only works if +ispell and munchlist are correctly installed.) 
+ +Here is some elisp stuff I have used (provided as is, probably very badly coded): + +(defun ispell-expand-affixes () (interactive) + (shell-command-on-region (mark) (point) "sed -e \"s/[-0-9 :]//g\" | ispell -e -d norsk")) + +(defun ispell-collect-affixes () (interactive) + (shell-command (concat + "echo \"" (buffer-substring-no-properties (mark) (point)) + "\" | sed -e \"s/-/�/g\" -e \"s/[0-9 :]//g\" | " + "munchlist -l norsk.aff.munch | sed -e \"s/�/-/g\" &"))) + +(defun ispell-expand-line () (interactive) + (save-excursion + (beginning-of-line) + (let ((beg (point))) + (end-of-line) + (let ((end (point)))) + (shell-command-on-region beg (point) "sed -e \"s/[-0-9 :]//g\" | ispell -d norsk -e")))) + +; We have to quote the `' characters to protect them from shell +; expansion. + +(defun current-line () + (save-excursion + (beginning-of-line) + (let ((beg (point))) + (end-of-line) + (let ((end (point))) + (setq myvar (buffer-substring-no-properties beg end)) + (while (string-match " .*" myvar) + (setq myvar (replace-match "" nil nil myvar))) + (while (string-match "\\([^\\]\\)\\([`'\"]\\|\\\\$\\)" myvar) + (setq myvar (replace-match "\\1\\\\\\2" nil nil myvar))) + (while (string-match "[0-9 \t:.*]" myvar) + (setq myvar (replace-match "" nil nil myvar))) + myvar)))) + +(defun current-region () + (setq myvar (buffer-substring-no-properties (mark) (point))) + (while (string-match "\\([^\\]\\)\\([`'\"]\\|\\\\$\\)" myvar) + (setq myvar (replace-match "\\1\\\\\\2" nil nil myvar))) + (while (string-match "[0-9 \t]" myvar) + (setq myvar (replace-match "" nil nil myvar))) + myvar) + + diff --git a/hyph.txt b/hyph.txt new file mode 100644 index 0000000..bbca905 --- /dev/null +++ b/hyph.txt @@ -0,0 +1,2 @@ +nb,NO,hyph_nb_NO,Norwegian Bokm�l (Norway),hyph_nb_NO.zip +nn,NO,hyph_nn_NO,Norwegian Nynorsk (Norway),hyph_nn_NO.zip diff --git a/hyph_nb_NO.zip b/hyph_nb_NO.zip new file mode 100644 index 0000000..5861597 Binary files /dev/null and b/hyph_nb_NO.zip differ diff --git 
a/hyph_nn_NO.zip b/hyph_nn_NO.zip new file mode 100644 index 0000000..a98f13e Binary files /dev/null and b/hyph_nn_NO.zip differ diff --git a/nb_NO.zip b/nb_NO.zip new file mode 100644 index 0000000..2816845 Binary files /dev/null and b/nb_NO.zip differ diff --git a/nn_NO.zip b/nn_NO.zip new file mode 100644 index 0000000..5ca177d Binary files /dev/null and b/nn_NO.zip differ diff --git a/spell.txt b/spell.txt new file mode 100644 index 0000000..4bad99c --- /dev/null +++ b/spell.txt @@ -0,0 +1,2 @@ +nb,NO,nb_NO,Norwegian Bokm�l (Norway),nb_NO.zip +nn,NO,nn_NO,Norwegian Nynorsk (Norway),nn_NO.zip diff --git a/th_nb_NO_v2.zip b/th_nb_NO_v2.zip new file mode 100644 index 0000000..678f4fb Binary files /dev/null and b/th_nb_NO_v2.zip differ diff --git a/th_nn_NO_v2.zip b/th_nn_NO_v2.zip new file mode 100644 index 0000000..3bb19fa Binary files /dev/null and b/th_nn_NO_v2.zip differ diff --git a/thes2.txt b/thes2.txt new file mode 100644 index 0000000..21994f5 --- /dev/null +++ b/thes2.txt @@ -0,0 +1,2 @@ +nb,NO,th_nb_NO_v2,Norwegian Bokm�l (Norway),th_nb_NO_v2.zip +nn,NO,th_nn_NO_v2,Norwegian Nynorsk (Norway),th_nn_NO_v2.zip