Blob Blame History Raw
/* The contents of this file are subject to the Mozilla Public License Version 
 * 1.1 (the "License"); you may not use this file except in compliance with 
 * the License. You may obtain a copy of the License at 
 * http://www.mozilla.org/MPL/
 * 
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 * 
 * The Original Code is Libvoikko: Library of natural language processing tools.
 * The Initial Developer of the Original Code is Harri Pitkänen <hatapitk@iki.fi>.
 * Portions created by the Initial Developer are Copyright (C) 2006 - 2010
 * the Initial Developer. All Rights Reserved.
 * 
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *********************************************************************************/

#include "../voikko.h"
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <cwchar>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <cassert>

#ifdef HAVE_CONFIG_H
  #include "config.h"
#endif
#ifdef HAVE_PTHREAD
  #include <pthread.h>
#endif

using namespace std;


/**
 * Output mode used by check_word() when one-line output is not in use:
 * NORMAL prints "C: <word>" or "W: <word>", ONLY_C_W prints just "C" or "W",
 * ONLY_INCORRECT prints only the rejected words, and NONE skips the spelling
 * check altogether and prints no verdict.
 */
enum CheckMode {NORMAL, ONLY_C_W, ONLY_INCORRECT, NONE};

/* Basis for the size of the line buffer used when reading words from stdin. */
static const int MAX_WORD_LENGTH = 5000;
/* Largest thread count accepted by the -j option. */
static const int MAX_THREADS = 200;
/* Number of words collected into a block before it is queued for a worker thread. */
static const size_t WORDS_PER_BLOCK = 500;

/* Current output mode; changed by the -t, -tt and -M options. */
static CheckMode checkMode = NORMAL;
/* Set by -s: print "S: " suggestion lines for rejected words. */
static bool suggest = false;
/* Set by -m and -M: print morphological analysis for accepted words. */
static bool morphology = false;
/* Set by -x: print each word followed by its suggestions on a single line. */
static bool oneLineOutput = false;
/* Separator written before each suggestion in one-line output. */
static char wordSeparator = ' ';
static bool space = false;  /* Set to true if you want to output suggestions that have spaces in them. */
/* Number of speller instances; set by -j. With the value 1 words are checked directly in the main thread. */
static int threadCount = 1;

/**
 * One spell checking slot. There is one slot per thread (threadCount in total).
 */
struct speller_t {
	/* Voikko instance used for all checks made through this slot. */
	VoikkoHandle * handle;
	/* Block of words currently assigned to this slot; processBlock() deletes it when done. */
	vector<wstring> * words;
};

/* Array of threadCount slots, allocated in main(). */
static speller_t * spellers;

/**
 * Appends the morphological analyses of a word to the output buffer.
 * Every attribute of every analysis produces one line of the form
 * "A(<word>):<analysis number>:<key>=<value>", analyses being numbered from 1.
 *
 * @param handle Voikko instance used for the analysis
 * @param word the word to analyze
 * @param out stream that receives the output lines
 */
static void printMorphology(VoikkoHandle * handle, const wchar_t * word, wstringstream & out) {
	voikko_mor_analysis ** const analyses = voikkoAnalyzeWordUcs4(handle, word);
	for (size_t index = 0; analyses[index] != 0; ++index) {
		voikko_mor_analysis * const current = analyses[index];
		const char ** attribute = voikko_mor_analysis_keys(current);
		while (*attribute != 0) {
			out << L"A(" << word << L"):" << (index + 1) << L":";
			out << *attribute << L"=" << voikko_mor_analysis_value_ucs4(current, *attribute);
			out << endl;
			++attribute;
		}
	}
	// The analysis list is owned by this function and must be released through the library.
	voikko_free_mor_analysis(analyses);
}

/**
 * Checks a single word and appends the report to the output buffer.
 *
 * The spelling check is skipped (and the word treated as accepted) when
 * checkMode is NONE. Library errors are reported on stderr and nothing is
 * written to the buffer. Otherwise the output depends on the global flags:
 * one-line output prints the word followed by its suggestions, the other
 * modes print a verdict according to checkMode, optionally followed by
 * morphological analysis (accepted words) or "S: " lines (rejected words).
 *
 * @param handle Voikko instance used for the check
 * @param word the word to check
 * @param out stream that receives the report
 */
static void check_word(VoikkoHandle * handle, const wchar_t * word, wstringstream & out) {
	int result = VOIKKO_SPELL_OK;
	if (checkMode != NONE) {
		result = voikkoSpellUcs4(handle, word);
	}
	if (result == VOIKKO_CHARSET_CONVERSION_FAILED) {
		cerr << "E: charset conversion failed" << endl;
		return;
	}
	if (result == VOIKKO_INTERNAL_ERROR) {
		cerr << "E: internal error" << endl;
		return;
	}
	const bool accepted = (result != 0);
	
	if (oneLineOutput) {
		out << word;
		if (!accepted) {
			wchar_t ** const candidates = voikkoSuggestUcs4(handle, word);
			if (candidates != 0) {
				for (wchar_t ** candidate = candidates; *candidate != 0; ++candidate) {
					// Suggestions containing a space are left out unless explicitly allowed.
					if (!space && wcschr(*candidate, L' ') != 0) {
						continue;
					}
					out << wordSeparator << *candidate;
				}
				voikko_free_suggest_ucs4(candidates);
			}
		}
		out << endl;
	}
	else if (checkMode == NORMAL) {
		out << (accepted ? L"C: " : L"W: ") << word << endl;
	}
	else if (checkMode == ONLY_C_W) {
		out << (accepted ? L"C" : L"W") << endl;
	}
	else if (checkMode == ONLY_INCORRECT && !accepted) {
		out << word << endl;
	}
	
	if (accepted && morphology) {
		printMorphology(handle, word, out);
	}
	if (!accepted && suggest && !oneLineOutput) {
		wchar_t ** const candidates = voikkoSuggestUcs4(handle, word);
		if (candidates != 0) {
			for (wchar_t ** candidate = candidates; *candidate != 0; ++candidate) {
				out << L"S: " << *candidate << endl;
			}
			voikko_free_suggest_ucs4(candidates);
		}
	}
}

#ifdef HAVE_PTHREAD

static void * processBlock(void * args) {
	speller_t * speller = static_cast<speller_t *>(args);
	wstringstream out;
	vector<wstring>::const_iterator it = speller->words->begin();
	while (it != speller->words->end()) {
		check_word(speller->handle, it->c_str(), out);
		++it;
	}
	delete speller->words;
	return new wstring(out.str());
}

/* Thread identifiers, one per slot; allocated in initThreads(). */
pthread_t * threads;
/* Index of the slot used for the next pthread_create() / pthread_join() call. */
int nextThread;
/* Number of threads that have been started but not yet joined. */
int nThreadsInUse;
/* Block currently being filled with words read from the input. */
vector<wstring> * nextBlock;

/**
 * Replaces nextBlock with a new empty block that has room for a full
 * block of words. The previous block is not deleted here.
 */
static void initNextBlock() {
	vector<wstring> * const fresh = new vector<wstring>();
	fresh->reserve(WORDS_PER_BLOCK);
	nextBlock = fresh;
}

/**
 * Waits for the thread in slot nextThread to finish, writes its output to
 * wcout and releases the output string. Terminates the program with exit
 * status 1 if the thread cannot be joined.
 */
static void cleanupThread() {
	void * threadOutput = 0;
	const int joinStatus = pthread_join(threads[nextThread], &threadOutput);
	if (joinStatus != 0) {
		cerr << "E: pthread_join failed" << endl;
		exit(1);
	}
	--nThreadsInUse;
	// processBlock() returns a heap allocated wstring that is owned by the joiner.
	wstring * const text = static_cast<wstring *>(threadOutput);
	wcout << *text;
	delete text;
}

static void queueNextBlock() {
	if (nextBlock->empty()) {
		return;
	}
	if (nThreadsInUse == threadCount) {
		cleanupThread();
	}
	spellers[nextThread].words = nextBlock;
	if (pthread_create(threads + nextThread, 0, &processBlock, spellers + nextThread)) {
		cerr << "E: Failed to create thread" << endl;
		exit(1);
	}
	assert(threadCount >= 1);
	nextThread = (nextThread + 1) % threadCount;
	++nThreadsInUse;
	initNextBlock();
}

/**
 * Adds a word to the block being collected and queues the block for
 * processing once it contains WORDS_PER_BLOCK words.
 *
 * @param word the word to add; it is copied
 */
static void handleWordMultiThread(const wchar_t * word) {
	const wstring entry(word);
	nextBlock->push_back(entry);
	const bool blockIsFull = (nextBlock->size() == WORDS_PER_BLOCK);
	if (blockIsFull) {
		queueNextBlock();
	}
}

#endif

/**
 * Flushes the remaining work at end of input: queues the partially filled
 * block (if any) and then joins all running threads in the order they were
 * started, so that their output is printed in input order. Does nothing
 * when thread support is not compiled in.
 */
static void finishProcessing() {
	#ifdef HAVE_PTHREAD
		queueNextBlock();
		// After queueing, nextThread points one past the most recently started
		// slot. Rewind it to the oldest thread still running. When all slots are
		// busy this leaves nextThread unchanged, but when fewer blocks than
		// threads were queued it moves back to slot 0 instead of joining a
		// slot for which no thread was ever created.
		nextThread = (nextThread + threadCount - nThreadsInUse) % threadCount;
		while (nThreadsInUse > 0) {
			cleanupThread();
			nextThread = (nextThread + 1) % threadCount;
		}
	#endif
}

/**
 * Prepares the bookkeeping needed for threaded processing: allocates one
 * thread identifier per slot, resets the slot counters and creates the first
 * empty block. Does nothing when thread support is not compiled in.
 */
static void initThreads() {
	#ifdef HAVE_PTHREAD
		nThreadsInUse = 0;
		nextThread = 0;
		threads = new pthread_t[threadCount];
		initNextBlock();
	#endif
}

static void handleWordSingleThread(const wchar_t * word) {
	wstringstream out;
	check_word(spellers[0].handle, word, out);
	wcout << out.str();
	fflush(0);
}

/**
 * Dispatches a word to block based threaded processing when thread support
 * is compiled in and more than one thread was requested, and to immediate
 * processing otherwise.
 *
 * @param word the word to check
 */
static void handleWord(const wchar_t * word) {
	#ifdef HAVE_PTHREAD
		if (threadCount != 1) {
			handleWordMultiThread(word);
			return;
		}
	#endif
	handleWordSingleThread(word);
}

static void setBooleanOption(int option, int value) {
	for (int i = 0; i < threadCount; i++) {
		voikkoSetBooleanOption(spellers[i].handle, option, value);
	}
}

/**
 * Print a list of available dictionaries to stdout.
 * @return status code to be returned when the program exits.
 */
static int list_dicts(const char * path) {
	voikko_dict ** dicts = voikko_list_dicts(path);
	if (!dicts) {
		cerr << "E: Failed to list available dictionaries." << endl;
		return 1;
	}
	for (voikko_dict ** i = dicts; *i; i++) {
		cout << voikko_dict_language(*i);
		string script(voikko_dict_script(*i));
		if (script != "") {
			cout << "-" << script;
		}
		cout << "-x-";
		cout << voikko_dict_variant(*i);
		cout << ": ";
		cout << voikko_dict_description(*i);
		cout << endl;
	}
	voikko_free_dicts(dicts);
	return 0;
}

/**
 * Writes one "<capability>:<language code>" line to stdout for each entry
 * of a null terminated list and then releases the list.
 *
 * @param capability name printed in front of every language code
 * @param languageCodes null terminated list; freed with voikkoFreeCstrArray
 */
static void printAndDeleteCapabilityList(const char * capability, char ** languageCodes) {
	for (size_t n = 0; languageCodes[n] != 0; ++n) {
		cout << capability << ":" << languageCodes[n] << endl;
	}
	voikkoFreeCstrArray(languageCodes);
}

/**
 * Writes the languages supported for spell checking, hyphenation and grammar
 * checking to stdout, in that order. Stops at the first list that cannot be
 * obtained; lists printed before that point stay printed.
 * @param path dictionary search path passed on to the library
 * @return status code to be returned when the program exits.
 */
static int list_capabilities(const char * path) {
	char ** const spellingLanguages = voikkoListSupportedSpellingLanguages(path);
	if (spellingLanguages == 0) {
		cerr << "E: Failed to list dictionaries with spell checking capability" << endl;
		return 1;
	}
	printAndDeleteCapabilityList("spell", spellingLanguages);
	
	char ** const hyphenationLanguages = voikkoListSupportedHyphenationLanguages(path);
	if (hyphenationLanguages == 0) {
		cerr << "E: Failed to list dictionaries with hyphenation capability" << endl;
		return 1;
	}
	printAndDeleteCapabilityList("hyphen", hyphenationLanguages);
	
	char ** const grammarLanguages = voikkoListSupportedGrammarCheckingLanguages(path);
	if (grammarLanguages == 0) {
		cerr << "E: Failed to list dictionaries with grammar checking capability" << endl;
		return 1;
	}
	printAndDeleteCapabilityList("grammar", grammarLanguages);
	
	return 0;
}


/**
 * Prints a short usage summary to stdout. Only the most common options are
 * listed; the text refers to the manual page for the rest.
 */
static void printHelp() {
	cout << "Usage: voikkospell [OPTION]..." << endl;
	cout << "Check spelling of words read from stdin." << endl;
	cout << endl;
	cout << "    -s   Print suggestions for misspelled words" << endl;
	cout << "    -m   Print morphological analysis for recognized words" << endl;
	cout << endl;
	cout << "For complete descriptions of available options see 'man voikkospell'" << endl;
}

/**
 * Prints version information to stdout: the version of this program when
 * PACKAGE_VERSION is defined at build time, followed by the version
 * reported by the libvoikko library.
 */
static void printVersion() {
	#ifdef PACKAGE_VERSION
		cout << "voikkospell version ";
		cout << PACKAGE_VERSION;
		cout << endl;
	#endif
	cout << "libvoikko version ";
	cout << voikkoGetVersion();
	cout << endl;
}

/**
 * Program entry point. Parses the command line in two passes, creates one
 * Voikko instance per thread, then reads words from stdin (one per line)
 * and reports the result for each of them.
 *
 * @return 0 on normal completion, 1 on invalid options or when listing or
 *         initialization fails
 */
int main(int argc, char ** argv) {
	const char * path = 0;
	const char * variant = "fi";
	int cache_size;
	
	cache_size = 0;
	bool list_dicts_requested = false;
	bool list_capabilities_requested = false;
	// First pass: options that must be known before the Voikko instances are
	// created (or that make the program exit without creating them).
	// Arguments not recognized here are left for the second pass.
	for (int i = 1; i < argc; i++) {
		string args(argv[i]);
		if (args.find("-c") == 0) {
			// The cache size is attached directly to the option, e.g. "-c5".
			cache_size = atoi(argv[i] + 2);
		}
		else if (args == "-p" && i + 1 < argc) {
			path = argv[++i];
		}
		else if (args == "-d" && i + 1 < argc) {
			variant = argv[++i];
		}
		else if (args == "-h" || args == "--help") {
			printHelp();
			exit(0);
		}
		else if (args == "--version") {
			printVersion();
			exit(0);
		}
		else if (args == "-l") {
			list_dicts_requested = true;
		}
		else if (args == "-L") {
			list_capabilities_requested = true;
		}
		else if (args == "-j") {
			#ifdef HAVE_PTHREAD
				if (i + 1 == argc) {
					cerr << "-j must be followed by number of threads" << endl;
					return 1;
				}
				threadCount = atoi(argv[++i]);
				if (threadCount <= 0 || threadCount > MAX_THREADS) {
					cerr << "Number of threads must be between 1 and " << MAX_THREADS << endl;
					return 1;
				}
			#else
				cerr << "Support for threaded operation is not available" << endl;
				return 1;
			#endif
		}
	}
	
	// Listing requests are served without initializing any Voikko instance.
	if (list_dicts_requested) {
		return list_dicts(path);
	}
	
	if (list_capabilities_requested) {
		return list_capabilities(path);
	}
	
	// One independent Voikko instance per thread.
	spellers = new speller_t[threadCount];
	for (int i = 0; i < threadCount; i++) {
		const char * voikkoError;
		VoikkoHandle * handle = voikkoInit(&voikkoError, variant, path);
		if (!handle) {
			// NOTE(review): instances created on earlier iterations are not
			// terminated on this path; the process exits right after.
			cerr << "E: Initialization of Voikko failed: " << voikkoError << endl;
			return 1;
		}
		voikkoSetIntegerOption(handle, VOIKKO_SPELLER_CACHE_SIZE, cache_size);
		spellers[i].handle = handle;
		spellers[i].words = 0;
	}
	
	// Second pass: output mode flags and options that are applied to the
	// Voikko instances. Options handled in the first pass are skipped here
	// and anything else is reported as an unknown option.
	for (int i = 1; i < argc; i++) {
		string args(argv[i]);
		if (args == "-t") {
			checkMode = ONLY_C_W;
		} else if (args == "-tt") {
			checkMode = ONLY_INCORRECT;
		} else if (args == "ignore_dot=1")
			setBooleanOption(VOIKKO_OPT_IGNORE_DOT, 1);
		else if (args == "ignore_dot=0")
			setBooleanOption(VOIKKO_OPT_IGNORE_DOT, 0);
		else if (args == "ignore_numbers=1")
			setBooleanOption(VOIKKO_OPT_IGNORE_NUMBERS, 1);
		else if (args == "ignore_numbers=0")
			setBooleanOption(VOIKKO_OPT_IGNORE_NUMBERS, 0);
		else if (args == "ignore_nonwords=1")
			setBooleanOption(VOIKKO_OPT_IGNORE_NONWORDS, 1);
		else if (args == "ignore_nonwords=0")
			setBooleanOption(VOIKKO_OPT_IGNORE_NONWORDS, 0);
		else if (args == "accept_first_uppercase=1")
			setBooleanOption(VOIKKO_OPT_ACCEPT_FIRST_UPPERCASE, 1);
		else if (args == "accept_first_uppercase=0")
			setBooleanOption(VOIKKO_OPT_ACCEPT_FIRST_UPPERCASE, 0);
		else if (args == "accept_all_uppercase=1")
			setBooleanOption(VOIKKO_OPT_ACCEPT_ALL_UPPERCASE, 1);
		else if (args == "accept_all_uppercase=0")
			setBooleanOption(VOIKKO_OPT_ACCEPT_ALL_UPPERCASE, 0);
		else if (args == "accept_extra_hyphens=1")
			setBooleanOption(VOIKKO_OPT_ACCEPT_EXTRA_HYPHENS, 1);
		else if (args == "accept_extra_hyphens=0")
			setBooleanOption(VOIKKO_OPT_ACCEPT_EXTRA_HYPHENS, 0);
		else if (args == "accept_missing_hyphens=1")
			setBooleanOption(VOIKKO_OPT_ACCEPT_MISSING_HYPHENS, 1);
		else if (args == "accept_missing_hyphens=0")
			setBooleanOption(VOIKKO_OPT_ACCEPT_MISSING_HYPHENS, 0);
		else if (args == "ocr_suggestions=1")
			setBooleanOption(VOIKKO_OPT_OCR_SUGGESTIONS, 1);
		else if (args == "ocr_suggestions=0")
			setBooleanOption(VOIKKO_OPT_OCR_SUGGESTIONS, 0);
		else if (args.find("-x") == 0) {
			// "-x" enables one-line output; a third character, if present,
			// replaces the default separator. Suggestions containing spaces
			// are only printed when the separator is not a space.
			oneLineOutput = true;
			if (args.size() == 3) {
				wordSeparator = argv[i][2];
			}
			space = (wordSeparator != ' ');
		}
		else if (args == "-s") {
			suggest = true;
		}
		else if (args == "-m") {
			morphology = true;
		}
		else if (args == "-M") {
			morphology = true;
			checkMode = NONE;
		}
		else if (args.find("-c") == 0) {
			// Already handled in the first pass.
			continue;
		}
		else if (args == "-p" || args == "-d" || args == "-j") {
			// Already handled in the first pass; skip the option value too.
			// NOTE(review): a trailing "-p" or "-d" without a value is not
			// matched in the first pass and is silently accepted here.
			i++;
			continue;
		}
		else {
			cerr << "Unknown option " << args << endl;
			return 1;
		}
	}
	
	wchar_t * line = new wchar_t[MAX_WORD_LENGTH + 1];
	
	// Use stdout in wide character mode and stderr in narrow character mode.
	setlocale(LC_ALL, "");
	wcout.imbue(locale(""));
	fwide(stdout, 1);
	fwide(stderr, -1);
	initThreads();
	// Each input line is treated as one word.
	while (fgetws(line, MAX_WORD_LENGTH, stdin)) {
		size_t lineLen = wcslen(line);
		if (lineLen == 0) {
			continue;
		}
		// Strip the trailing newline, if any.
		if (line[lineLen - 1] == L'\n') {
			line[lineLen - 1] = L'\0';
			lineLen--;
		}
		if (lineLen > LIBVOIKKO_MAX_WORD_CHARS) {
			cerr << "E: Too long word" << endl;
			continue;
		}
		handleWord(line);
	}
	// Process the last partial block and wait for all threads to finish.
	finishProcessing();
	int error = ferror(stdin);
	if (error) {
		// NOTE(review): the read error is reported but the exit status stays 0.
		cerr << "E: Error while reading from stdin" << endl;
	}
	delete[] line;
	
	for (int i = 0; i < threadCount; i++) {
		voikkoTerminate(spellers[i].handle);
	}
	delete[] spellers;
	#ifdef HAVE_PTHREAD
		delete nextBlock;
		delete[] threads;
	#endif
	return 0;
}