Blob Blame History Raw
/* The contents of this file are subject to the Mozilla Public License Version 
 * 1.1 (the "License"); you may not use this file except in compliance with 
 * the License. You may obtain a copy of the License at 
 * http://www.mozilla.org/MPL/
 * 
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 * 
 * The Original Code is Libvoikko: Library of natural language processing tools.
 * The Initial Developer of the Original Code is Harri Pitkänen <hatapitk@iki.fi>.
 * Portions created by the Initial Developer are Copyright (C) 2008 - 2011
 * the Initial Developer. All Rights Reserved.
 * 
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *********************************************************************************/

#include "grammar/HfstAnalysis.hpp"
#include "tokenizer/Tokenizer.hpp"
#include "sentence/Sentence.hpp"
#include "utils/StringUtils.hpp"
#include "utils/utils.hpp"
#include <cstdlib>
#include <cstring>
#include <stdio.h>

//#include <hfst/HfstTransducer.h>
//#include <hfst-ol.h>

#include <iostream>  
#include <fstream>
#include <set>
#include <vector>

using namespace libvoikko::grammar;
using namespace hfst_ol;
//using namespace hfst;

namespace libvoikko {

HfstAnalysis::HfstAnalysis(morphology::Analyzer * a, voikko_options_t * options) : analyser(a), voikkoOptions(options) {
}


HfstAnalysis::~HfstAnalysis() {

}


/** Analyse given text token. Token type, length and text must have already
 *  been set. */
void HfstAnalysis::analyseToken(Token * token) {
	token->isValidWord = false;
	
	wchar_t * wordBuffer =
	    utils::StringUtils::stripSpecialCharsForMalaga(token->str,
	                                                   token->tokenlen);
	const std::string swordBuffer(utils::StringUtils::utf8FromUcs4(wordBuffer));
	fprintf(stderr, "HfstAnalysis::analyseToken (%ls)\n", wordBuffer);

	token->analyses = analyser->analyze(wordBuffer, wcslen(wordBuffer), false);

	//int num_analyses = 0;
	//num_analyses = token->analyses->size();	
	//fprintf(stderr, "HfstAnalysis::analyseToken (num: %d)\n", num_analyses);
}

/** Analyse sentence text. Sentence type must be set by the caller. */
Sentence * HfstAnalysis::analyseSentence(const wchar_t * text, size_t textlen, size_t sentencepos) {
	Sentence * s = new Sentence;
	s->pos = sentencepos;
	size_t tokenlen;
	const wchar_t * pos = text;
	size_t remaining = textlen;
	bool next_word_is_possible_sentence_start = false;
	for (int i = 0; i < Sentence::MAX_TOKENS_IN_SENTENCE; i++) {
		enum voikko_token_type tt;
		int ignore_dot_saved = voikkoOptions->ignore_dot;
		voikkoOptions->ignore_dot = 0;
		tt = tokenizer::Tokenizer::nextToken(voikkoOptions, pos, remaining, &tokenlen);
		voikkoOptions->ignore_dot = ignore_dot_saved;
		if (tt == TOKEN_NONE) return s;

		s->tokens[i].type = tt;
		s->tokens[i].tokenlen = tokenlen;
		wchar_t * tstr = new wchar_t[tokenlen + 1];
		if (!tstr) break;
		memcpy(tstr, pos, tokenlen * sizeof(wchar_t));
		tstr[tokenlen] = L'\0';
		s->tokens[i].str = tstr;
		s->tokens[i].pos = sentencepos + (pos - text);
		analyseToken(s->tokens + i);
		
		if (next_word_is_possible_sentence_start && tt == TOKEN_WORD) {
			s->tokens[i].possibleSentenceStart = true;
			next_word_is_possible_sentence_start = false;
		}
		else if (tt == TOKEN_PUNCTUATION &&
		         ((tokenlen == 1 && wcschr(L".:\u2026\u2013\u2014", tstr[0]) != 0)
		          || tokenlen == 3)) { // . : ... may separate sentences
			next_word_is_possible_sentence_start = true;
		}
		
		s->tokenCount++;
		pos += tokenlen;
		remaining -= tokenlen;
		if (!remaining) return s;
	}
	// Too long sentence or error
	delete s;
	return 0;
}


Paragraph * HfstAnalysis::analyseParagraph(const wchar_t * text, size_t textlen) {
	Paragraph * p = new Paragraph;
	const wchar_t * pos = text;
	size_t remaining = textlen;
	enum voikko_sentence_type st;
	do {
		const wchar_t * pos2 = pos;
		size_t sentencelen = 0;
		do {
			size_t sentencelen2;
			st = sentence::Sentence::next(voikkoOptions, pos2, remaining,
			                                     &sentencelen2);
			pos2 += sentencelen2;
			sentencelen += sentencelen2;
			remaining -= sentencelen2;
		} while (st == SENTENCE_POSSIBLE);
		
		Sentence * s = analyseSentence(pos, sentencelen, pos - text);
		if (!s) {
			delete p;
			return 0;
		}
		s->type = st;
		p->sentences[p->sentenceCount++] = s;
		pos += sentencelen;
	} while (st != SENTENCE_NONE && st != SENTENCE_NO_START &&
	         p->sentenceCount < Paragraph::MAX_SENTENCES_IN_PARAGRAPH);
	return p;
}

}