// Copyright (c) 1994 James Clark // See the file COPYING for copying permission. #ifdef __GNUG__ #pragma implementation #endif #include "splib.h" #include "types.h" #include "macros.h" #include "StringOf.h" #include "Trie.h" #include "TrieBuilder.h" #include "Priority.h" #include #ifdef SP_NAMESPACE namespace SP_NAMESPACE { #endif Trie::~Trie() { if (next_) delete [] next_; } Trie::Trie(const Trie &t) : nCodes_(t.nCodes_), token_(t.token_), tokenLength_(t.tokenLength_), priority_(t.priority_), blank_(t.blank_) { if (t.next_) { next_ = new Trie[nCodes_]; for (int i = 0; i < nCodes_; i++) next_[i] = t.next_[i]; } else next_ = 0; } Trie &Trie::operator=(const Trie &t) { if (next_) delete [] next_; nCodes_ = t.nCodes_; token_ = t.token_; tokenLength_ = t.tokenLength_; priority_ = t.priority_; blank_ = t.blank_; if (t.next_) { next_ = new Trie[nCodes_]; for (int i = 0; i < nCodes_; i++) next_[i] = t.next_[i]; } else next_ = 0; return *this; } TrieBuilder::TrieBuilder(int nCodes) : nCodes_(nCodes), root_(new Trie) { root_->token_ = 0; root_->tokenLength_ = 0; root_->priority_ = Priority::data; root_->nCodes_ = nCodes; } void TrieBuilder::recognize(const String &chars, Token t, Priority::Type priority, TokenVector &ambiguities) { setToken(extendTrie(root_.pointer(), chars), chars.size(), t, priority, ambiguities); } void TrieBuilder::recognize(const String &chars, const String &set, Token t, Priority::Type priority, TokenVector &ambiguities) { Trie *trie = extendTrie(root_.pointer(), chars); for (size_t i = 0; i < set.size(); i++) setToken(forceNext(trie, set[i]), chars.size() + 1, t, priority, ambiguities); } void TrieBuilder::recognizeB(const String &chars, int bSequenceLength, size_t maxBlankSequence, const String &blankCodes, const String &chars2, Token token, TokenVector &ambiguities) { doB(extendTrie(root_.pointer(), chars), chars.size(), bSequenceLength, maxBlankSequence, blankCodes, chars2, token, Priority::blank(bSequenceLength), ambiguities); } void TrieBuilder::recognizeEE(EquivCode code, Token t) { Trie *trie = forceNext(root_.pointer(), code); trie->tokenLength_ = 0; // it has length 0 in the buffer trie->token_ = t; trie->priority_ = Priority::data; } void TrieBuilder::doB(Trie *trie, int tokenLength, int minBLength, size_t maxLength, const String &blankCodes, const String &chars2, Token token, Priority::Type pri, TokenVector &ambiguities) { if (minBLength == 0 && trie->next_ == 0) { if (!trie->blank_) { BlankTrie *b = new BlankTrie; trie->blank_ = b; b->maxBlanksToScan_ = maxLength; b->additionalLength_ = tokenLength; b->codeIsBlank_.assign(nCodes_, 0); for (size_t i = 0; i < blankCodes.size(); i++) b->codeIsBlank_[blankCodes[i]] = 1; b->tokenLength_ = 0; b->token_ = 0; b->priority_ = Priority::data; b->nCodes_ = nCodes_; } else { // A B sequence is not allowed to be adjacent to a character // that can occur in a blank sequence, so maxLength will be // the same at a node, no matter how we got there. ASSERT(trie->blank_->maxBlanksToScan_ == maxLength); ASSERT(trie->blank_->additionalLength_ == tokenLength); } if (chars2.size() == 0) setToken(trie, tokenLength, token, pri, ambiguities); else setToken(extendTrie(trie->blank_.pointer(), chars2), chars2.size(), token, pri, ambiguities); } else { if (minBLength == 0) setToken(extendTrie(trie, chars2), tokenLength + chars2.size(), token, pri, ambiguities); for (size_t i = 0; i < blankCodes.size(); i++) doB(forceNext(trie, blankCodes[i]), tokenLength + 1, minBLength == 0 ? 0 : minBLength - 1, maxLength - 1, blankCodes, chars2, token, pri, ambiguities); } } Trie *TrieBuilder::extendTrie(Trie *trie, const String &s) { for (size_t i = 0; i < s.size(); i++) trie = forceNext(trie, s[i]); return trie; } void TrieBuilder::setToken(Trie *trie, int tokenLength, Token token, Priority::Type pri, TokenVector &ambiguities) { if (tokenLength > trie->tokenLength_ || (tokenLength == trie->tokenLength_ && pri > trie->priority_)) { trie->tokenLength_ = tokenLength; trie->token_ = token; trie->priority_ = pri; } else if (trie->tokenLength_ == tokenLength && trie->priority_ == pri && trie->token_ != token && trie->token_ != 0) { ambiguities.push_back(Token(trie->token_)); ambiguities.push_back(token); } if (trie->hasNext()) { for (int i = 0; i < nCodes_; i++) setToken(&trie->next_[i], tokenLength, token, pri, ambiguities); } } void TrieBuilder::copyInto(Trie *into, const Trie *from, int additionalLength) { if (from->token_ != 0) { TokenVector ambiguities; setToken(into, from->tokenLength_ + additionalLength, from->token_, from->priority_, ambiguities); ASSERT(ambiguities.size() == 0); } if (from->hasNext()) for (int i = 0; i < nCodes_; i++) copyInto(forceNext(into, i), &from->next_[i], additionalLength); } Trie *TrieBuilder::forceNext(Trie *trie, EquivCode c) { if (!trie->hasNext()) { trie->next_ = new Trie[nCodes_]; if (trie->blank_) { trie->blank_->additionalLength_ += 1; trie->blank_->maxBlanksToScan_ -= 1; } Owner blankOwner(trie->blank_.extract()); const BlankTrie *b = blankOwner.pointer(); for (int i = 0; i < nCodes_; i++) { Trie *p = &trie->next_[i]; if (b && b->codeIsBlank(i)) trie->next_[i].blank_ = (blankOwner ? blankOwner.extract() : new BlankTrie(*b)); p->token_ = trie->token_; p->tokenLength_ = trie->tokenLength_; p->priority_ = trie->priority_; p->nCodes_ = nCodes_; } if (b) // -1 because 1 was added above copyInto(trie, b, b->additionalLength_ - 1); } return &trie->next_[c]; } #ifdef SP_NAMESPACE } #endif