Blob Blame History Raw
import re

from texcodec import LatexCodec, TexCodec
from texhyphen import BasicHyphenator, UrlHyphenator


def utf8(u):
    return u.encode("utf8")

class RawKey:
    def __init__(self, key, incr):
        self.key = key
        self.depth = incr
        self.pos = -1
        self.len = len(key)

class RawLatexParser:
    def __init__(self,
                 key_in=utf8(u"\u0370t"), key_out=utf8(u"\u0371t"),
                 codec=None, output_encoding="latin-1"):
        self.key_in = RawKey(key_in, 1)
        self.key_out = RawKey(key_out, -1)
        self.depth = 0
        self.hyphenate = 0
        self.codec = codec or LatexCodec(output_encoding=output_encoding)
        #self.hyphenator = BasicHyphenator(codec=self.codec)
        self.hyphenator = UrlHyphenator(codec=self.codec)
        
        # hyphenation patterns
        self.hypon = re.compile(utf8(u"\u0370h"))
        self.hypof = re.compile(utf8(u"\u0371h"))

    def parse(self, line):
        lout = ""
        while (line):
            self.key_in.pos = line.find(self.key_in.key)
            self.key_out.pos = line.find(self.key_out.key)

            if (self.key_out.pos == -1 or
                (self.key_in.pos >= 0 and
                (self.key_in.pos < self.key_out.pos))):
                key = self.key_in
            else:
                key = self.key_out

            if key.pos != -1:
                text = line[:key.pos]
                line = line[key.pos + key.len:]
            else:
                text = line
                line = ""

            if (text):
                if self.depth > 0:
                    lout += self.translate(text)
                else:
                    text, hon = self.hypon.subn("", text)
                    text, hof = self.hypof.subn("", text)
                    self.hyphenate += (hon - hof)
                    lout += text

            if key.pos != -1:
                self.depth += key.depth

        return lout

    def translate(self, text):
        # Now hyphenate if needed
        if self.hyphenate:
            text = self.hyphenator.hyphenate(text)
        else:
            text = self.codec.decode(text)
            text = self.codec.encode(text)
        return text


class RawUtfParser(RawLatexParser):
    "Just encode from UTF-8 without latex escaping"

    def __init__(self, codec=None, output_encoding="latin-1"):
        texcodec = codec or TexCodec(output_encoding=output_encoding)
        RawLatexParser.__init__(self, utf8(u"\u0370u"), utf8(u"\u0371u"),
                                texcodec)

    def translate(self, text):
        # Currently no hyphenation stuff, just encode
        text = self.codec.decode(text)
        return self.codec.encode(text)