Blame Lib/encodings/punycode.py

rpm-build 2bd099
""" Codec for the Punicode encoding, as specified in RFC 3492
rpm-build 2bd099
rpm-build 2bd099
Written by Martin v. Löwis.
rpm-build 2bd099
"""
rpm-build 2bd099
rpm-build 2bd099
import codecs
rpm-build 2bd099
rpm-build 2bd099
##################### Encoding #####################################
rpm-build 2bd099
rpm-build 2bd099
def segregate(str):
    """3.1 Basic code point segregation

    Split *str* into its ASCII and non-ASCII parts.

    Returns a pair ``(base, extended)``: ``base`` is a bytes object with
    the code points below 128 in their original order, ``extended`` is a
    sorted list of the distinct characters with code point 128 or above.
    """
    ascii_part = bytearray()
    non_ascii = set()
    for ch in str:
        code = ord(ch)
        if code >= 128:
            non_ascii.add(ch)
        else:
            ascii_part.append(code)
    return bytes(ascii_part), sorted(non_ascii)
rpm-build 2bd099
rpm-build 2bd099
def selective_len(str, max):
    """Count the characters of str whose ordinal is strictly below max."""
    return sum(1 for ch in str if ord(ch) < max)
rpm-build 2bd099
rpm-build 2bd099
def selective_find(str, char, index, pos):
    """Locate the next occurrence of char in str after position pos.

    The scan starts at pos + 1.  While scanning, index is advanced once
    for every character that compares less than char.  On a match the
    pair (index + 1, pos_of_match) is returned; when the end of the
    string is reached without a match, (-1, -1) is returned.
    """
    end = len(str)
    pos += 1
    while pos != end:
        current = str[pos]
        if current == char:
            return index + 1, pos
        if current < char:
            # Smaller characters count towards the selective index.
            index += 1
        pos += 1
    return (-1, -1)
rpm-build 2bd099
rpm-build 2bd099
def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding

    For every character in extended (taken in the given order) and for
    every occurrence of it in str, emit one delta.  Returns the list of
    deltas.
    """
    deltas = []
    prev_code = 0x80
    prev_index = -1
    for ch in extended:
        code = ord(ch)
        # Initial delta: (characters below code, plus one) times the gap
        # between this code point and the previously handled one.
        delta = (selective_len(str, code) + 1) * (code - prev_code)
        index, pos = selective_find(str, ch, -1, -1)
        while index != -1:
            delta += index - prev_index
            deltas.append(delta - 1)
            prev_index = index
            delta = 0
            index, pos = selective_find(str, ch, index, pos)
        prev_code = code

    return deltas
rpm-build 2bd099
rpm-build 2bd099
def T(j, bias):
    """Return the threshold for digit position j, clamped to [1, 26]."""
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    return min(max(36 * (j + 1) - bias, 1), 26)
rpm-build 2bd099
rpm-build 2bd099
# Digit alphabet: values 0-25 map to a-z, values 26-35 map to 0-9.
digits = b"abcdefghijklmnopqrstuvwxyz0123456789"
def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers

    Encode the integer N as a bytes object of digits, using the
    per-position thresholds given by T(j, bias).
    """
    encoded = bytearray()
    j = 0
    t = T(j, bias)
    while N >= t:
        # Emit the low-order digit for this position and carry the rest.
        N, remainder = divmod(N - t, 36 - t)
        encoded.append(digits[t + remainder])
        j += 1
        t = T(j, bias)
    # The final digit is the first one below its threshold.
    encoded.append(digits[N])
    return bytes(encoded)
rpm-build 2bd099
rpm-build 2bd099
def adapt(delta, first, numchars):
    """Compute the bias to use for the next delta.

    delta is the delta just processed, first tells whether it was the
    first one (which is damped by 700 instead of 2), and numchars is the
    divisor used for the proportional increase of delta.
    """
    damp = 700 if first else 2
    delta //= damp
    delta += delta // numchars
    # ((base - tmin) * tmax) // 2 == 455
    divisions = 0
    while delta > 455:
        delta //= 35  # base - tmin
        divisions += 36
    return divisions + (36 * delta // (delta + 38))
rpm-build 2bd099
rpm-build 2bd099
rpm-build 2bd099
def generate_integers(baselen, deltas):
    """3.4 Bias adaptation

    Encode each delta as a generalized variable-length integer, adapting
    the bias after every one, and return the concatenated bytes.
    """
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    encoded = bytearray()
    bias = 72
    for count, delta in enumerate(deltas, 1):
        encoded += generate_generalized_integer(delta, bias)
        # count is the number of deltas emitted so far, this one included.
        bias = adapt(delta, count == 1, baselen + count)
    return bytes(encoded)
rpm-build 2bd099
rpm-build 2bd099
def punycode_encode(text):
    """Encode the string text as punycode and return it as bytes.

    When text contains ASCII characters, they come first, followed by a
    b"-" separator and the encoded deltas; otherwise only the encoded
    deltas are returned.
    """
    base, extended = segregate(text)
    encoded = generate_integers(len(base), insertion_unsort(text, extended))
    if not base:
        return encoded
    return base + b"-" + encoded
rpm-build 2bd099
rpm-build 2bd099
##################### Decoding #####################################
rpm-build 2bd099
rpm-build 2bd099
def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers

    Decode one generalized variable-length integer from the string
    extended, starting at index extpos.  Digits are the upper-case
    letters A-Z (values 0-25) and 0-9 (values 26-35).

    Returns a pair (newpos, value) where newpos is the index just past
    the last digit consumed.  In non-strict mode, value is None when the
    input ends prematurely or contains an invalid digit.

    Raises UnicodeError in strict mode for truncated input or an invalid
    digit.
    """
    result = 0
    w = 1
    j = 0
    while 1:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError("incomplete punicode string")
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A: # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:
            digit = char - 22 # 0x30-26
        elif errors == "strict":
            # extpos was already advanced past the offending character,
            # so report extended[extpos-1]; indexing extended[extpos]
            # would name the wrong character or raise IndexError.
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos-1])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            # A digit below the threshold terminates the number.
            return extpos, result
        w = w * (36 - t)
        j += 1
rpm-build 2bd099
rpm-build 2bd099
rpm-build 2bd099
def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding

    Decode the deltas in extended one by one and insert the resulting
    characters into the string base.  Returns the final string; in
    non-strict mode decoding stops at the first undecodable delta and
    the string built so far is returned.

    Raises UnicodeError in strict mode when a decoded code point exceeds
    U+10FFFF.
    """
    char = 0x80
    pos = -1
    bias = 72
    extpos = 0
    total = len(extended)
    while extpos < total:
        newpos, delta = decode_generalized_number(extended, extpos,
                                                  bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        # Advance by delta + 1 positions; every wrap past the end of the
        # current string moves on to the next code point.
        wraps, pos = divmod(pos + delta + 1, len(base) + 1)
        char += wraps
        if char > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError("Invalid character U+%x" % char)
            char = ord('?')
        base = base[:pos] + chr(char) + base[pos:]
        bias = adapt(delta, (extpos == 0), len(base))
        extpos = newpos
    return base
rpm-build 2bd099
rpm-build 2bd099
def punycode_decode(text, errors):
    """Decode punycode input text and return the resulting string.

    str input is first encoded to ASCII and memoryview input is copied
    to bytes.  Everything before the last b"-" is the literal ASCII
    part; the remainder (upper-cased) holds the encoded deltas.
    """
    if isinstance(text, str):
        text = text.encode("ascii")
    if isinstance(text, memoryview):
        text = bytes(text)
    dash = text.rfind(b"-")
    if dash != -1:
        base = str(text[:dash], "ascii", errors)
    else:
        base = ""
    # With no separator dash is -1, so the slice covers the whole input.
    extended = str(text[dash+1:], "ascii").upper()
    return insertion_sort(base, extended, errors)
rpm-build 2bd099
rpm-build 2bd099
### Codec APIs
rpm-build 2bd099
rpm-build 2bd099
class Codec(codecs.Codec):
    # Stateless encoder/decoder pair for punycode.

    def encode(self, input, errors='strict'):
        # Returns (encoded bytes, number of input characters consumed).
        return punycode_encode(input), len(input)

    def decode(self, input, errors='strict'):
        # Only these three error handlers are supported when decoding.
        if errors in ('strict', 'replace', 'ignore'):
            return punycode_decode(input, errors), len(input)
        raise UnicodeError("Unsupported error handling "+errors)
rpm-build 2bd099
rpm-build 2bd099
class IncrementalEncoder(codecs.IncrementalEncoder):
    # Each call encodes its input independently; final is not consulted.
    def encode(self, input, final=False):
        encoded = punycode_encode(input)
        return encoded
rpm-build 2bd099
rpm-build 2bd099
class IncrementalDecoder(codecs.IncrementalDecoder):
    # Each call decodes its input independently; final is not consulted.
    def decode(self, input, final=False):
        handler = self.errors
        if handler in ('strict', 'replace', 'ignore'):
            return punycode_decode(input, handler)
        raise UnicodeError("Unsupported error handling "+handler)
rpm-build 2bd099
rpm-build 2bd099
class StreamWriter(Codec, codecs.StreamWriter):
    # Combines the punycode Codec with the generic stream writer.
    pass
rpm-build 2bd099
rpm-build 2bd099
class StreamReader(Codec, codecs.StreamReader):
    # Combines the punycode Codec with the generic stream reader.
    pass
rpm-build 2bd099
rpm-build 2bd099
### encodings module API
rpm-build 2bd099
rpm-build 2bd099
def getregentry():
    """Return the codecs.CodecInfo entry describing the punycode codec."""
    encoder = Codec().encode
    decoder = Codec().decode
    return codecs.CodecInfo(
        name='punycode',
        encode=encoder,
        decode=decoder,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )