Blame vendor/golang.org/x/text/unicode/norm/forminfo.go

Packit Service 4d2de5
// Copyright 2011 The Go Authors. All rights reserved.
Packit Service 4d2de5
// Use of this source code is governed by a BSD-style
Packit Service 4d2de5
// license that can be found in the LICENSE file.
Packit Service 4d2de5
Packit Service 4d2de5
package norm
Packit Service 4d2de5
Packit Service 4d2de5
import "encoding/binary"
Packit Service 4d2de5
Packit Service 4d2de5
// This file contains Form-specific logic and wrappers for data in tables.go.
Packit Service 4d2de5
Packit Service 4d2de5
// Rune info is stored in a separate trie per composing form. A composing form
Packit Service 4d2de5
// and its corresponding decomposing form share the same trie.  Each trie maps
Packit Service 4d2de5
// a rune to a uint16. The values take two forms.  For v >= 0x8000:
Packit Service 4d2de5
//   bits
Packit Service 4d2de5
//   15:    1 (inverse of NFD_QC bit of qcInfo)
Packit Service 4d2de5
//   13..7: qcInfo (see below). isYesD is always true (no decomposition).
Packit Service 4d2de5
//    6..0: ccc (compressed CCC value).
Packit Service 4d2de5
// For v < 0x8000, the respective rune has a decomposition and v is an index
Packit Service 4d2de5
// into a byte array of UTF-8 decomposition sequences and additional info and
Packit Service 4d2de5
// has the form:
Packit Service 4d2de5
//    <header> <decomp_byte>* [<tccc> [<lccc>]]
Packit Service 4d2de5
// The header contains the number of bytes in the decomposition (excluding this
Packit Service 4d2de5
// length byte). The two most significant bits of this length byte correspond
Packit Service 4d2de5
// to bit 5 and 4 of qcInfo (see below).  The byte sequence itself starts at v+1.
Packit Service 4d2de5
// The byte sequence is followed by a trailing and leading CCC if the values
Packit Service 4d2de5
// for these are not zero.  The value of v determines which ccc are appended
Packit Service 4d2de5
// to the sequences.  For v < firstCCC, there are none, for v >= firstCCC,
Packit Service 4d2de5
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
Packit Service 4d2de5
// there is an additional leading ccc. The value of tccc itself is the
Packit Service 4d2de5
// trailing CCC shifted left 2 bits. The two least-significant bits of tccc
Packit Service 4d2de5
// are the number of trailing non-starters.
Packit Service 4d2de5
Packit Service 4d2de5
// Bit masks used when decoding trie values and decomposition headers.
const (
	// qcInfoMask keeps the six low bits that make up a qcInfo value.
	qcInfoMask = 1<<6 - 1
	// headerLenMask keeps the six low bits of a header byte, which hold
	// the decomposition length.
	headerLenMask = 1<<6 - 1
	// headerFlagsMask keeps the two high bits of a header byte, which
	// hold qcInfo bits.
	headerFlagsMask = 3 << 6
)
Packit Service 4d2de5
Packit Service 4d2de5
// Properties provides access to normalization properties of a rune.
Packit Service 4d2de5
type Properties struct {
Packit Service 4d2de5
	pos   uint8  // start position in reorderBuffer; used in composition.go
Packit Service 4d2de5
	size  uint8  // length of UTF-8 encoding of this rune
Packit Service 4d2de5
	ccc   uint8  // leading canonical combining class (ccc if not decomposition)
Packit Service 4d2de5
	tccc  uint8  // trailing canonical combining class (ccc if not decomposition)
Packit Service 4d2de5
	nLead uint8  // number of leading non-starters.
Packit Service 4d2de5
	flags qcInfo // quick check flags
Packit Service 4d2de5
	index uint16
Packit Service 4d2de5
}
Packit Service 4d2de5
Packit Service 4d2de5
// functions dispatchable per form
Packit Service 4d2de5
// lookupFunc looks up the Properties for position i of input b.
type lookupFunc func(b input, i int) Properties
Packit Service 4d2de5
Packit Service 4d2de5
// formInfo holds Form-specific functions and tables.
Packit Service 4d2de5
type formInfo struct {
Packit Service 4d2de5
	form                     Form
Packit Service 4d2de5
	composing, compatibility bool // form type
Packit Service 4d2de5
	info                     lookupFunc
Packit Service 4d2de5
	nextMain                 iterFunc
Packit Service 4d2de5
}
Packit Service 4d2de5
Packit Service 4d2de5
var formTable = []*formInfo{{
Packit Service 4d2de5
	form:          NFC,
Packit Service 4d2de5
	composing:     true,
Packit Service 4d2de5
	compatibility: false,
Packit Service 4d2de5
	info:          lookupInfoNFC,
Packit Service 4d2de5
	nextMain:      nextComposed,
Packit Service 4d2de5
}, {
Packit Service 4d2de5
	form:          NFD,
Packit Service 4d2de5
	composing:     false,
Packit Service 4d2de5
	compatibility: false,
Packit Service 4d2de5
	info:          lookupInfoNFC,
Packit Service 4d2de5
	nextMain:      nextDecomposed,
Packit Service 4d2de5
}, {
Packit Service 4d2de5
	form:          NFKC,
Packit Service 4d2de5
	composing:     true,
Packit Service 4d2de5
	compatibility: true,
Packit Service 4d2de5
	info:          lookupInfoNFKC,
Packit Service 4d2de5
	nextMain:      nextComposed,
Packit Service 4d2de5
}, {
Packit Service 4d2de5
	form:          NFKD,
Packit Service 4d2de5
	composing:     false,
Packit Service 4d2de5
	compatibility: true,
Packit Service 4d2de5
	info:          lookupInfoNFKC,
Packit Service 4d2de5
	nextMain:      nextDecomposed,
Packit Service 4d2de5
}}
Packit Service 4d2de5
Packit Service 4d2de5
// We do not distinguish between boundaries for NFC, NFD, etc. to avoid
Packit Service 4d2de5
// unexpected behavior for the user.  For example, in NFD, there is a boundary
Packit Service 4d2de5
// after 'a'.  However, 'a' might combine with modifiers, so from the application's
Packit Service 4d2de5
// perspective it is not a good boundary. We will therefore always use the
Packit Service 4d2de5
// boundaries for the combining variants.
Packit Service 4d2de5
Packit Service 4d2de5
// BoundaryBefore returns true if this rune starts a new segment and
Packit Service 4d2de5
// cannot combine with any rune on the left.
Packit Service 4d2de5
func (p Properties) BoundaryBefore() bool {
Packit Service 4d2de5
	if p.ccc == 0 && !p.combinesBackward() {
Packit Service 4d2de5
		return true
Packit Service 4d2de5
	}
Packit Service 4d2de5
	// We assume that the CCC of the first character in a decomposition
Packit Service 4d2de5
	// is always non-zero if different from info.ccc and that we can return
Packit Service 4d2de5
	// false at this point. This is verified by maketables.
Packit Service 4d2de5
	return false
Packit Service 4d2de5
}
Packit Service 4d2de5
Packit Service 4d2de5
// BoundaryAfter returns true if runes cannot combine with or otherwise
Packit Service 4d2de5
// interact with this or previous runes.
Packit Service 4d2de5
func (p Properties) BoundaryAfter() bool {
Packit Service 4d2de5
	// TODO: loosen these conditions.
Packit Service 4d2de5
	return p.isInert()
Packit Service 4d2de5
}
Packit Service 4d2de5
Packit Service 4d2de5
// We pack quick check data in 6 bits:
Packit Service 4d2de5
//   5:    Combines forward  (0 == false, 1 == true)
Packit Service 4d2de5
//   4..3: NFC_QC Yes(00), No (10), or Maybe (11)
Packit Service 4d2de5
//   2:    NFD_QC Yes (0) or No (1). No also means there is a decomposition.
Packit Service 4d2de5
//   1..0: Number of trailing non-starters.
Packit Service 4d2de5
//
Packit Service 4d2de5
// When all 6 bits are zero, the character is inert, meaning it is never
Packit Service 4d2de5
// influenced by normalization.
Packit Service 4d2de5
// qcInfo is the packed quick check bit field whose layout is given above.
type qcInfo uint8
Packit Service 4d2de5
Packit Service 4d2de5
// isYesC reports whether the upper NFC_QC bit is clear, i.e. the value is
// neither No nor Maybe.
func (p Properties) isYesC() bool {
	const nfcNotYes qcInfo = 0x10
	return p.flags&nfcNotYes == 0
}
Packit Service 4d2de5
// isYesD reports whether the NFD_QC bit is clear (NFD_QC == Yes).
func (p Properties) isYesD() bool {
	const nfdNo qcInfo = 0x4
	return p.flags&nfdNo == 0
}
Packit Service 4d2de5
Packit Service 4d2de5
// combinesForward reports whether the "combines forward" bit is set.
func (p Properties) combinesForward() bool {
	const forwardBit qcInfo = 0x20
	return p.flags&forwardBit != 0
}
Packit Service 4d2de5
// combinesBackward reports whether the lower NFC_QC bit is set, which is
// the same as NFC_QC == Maybe.
func (p Properties) combinesBackward() bool {
	const maybeBit qcInfo = 0x8
	return p.flags&maybeBit != 0
}
Packit Service 4d2de5
// hasDecomposition reports whether the NFD_QC bit is set (NFD_QC == No),
// which also means the rune has a decomposition.
func (p Properties) hasDecomposition() bool {
	const nfdNo qcInfo = 0x4
	return p.flags&nfdNo != 0
}
Packit Service 4d2de5
Packit Service 4d2de5
func (p Properties) isInert() bool {
Packit Service 4d2de5
	return p.flags&qcInfoMask == 0 && p.ccc == 0
Packit Service 4d2de5
}
Packit Service 4d2de5
Packit Service 4d2de5
func (p Properties) multiSegment() bool {
Packit Service 4d2de5
	return p.index >= firstMulti && p.index < endMulti
Packit Service 4d2de5
}
Packit Service 4d2de5
Packit Service 4d2de5
func (p Properties) nLeadingNonStarters() uint8 {
Packit Service 4d2de5
	return p.nLead
Packit Service 4d2de5
}
Packit Service 4d2de5
Packit Service 4d2de5
func (p Properties) nTrailingNonStarters() uint8 {
Packit Service 4d2de5
	return uint8(p.flags & 0x03)
Packit Service 4d2de5
}
Packit Service 4d2de5
Packit Service 4d2de5
// Decomposition returns the decomposition for the underlying rune
Packit Service 4d2de5
// or nil if there is none.
Packit Service 4d2de5
func (p Properties) Decomposition() []byte {
Packit Service 4d2de5
	// TODO: create the decomposition for Hangul?
Packit Service 4d2de5
	if p.index == 0 {
Packit Service 4d2de5
		return nil
Packit Service 4d2de5
	}
Packit Service 4d2de5
	i := p.index
Packit Service 4d2de5
	n := decomps[i] & headerLenMask
Packit Service 4d2de5
	i++
Packit Service 4d2de5
	return decomps[i : i+uint16(n)]
Packit Service 4d2de5
}
Packit Service 4d2de5
Packit Service 4d2de5
// Size returns the length of UTF-8 encoding of the rune.
Packit Service 4d2de5
func (p Properties) Size() int {
Packit Service 4d2de5
	return int(p.size)
Packit Service 4d2de5
}
Packit Service 4d2de5
Packit Service 4d2de5
// CCC returns the canonical combining class of the underlying rune.
Packit Service 4d2de5
func (p Properties) CCC() uint8 {
Packit Service 4d2de5
	if p.index >= firstCCCZeroExcept {
Packit Service 4d2de5
		return 0
Packit Service 4d2de5
	}
Packit Service 4d2de5
	return ccc[p.ccc]
Packit Service 4d2de5
}
Packit Service 4d2de5
Packit Service 4d2de5
// LeadCCC returns the CCC of the first rune in the decomposition.
Packit Service 4d2de5
// If there is no decomposition, LeadCCC equals CCC.
Packit Service 4d2de5
func (p Properties) LeadCCC() uint8 {
Packit Service 4d2de5
	return ccc[p.ccc]
Packit Service 4d2de5
}
Packit Service 4d2de5
Packit Service 4d2de5
// TrailCCC returns the CCC of the last rune in the decomposition.
Packit Service 4d2de5
// If there is no decomposition, TrailCCC equals CCC.
Packit Service 4d2de5
func (p Properties) TrailCCC() uint8 {
Packit Service 4d2de5
	return ccc[p.tccc]
Packit Service 4d2de5
}
Packit Service 4d2de5
Packit Service 4d2de5
func buildRecompMap() {
Packit Service 4d2de5
	recompMap = make(map[uint32]rune, len(recompMapPacked)/8)
Packit Service 4d2de5
	var buf [8]byte
Packit Service 4d2de5
	for i := 0; i < len(recompMapPacked); i += 8 {
Packit Service 4d2de5
		copy(buf[:], recompMapPacked[i:i+8])
Packit Service 4d2de5
		key := binary.BigEndian.Uint32(buf[:4])
Packit Service 4d2de5
		val := binary.BigEndian.Uint32(buf[4:])
Packit Service 4d2de5
		recompMap[key] = rune(val)
Packit Service 4d2de5
	}
Packit Service 4d2de5
}
Packit Service 4d2de5
Packit Service 4d2de5
// Recomposition
Packit Service 4d2de5
// We use 32-bit keys instead of 64-bit for the two codepoint keys.
Packit Service 4d2de5
// This clips off the bits of three entries, but we know this will not
Packit Service 4d2de5
// result in a collision. In the unlikely event that changes to
Packit Service 4d2de5
// UnicodeData.txt introduce collisions, the compiler will catch it.
Packit Service 4d2de5
// Note that the recomposition map for NFC and NFKC are identical.
Packit Service 4d2de5
Packit Service 4d2de5
// combine returns the combined rune or 0 if it doesn't exist.
Packit Service 4d2de5
//
Packit Service 4d2de5
// The caller is responsible for calling
Packit Service 4d2de5
// recompMapOnce.Do(buildRecompMap) sometime before this is called.
Packit Service 4d2de5
func combine(a, b rune) rune {
Packit Service 4d2de5
	key := uint32(uint16(a))<<16 + uint32(uint16(b))
Packit Service 4d2de5
	if recompMap == nil {
Packit Service 4d2de5
		panic("caller error") // see func comment
Packit Service 4d2de5
	}
Packit Service 4d2de5
	return recompMap[key]
Packit Service 4d2de5
}
Packit Service 4d2de5
Packit Service 4d2de5
func lookupInfoNFC(b input, i int) Properties {
Packit Service 4d2de5
	v, sz := b.charinfoNFC(i)
Packit Service 4d2de5
	return compInfo(v, sz)
Packit Service 4d2de5
}
Packit Service 4d2de5
Packit Service 4d2de5
func lookupInfoNFKC(b input, i int) Properties {
Packit Service 4d2de5
	v, sz := b.charinfoNFKC(i)
Packit Service 4d2de5
	return compInfo(v, sz)
Packit Service 4d2de5
}
Packit Service 4d2de5
Packit Service 4d2de5
// Properties returns properties for the first rune in s.
Packit Service 4d2de5
func (f Form) Properties(s []byte) Properties {
Packit Service 4d2de5
	if f == NFC || f == NFD {
Packit Service 4d2de5
		return compInfo(nfcData.lookup(s))
Packit Service 4d2de5
	}
Packit Service 4d2de5
	return compInfo(nfkcData.lookup(s))
Packit Service 4d2de5
}
Packit Service 4d2de5
Packit Service 4d2de5
// PropertiesString returns properties for the first rune in s.
Packit Service 4d2de5
func (f Form) PropertiesString(s string) Properties {
Packit Service 4d2de5
	if f == NFC || f == NFD {
Packit Service 4d2de5
		return compInfo(nfcData.lookupString(s))
Packit Service 4d2de5
	}
Packit Service 4d2de5
	return compInfo(nfkcData.lookupString(s))
Packit Service 4d2de5
}
Packit Service 4d2de5
Packit Service 4d2de5
// compInfo converts the information contained in v and sz
Packit Service 4d2de5
// to a Properties.  See the comment at the top of the file
Packit Service 4d2de5
// for more information on the format.
Packit Service 4d2de5
func compInfo(v uint16, sz int) Properties {
Packit Service 4d2de5
	if v == 0 {
Packit Service 4d2de5
		return Properties{size: uint8(sz)}
Packit Service 4d2de5
	} else if v >= 0x8000 {
Packit Service 4d2de5
		p := Properties{
Packit Service 4d2de5
			size:  uint8(sz),
Packit Service 4d2de5
			ccc:   uint8(v),
Packit Service 4d2de5
			tccc:  uint8(v),
Packit Service 4d2de5
			flags: qcInfo(v >> 8),
Packit Service 4d2de5
		}
Packit Service 4d2de5
		if p.ccc > 0 || p.combinesBackward() {
Packit Service 4d2de5
			p.nLead = uint8(p.flags & 0x3)
Packit Service 4d2de5
		}
Packit Service 4d2de5
		return p
Packit Service 4d2de5
	}
Packit Service 4d2de5
	// has decomposition
Packit Service 4d2de5
	h := decomps[v]
Packit Service 4d2de5
	f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4
Packit Service 4d2de5
	p := Properties{size: uint8(sz), flags: f, index: v}
Packit Service 4d2de5
	if v >= firstCCC {
Packit Service 4d2de5
		v += uint16(h&headerLenMask) + 1
Packit Service 4d2de5
		c := decomps[v]
Packit Service 4d2de5
		p.tccc = c >> 2
Packit Service 4d2de5
		p.flags |= qcInfo(c & 0x3)
Packit Service 4d2de5
		if v >= firstLeadingCCC {
Packit Service 4d2de5
			p.nLead = c & 0x3
Packit Service 4d2de5
			if v >= firstStarterWithNLead {
Packit Service 4d2de5
				// We were tricked. Remove the decomposition.
Packit Service 4d2de5
				p.flags &= 0x03
Packit Service 4d2de5
				p.index = 0
Packit Service 4d2de5
				return p
Packit Service 4d2de5
			}
Packit Service 4d2de5
			p.ccc = decomps[v+1]
Packit Service 4d2de5
		}
Packit Service 4d2de5
	}
Packit Service 4d2de5
	return p
Packit Service 4d2de5
}