Blame lib/unistring/uninorm.in.h

Packit aea12f
/* Normalization forms (composition and decomposition) of Unicode strings.
Packit Service 991b93
   Copyright (C) 2001-2002, 2009-2020 Free Software Foundation, Inc.
Packit aea12f
   Written by Bruno Haible <bruno@clisp.org>, 2009.
Packit aea12f
Packit aea12f
   This program is free software: you can redistribute it and/or
Packit aea12f
   modify it under the terms of either:
Packit aea12f
Packit aea12f
     * the GNU Lesser General Public License as published by the Free
Packit aea12f
       Software Foundation; either version 3 of the License, or (at your
Packit aea12f
       option) any later version.
Packit aea12f
Packit aea12f
   or
Packit aea12f
Packit aea12f
     * the GNU General Public License as published by the Free
Packit aea12f
       Software Foundation; either version 2 of the License, or (at your
Packit aea12f
       option) any later version.
Packit aea12f
Packit aea12f
   or both in parallel, as here.
Packit aea12f
   This program is distributed in the hope that it will be useful,
Packit aea12f
   but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit aea12f
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit aea12f
   Lesser General Public License for more details.
Packit aea12f
Packit aea12f
   You should have received a copy of the GNU Lesser General Public License
Packit aea12f
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
Packit aea12f
Packit aea12f
#ifndef _UNINORM_H
Packit aea12f
#define _UNINORM_H
Packit aea12f
Packit aea12f
/* Get size_t.  */
Packit aea12f
#include <stddef.h>
Packit aea12f
Packit aea12f
#include "unitypes.h"
Packit aea12f
Packit aea12f
Packit aea12f
#ifdef __cplusplus
Packit aea12f
extern "C" {
Packit aea12f
#endif
Packit aea12f
Packit aea12f
Packit aea12f
/* Conventions:
Packit aea12f
Packit aea12f
   All functions prefixed with u8_ operate on UTF-8 encoded strings.
Packit aea12f
   Their unit is an uint8_t (1 byte).
Packit aea12f
Packit aea12f
   All functions prefixed with u16_ operate on UTF-16 encoded strings.
Packit aea12f
   Their unit is an uint16_t (a 2-byte word).
Packit aea12f
Packit aea12f
   All functions prefixed with u32_ operate on UCS-4 encoded strings.
Packit aea12f
   Their unit is an uint32_t (a 4-byte word).
Packit aea12f
Packit aea12f
   All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly
Packit aea12f
   n units.
Packit aea12f
Packit aea12f
   Functions returning a string result take a (resultbuf, lengthp) argument
Packit aea12f
   pair.  If resultbuf is not NULL and the result fits into *lengthp units,
Packit aea12f
   it is put in resultbuf, and resultbuf is returned.  Otherwise, a freshly
Packit aea12f
   allocated string is returned.  In both cases, *lengthp is set to the
Packit aea12f
   length (number of units) of the returned string.  In case of error,
Packit aea12f
   NULL is returned and errno is set.  */
Packit aea12f
Packit aea12f
Packit aea12f
/* Decomposition tags.  Each enumerator names one kind of decomposition
   mapping; the name in angle brackets in its comment is the tag associated
   with it (UC_DECOMP_CANONICAL has none).
   No enumerator has an explicit value, so they are numbered consecutively
   from UC_DECOMP_CANONICAL == 0 up to UC_DECOMP_COMPAT == 16.
   NOTE(review): presumably these are the values that uc_decomposition stores
   in *DECOMP_TAG -- confirm against the implementation.  */
enum
Packit aea12f
{
Packit aea12f
  UC_DECOMP_CANONICAL,/*            Canonical decomposition.                  */
Packit aea12f
  UC_DECOMP_FONT,    /*   <font>    A font variant (e.g. a blackletter form). */
Packit aea12f
  UC_DECOMP_NOBREAK, /* <noBreak>   A no-break version of a space or hyphen.  */
Packit aea12f
  UC_DECOMP_INITIAL, /* <initial>   An initial presentation form (Arabic).    */
Packit aea12f
  UC_DECOMP_MEDIAL,  /*  <medial>   A medial presentation form (Arabic).      */
Packit aea12f
  UC_DECOMP_FINAL,   /*  <final>    A final presentation form (Arabic).       */
Packit aea12f
  UC_DECOMP_ISOLATED,/* <isolated>  An isolated presentation form (Arabic).   */
Packit aea12f
  UC_DECOMP_CIRCLE,  /*  <circle>   An encircled form.                        */
Packit aea12f
  UC_DECOMP_SUPER,   /*  <super>    A superscript form.                       */
Packit aea12f
  UC_DECOMP_SUB,     /*   <sub>     A subscript form.                         */
Packit aea12f
  UC_DECOMP_VERTICAL,/* <vertical>  A vertical layout presentation form.      */
Packit aea12f
  UC_DECOMP_WIDE,    /*   <wide>    A wide (or zenkaku) compatibility character. */
Packit aea12f
  UC_DECOMP_NARROW,  /*  <narrow>   A narrow (or hankaku) compatibility character. */
Packit aea12f
  UC_DECOMP_SMALL,   /*  <small>    A small variant form (CNS compatibility). */
Packit aea12f
  UC_DECOMP_SQUARE,  /*  <square>   A CJK squared font variant.               */
Packit aea12f
  UC_DECOMP_FRACTION,/* <fraction>  A vulgar fraction form.                   */
Packit aea12f
  UC_DECOMP_COMPAT   /*  <compat>   Otherwise unspecified compatibility character. */
Packit aea12f
};
Packit aea12f
Packit aea12f
/* Maximum size of decomposition of a single Unicode character, counted in
   ucs4_t elements.  The arrays passed as DECOMPOSITION to uc_decomposition
   and uc_canonical_decomposition must have at least this many elements.  */
Packit aea12f
#define UC_DECOMPOSITION_MAX_LENGTH 32
Packit aea12f
Packit aea12f
/* Return the character decomposition mapping of a Unicode character.
Packit aea12f
   DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
Packit aea12f
   ucs4_t elements.
Packit aea12f
   When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are
Packit aea12f
   filled and N is returned.  Otherwise -1 is returned.
   UC is the character to look up; DECOMP_TAG and DECOMPOSITION are output
   parameters.  This contract does not say what they hold when -1 is
   returned, so check the return value before reading them.  */
Packit aea12f
extern int
Packit aea12f
       uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition);
Packit aea12f
Packit aea12f
/* Return the canonical character decomposition mapping of a Unicode character.
Packit aea12f
   DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
Packit aea12f
   ucs4_t elements.
Packit aea12f
   When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is
Packit aea12f
   returned.  Otherwise -1 is returned.
   UC is the character to look up; DECOMPOSITION is an output parameter.
   This contract does not say what it holds when -1 is returned, so check
   the return value before reading it.  */
Packit aea12f
extern int
Packit aea12f
       uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition);
Packit aea12f
Packit aea12f
Packit aea12f
/* Attempt to combine the Unicode characters uc1, uc2.
Packit aea12f
   uc1 is known to have canonical combining class 0.
Packit aea12f
   Return the combination of uc1 and uc2, if it exists.
Packit aea12f
   Return 0 otherwise.
Packit aea12f
   Not all decompositions can be recombined using this function.  See the
Packit aea12f
   Unicode file CompositionExclusions.txt for details.
   The combining class of uc1 is a precondition on the caller; this header
   does not specify the result when it is violated.
   NOTE(review): _UC_ATTRIBUTE_CONST is not defined in this file (presumably
   it comes from "unitypes.h" and marks the function as free of side
   effects) -- confirm.  */
Packit aea12f
extern ucs4_t
Packit aea12f
       uc_composition (ucs4_t uc1, ucs4_t uc2)
Packit aea12f
       _UC_ATTRIBUTE_CONST;
Packit aea12f
Packit aea12f
Packit aea12f
/* An object of type uninorm_t denotes a Unicode normalization form.
   struct unicode_normalization_form is only declared here, not defined, so
   it is an incomplete type in this header.  uninorm_t is a pointer to a
   const object of that type.  */
Packit aea12f
struct unicode_normalization_form;
Packit aea12f
typedef const struct unicode_normalization_form *uninorm_t;
Packit aea12f
Packit aea12f
/* UNINORM_NFD: Normalization form D: canonical decomposition.
   The macro expands to the address of the object uninorm_nfd, so it can be
   used wherever a uninorm_t is expected.  */
Packit aea12f
extern const struct unicode_normalization_form uninorm_nfd;
Packit aea12f
#define UNINORM_NFD (&uninorm_nfd)
Packit aea12f
Packit aea12f
/* UNINORM_NFC: Normalization form C: canonical decomposition, then
Packit aea12f
   canonical composition.
   The macro expands to the address of the object uninorm_nfc, so it can be
   used wherever a uninorm_t is expected.  */
Packit aea12f
extern const struct unicode_normalization_form uninorm_nfc;
Packit aea12f
#define UNINORM_NFC (&uninorm_nfc)
Packit aea12f
Packit aea12f
/* UNINORM_NFKD: Normalization form KD: compatibility decomposition.
   The macro expands to the address of the object uninorm_nfkd, so it can be
   used wherever a uninorm_t is expected.  */
Packit aea12f
extern const struct unicode_normalization_form uninorm_nfkd;
Packit aea12f
#define UNINORM_NFKD (&uninorm_nfkd)
Packit aea12f
Packit aea12f
/* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then
Packit aea12f
   canonical composition.
   The macro expands to the address of the object uninorm_nfkc, so it can be
   used wherever a uninorm_t is expected.  */
Packit aea12f
extern const struct unicode_normalization_form uninorm_nfkc;
Packit aea12f
#define UNINORM_NFKC (&uninorm_nfkc)
Packit aea12f
Packit aea12f
/* Test whether a normalization form does compatibility decomposition.
   Expands to an expression that reads the object NF points to through a
   const unsigned int pointer and yields its bit 0, i.e. the value 0 or 1.
   NF is evaluated exactly once.
   NOTE(review): this assumes the normalization form object starts with an
   unsigned int whose bit 0 is the compatibility-decomposition flag; the
   struct definition is not visible in this header -- confirm.  */
Packit aea12f
#define uninorm_is_compat_decomposing(nf) \
Packit aea12f
  ((* (const unsigned int *) (nf) >> 0) & 1)
Packit aea12f
Packit aea12f
/* Test whether a normalization form includes canonical composition.
   Expands to an expression that reads the object NF points to through a
   const unsigned int pointer and yields its bit 1, i.e. the value 0 or 1.
   NF is evaluated exactly once.
   NOTE(review): this assumes the normalization form object starts with an
   unsigned int whose bit 1 is the composition flag; the struct definition
   is not visible in this header -- confirm.  */
Packit aea12f
#define uninorm_is_composing(nf) \
Packit aea12f
  ((* (const unsigned int *) (nf) >> 1) & 1)
Packit aea12f
Packit aea12f
/* Return the decomposing variant of a normalization form.
Packit aea12f
   This maps NFC,NFD -> NFD and NFKC,NFKD -> NFKD.
   Consequently a form that is already decomposing (NFD, NFKD) is mapped to
   itself.
   NOTE(review): _UC_ATTRIBUTE_PURE is not defined in this file (presumably
   it comes from "unitypes.h" and marks the function as free of side
   effects) -- confirm.  */
Packit aea12f
extern uninorm_t
Packit aea12f
       uninorm_decomposing_form (uninorm_t nf)
Packit aea12f
       _UC_ATTRIBUTE_PURE;
Packit aea12f
Packit aea12f
Packit aea12f
/* Return the specified normalization form of a string.
   NF selects the normalization form and (S, N) is the input string of N
   units.  (RESULTBUF, LENGTHP) is a result argument pair as described under
   "Conventions" in this header: if RESULTBUF is not NULL and the result fits
   into *LENGTHP units, the result is stored in RESULTBUF and RESULTBUF is
   returned; otherwise a freshly allocated string is returned.  In both cases
   *LENGTHP is set to the number of units of the returned string.  In case
   of error, NULL is returned and errno is set.
   The three variants differ only in the unit type (uint8_t, uint16_t,
   uint32_t) of the input and of the result.
   NOTE(review): _UC_RESTRICT is not defined in this file (presumably it
   expands to a restrict qualifier, so that RESULTBUF must not overlap the
   other arguments) -- confirm.  */
Packit aea12f
extern uint8_t *
Packit aea12f
       u8_normalize (uninorm_t nf, const uint8_t *s, size_t n,
Packit Service 991b93
                     uint8_t *_UC_RESTRICT resultbuf, size_t *lengthp);
Packit aea12f
extern uint16_t *
Packit aea12f
       u16_normalize (uninorm_t nf, const uint16_t *s, size_t n,
Packit Service 991b93
                      uint16_t *_UC_RESTRICT resultbuf, size_t *lengthp);
Packit aea12f
extern uint32_t *
Packit aea12f
       u32_normalize (uninorm_t nf, const uint32_t *s, size_t n,
Packit Service 991b93
                      uint32_t *_UC_RESTRICT resultbuf, size_t *lengthp);
Packit aea12f
Packit aea12f
Packit aea12f
/* Compare S1 and S2, ignoring differences in normalization.
Packit aea12f
   NF must be either UNINORM_NFD or UNINORM_NFKD.
Packit aea12f
   If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
Packit aea12f
   return 0.  Upon failure, return -1 with errno set.
   (S1, N1) and (S2, N2) are the two strings with their lengths in units.
   The comparison result is delivered only through *RESULTP; the return
   value is purely a success/failure status.  This contract does not say
   what *RESULTP holds after a failure.  */
Packit aea12f
extern int
Packit aea12f
       u8_normcmp (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
Packit aea12f
                   uninorm_t nf, int *resultp);
Packit aea12f
extern int
Packit aea12f
       u16_normcmp (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
Packit aea12f
                    uninorm_t nf, int *resultp);
Packit aea12f
extern int
Packit aea12f
       u32_normcmp (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
Packit aea12f
                    uninorm_t nf, int *resultp);
Packit aea12f
Packit aea12f
Packit aea12f
/* Converts the string S of length N to a NUL-terminated byte sequence, in such
Packit aea12f
   a way that comparing uN_normxfrm (S1) and uN_normxfrm (S2) with uN_cmp2() is
Packit aea12f
   equivalent to comparing S1 and S2 with uN_normcoll().
Packit aea12f
   NF must be either UNINORM_NFC or UNINORM_NFKC.
   Here uN_ stands for the u8_, u16_ or u32_ prefix of the respective
   variant.  Whatever the unit type of S, the result is a char sequence.
   NOTE(review): (RESULTBUF, LENGTHP) presumably follow the result argument
   pair rules described under "Conventions" in this header (use RESULTBUF if
   the result fits, otherwise return a freshly allocated sequence; NULL with
   errno set on error) -- confirm, including whether *LENGTHP counts the
   terminating NUL.  */
Packit aea12f
extern char *
Packit aea12f
       u8_normxfrm (const uint8_t *s, size_t n, uninorm_t nf,
Packit aea12f
                    char *resultbuf, size_t *lengthp);
Packit aea12f
extern char *
Packit aea12f
       u16_normxfrm (const uint16_t *s, size_t n, uninorm_t nf,
Packit aea12f
                     char *resultbuf, size_t *lengthp);
Packit aea12f
extern char *
Packit aea12f
       u32_normxfrm (const uint32_t *s, size_t n, uninorm_t nf,
Packit aea12f
                     char *resultbuf, size_t *lengthp);
Packit aea12f
Packit aea12f
Packit aea12f
/* Compare S1 and S2, ignoring differences in normalization, using the
Packit aea12f
   collation rules of the current locale.
Packit aea12f
   NF must be either UNINORM_NFC or UNINORM_NFKC.
Packit aea12f
   If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
Packit aea12f
   return 0.  Upon failure, return -1 with errno set.
   (S1, N1) and (S2, N2) are the two strings with their lengths in units.
   The comparison result is delivered only through *RESULTP; the return
   value is purely a success/failure status.  This contract does not say
   what *RESULTP holds after a failure.  */
Packit aea12f
extern int
Packit aea12f
       u8_normcoll (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
Packit aea12f
                    uninorm_t nf, int *resultp);
Packit aea12f
extern int
Packit aea12f
       u16_normcoll (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
Packit aea12f
                     uninorm_t nf, int *resultp);
Packit aea12f
extern int
Packit aea12f
       u32_normcoll (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
Packit aea12f
                     uninorm_t nf, int *resultp);
Packit aea12f
Packit aea12f
Packit aea12f
/* Normalization of a stream of Unicode characters.
Packit aea12f
Packit aea12f
   A "stream of Unicode characters" is essentially a function that accepts an
Packit aea12f
   ucs4_t argument repeatedly, optionally combined with a function that
Packit aea12f
   "flushes" the stream.  */
Packit aea12f
Packit aea12f
/* Data type of a stream of Unicode characters that normalizes its input
Packit aea12f
   according to a given normalization form and passes the normalized character
Packit aea12f
   sequence to the encapsulated stream of Unicode characters.
   The struct is only declared here, not defined, so it is an incomplete
   type in this header and is handled exclusively through pointers: obtained
   from uninorm_filter_create, passed to uninorm_filter_write and
   uninorm_filter_flush, and released with uninorm_filter_free.  */
Packit aea12f
struct uninorm_filter;
Packit aea12f
Packit aea12f
/* Create and return a normalization filter for Unicode characters.
Packit aea12f
   The pair (stream_func, stream_data) is the encapsulated stream.
Packit aea12f
   stream_func (stream_data, uc) receives the Unicode character uc
Packit aea12f
   and returns 0 if successful, or -1 with errno set upon failure.
Packit aea12f
   Return the new filter, or NULL with errno set upon failure.
   NF is the normalization form the filter is created for.  STREAM_DATA is
   not interpreted by this declaration's contract other than being passed
   as the first argument of STREAM_FUNC.  A filter returned by this function
   is closed and freed with uninorm_filter_free.  */
Packit aea12f
extern struct uninorm_filter *
Packit aea12f
       uninorm_filter_create (uninorm_t nf,
Packit aea12f
                              int (*stream_func) (void *stream_data, ucs4_t uc),
Packit aea12f
                              void *stream_data);
Packit aea12f
Packit aea12f
/* Stuff a Unicode character into a normalizing filter.
Packit aea12f
   Return 0 if successful, or -1 with errno set upon failure.
   FILTER is the filter to write to and UC the character to add.  The filter
   may hold data in a buffer (uninorm_filter_flush and uninorm_filter_free
   exist to bring buffered data to the encapsulated stream), so a successful
   return does not by itself mean the character has reached that stream.  */
Packit aea12f
extern int
Packit aea12f
       uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc);
Packit aea12f
Packit aea12f
/* Bring data buffered in the filter to its destination, the encapsulated
Packit aea12f
   stream.
Packit aea12f
   Return 0 if successful, or -1 with errno set upon failure.
Packit aea12f
   Note! If after calling this function, additional characters are written
Packit aea12f
   into the filter, the resulting character sequence in the encapsulated stream
Packit aea12f
   will not necessarily be normalized.
   Unlike uninorm_filter_free, this function does not close or free FILTER.  */
Packit aea12f
extern int
Packit aea12f
       uninorm_filter_flush (struct uninorm_filter *filter);
Packit aea12f
Packit aea12f
/* Bring data buffered in the filter to its destination, the encapsulated
Packit aea12f
   stream, then close and free the filter.
Packit aea12f
   Return 0 if successful, or -1 with errno set upon failure.
   Since the filter is freed, FILTER must not be used after this call.
   NOTE(review): this contract does not say whether the filter is still
   freed when -1 is returned -- confirm against the implementation.  */
Packit aea12f
extern int
Packit aea12f
       uninorm_filter_free (struct uninorm_filter *filter);
Packit aea12f
Packit aea12f
Packit aea12f
#ifdef __cplusplus
Packit aea12f
}
Packit aea12f
#endif
Packit aea12f
Packit aea12f
Packit aea12f
#endif /* _UNINORM_H */