Blame src/util/support/utf8_conv.c

Packit Service 99d1c0
/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
Packit Service 99d1c0
/* util/support/utf8_conv.c */
Packit Service 99d1c0
/*
Packit Service 99d1c0
 * Copyright 2008, 2017 by the Massachusetts Institute of Technology.
Packit Service 99d1c0
 * All Rights Reserved.
Packit Service 99d1c0
 *
Packit Service 99d1c0
 * Export of this software from the United States of America may
Packit Service 99d1c0
 *   require a specific license from the United States Government.
Packit Service 99d1c0
 *   It is the responsibility of any person or organization contemplating
Packit Service 99d1c0
 *   export to obtain such a license before exporting.
Packit Service 99d1c0
 *
Packit Service 99d1c0
 * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
Packit Service 99d1c0
 * distribute this software and its documentation for any purpose and
Packit Service 99d1c0
 * without fee is hereby granted, provided that the above copyright
Packit Service 99d1c0
 * notice appear in all copies and that both that copyright notice and
Packit Service 99d1c0
 * this permission notice appear in supporting documentation, and that
Packit Service 99d1c0
 * the name of M.I.T. not be used in advertising or publicity pertaining
Packit Service 99d1c0
 * to distribution of the software without specific, written prior
Packit Service 99d1c0
 * permission.  Furthermore if you modify this software you must label
Packit Service 99d1c0
 * your software as modified software and not distribute it in such a
Packit Service 99d1c0
 * fashion that it might be confused with the original M.I.T. software.
Packit Service 99d1c0
 * M.I.T. makes no representations about the suitability of
Packit Service 99d1c0
 * this software for any purpose.  It is provided "as is" without express
Packit Service 99d1c0
 * or implied warranty.
Packit Service 99d1c0
 */
Packit Service 99d1c0
/*
Packit Service 99d1c0
 * Copyright 1998-2008 The OpenLDAP Foundation.
Packit Service 99d1c0
 * All rights reserved.
Packit Service 99d1c0
 *
Packit Service 99d1c0
 * Redistribution and use in source and binary forms, with or without
Packit Service 99d1c0
 * modification, are permitted only as authorized by the OpenLDAP
Packit Service 99d1c0
 * Public License.
Packit Service 99d1c0
 *
Packit Service 99d1c0
 * A copy of this license is available in the file LICENSE in the
Packit Service 99d1c0
 * top-level directory of the distribution or, alternatively, at
Packit Service 99d1c0
 * <https://www.OpenLDAP.org/license.html>.
Packit Service 99d1c0
 */
Packit Service 99d1c0
/* Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
Packit Service 99d1c0
 *
Packit Service 99d1c0
 * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
Packit Service 99d1c0
 * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
Packit Service 99d1c0
 * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
Packit Service 99d1c0
 * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
Packit Service 99d1c0
 * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
Packit Service 99d1c0
 * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
Packit Service 99d1c0
 * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
Packit Service 99d1c0
 * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
Packit Service 99d1c0
 */
Packit Service 99d1c0
Packit Service 99d1c0
/* This work is based on OpenLDAP Software <https://www.openldap.org/>. */
Packit Service 99d1c0
Packit Service 99d1c0
/*
 * These routines convert between UTF-16 and UTF-8.  UTF-16 encodes a Unicode
 * character in either two or four bytes.  Characters in the Basic Multilingual
 * Plane (hex 0..D7FF and E000..FFFF) are encoded as-is in two bytes.
 * Characters in the Supplementary Planes (10000..10FFFF) are split into a high
 * surrogate and a low surrogate, each containing ten bits of the character
 * value, and encoded in four bytes.
 */
Packit Service 99d1c0
Packit Service 99d1c0
#include "k5-platform.h"
Packit Service 99d1c0
#include "k5-utf8.h"
Packit Service 99d1c0
#include "k5-buf.h"
Packit Service 99d1c0
#include "k5-input.h"
Packit Service 99d1c0
#include "supp-int.h"
Packit Service 99d1c0
Packit Service 99d1c0
/* Payload-bit masks for the first byte of a UTF-8 sequence, indexed by the
 * sequence length in bytes.  Index 0 is a placeholder and carries no bits. */
static const unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };

/* A high surrogate is ten bits of character data ORed with 0xD800. */
#define IS_HIGH_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDBFF)

/* A low surrogate is ten bits of character data ORed with 0xDC00. */
#define IS_LOW_SURROGATE(c) ((c) >= 0xDC00 && (c) <= 0xDFFF)

/* A valid Unicode code point is in the range 0..10FFFF and is not a surrogate
 * value. */
#define IS_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDFFF)
#define IS_VALID_UNICODE(c) ((c) <= 0x10FFFF && !IS_SURROGATE(c))

/* A Basic Multilingual Plane character is in the range 0..FFFF and is not a
 * surrogate value. */
#define IS_BMP(c) ((c) <= 0xFFFF && !IS_SURROGATE(c))

/*
 * Characters in the Supplementary Planes have a base value subtracted from
 * their code points to form a 20-bit value; ten bits go in each surrogate.
 * HIGH_SURROGATE and LOW_SURROGATE split a code point (which must be at least
 * BASE) into its pair; COMPOSE reverses the split.  Each macro may evaluate
 * its arguments more than once, so arguments must be free of side effects.
 */
#define BASE 0x10000
#define HIGH_SURROGATE(c) (0xD800 | (((c) - BASE) >> 10))
#define LOW_SURROGATE(c) (0xDC00 | (((c) - BASE) & 0x3FF))
#define COMPOSE(c1, c2) (BASE + ((((c1) & 0x3FF) << 10) | ((c2) & 0x3FF)))
Packit Service 99d1c0
Packit Service 99d1c0
int
Packit Service 99d1c0
k5_utf8_to_utf16le(const char *utf8, uint8_t **utf16_out, size_t *nbytes_out)
Packit Service 99d1c0
{
Packit Service 99d1c0
    struct k5buf buf;
Packit Service 99d1c0
    krb5_ucs4 ch;
Packit Service 99d1c0
    size_t chlen, i;
Packit Service 99d1c0
Packit Service 99d1c0
    *utf16_out = NULL;
Packit Service 99d1c0
    *nbytes_out = 0;
Packit Service 99d1c0
Packit Service 99d1c0
    /* UTF-16 conversion is used for RC4 string-to-key, so treat this data as
Packit Service 99d1c0
     * sensitive. */
Packit Service 99d1c0
    k5_buf_init_dynamic_zap(&buf;;
Packit Service 99d1c0
Packit Service 99d1c0
    /* Examine next UTF-8 character. */
Packit Service 99d1c0
    while (*utf8 != '\0') {
Packit Service 99d1c0
        /* Get UTF-8 sequence length from first byte. */
Packit Service 99d1c0
        chlen = KRB5_UTF8_CHARLEN2(utf8, chlen);
Packit Service 99d1c0
        if (chlen == 0)
Packit Service 99d1c0
            goto invalid;
Packit Service 99d1c0
Packit Service 99d1c0
        /* First byte minus length tag */
Packit Service 99d1c0
        ch = (krb5_ucs4)(utf8[0] & mask[chlen]);
Packit Service 99d1c0
Packit Service 99d1c0
        for (i = 1; i < chlen; i++) {
Packit Service 99d1c0
            /* Subsequent bytes must start with 10. */
Packit Service 99d1c0
            if ((utf8[i] & 0xc0) != 0x80)
Packit Service 99d1c0
                goto invalid;
Packit Service 99d1c0
Packit Service 99d1c0
            /* 6 bits of data in each subsequent byte */
Packit Service 99d1c0
            ch <<= 6;
Packit Service 99d1c0
            ch |= (krb5_ucs4)(utf8[i] & 0x3f);
Packit Service 99d1c0
        }
Packit Service 99d1c0
        if (!IS_VALID_UNICODE(ch))
Packit Service 99d1c0
            goto invalid;
Packit Service 99d1c0
Packit Service 99d1c0
        /* Characters in the basic multilingual plane are encoded using two
Packit Service 99d1c0
         * bytes; other characters are encoded using four bytes. */
Packit Service 99d1c0
        if (IS_BMP(ch)) {
Packit Service 99d1c0
            k5_buf_add_uint16_le(&buf, ch);
Packit Service 99d1c0
        } else {
Packit Service 99d1c0
            /* 0x10000 is subtracted from ch; then the high ten bits plus
Packit Service 99d1c0
             * 0xD800 and the low ten bits plus 0xDC00 are the surrogates. */
Packit Service 99d1c0
            k5_buf_add_uint16_le(&buf, HIGH_SURROGATE(ch));
Packit Service 99d1c0
            k5_buf_add_uint16_le(&buf, LOW_SURROGATE(ch));
Packit Service 99d1c0
        }
Packit Service 99d1c0
Packit Service 99d1c0
        /* Move to next UTF-8 character. */
Packit Service 99d1c0
        utf8 += chlen;
Packit Service 99d1c0
    }
Packit Service 99d1c0
Packit Service 99d1c0
    *utf16_out = buf.data;
Packit Service 99d1c0
    *nbytes_out = buf.len;
Packit Service 99d1c0
    return 0;
Packit Service 99d1c0
Packit Service 99d1c0
invalid:
Packit Service 99d1c0
    k5_buf_free(&buf;;
Packit Service 99d1c0
    return EINVAL;
Packit Service 99d1c0
}
Packit Service 99d1c0
Packit Service 99d1c0
int
Packit Service 99d1c0
k5_utf16le_to_utf8(const uint8_t *utf16bytes, size_t nbytes, char **utf8_out)
Packit Service 99d1c0
{
Packit Service 99d1c0
    struct k5buf buf;
Packit Service 99d1c0
    struct k5input in;
Packit Service 99d1c0
    uint16_t ch1, ch2;
Packit Service 99d1c0
    krb5_ucs4 ch;
Packit Service 99d1c0
    size_t chlen;
Packit Service 99d1c0
    void *p;
Packit Service 99d1c0
Packit Service 99d1c0
    *utf8_out = NULL;
Packit Service 99d1c0
Packit Service 99d1c0
    if (nbytes % 2 != 0)
Packit Service 99d1c0
        return EINVAL;
Packit Service 99d1c0
Packit Service 99d1c0
    k5_buf_init_dynamic(&buf;;
Packit Service 99d1c0
    k5_input_init(&in, utf16bytes, nbytes);
Packit Service 99d1c0
    while (!in.status && in.len > 0) {
Packit Service 99d1c0
        /* Get the next character or high surrogate.  A low surrogate without a
Packit Service 99d1c0
         * preceding high surrogate is invalid. */
Packit Service 99d1c0
        ch1 = k5_input_get_uint16_le(&in);
Packit Service 99d1c0
        if (IS_LOW_SURROGATE(ch1))
Packit Service 99d1c0
            goto invalid;
Packit Service 99d1c0
        if (IS_HIGH_SURROGATE(ch1)) {
Packit Service 99d1c0
            /* Get the low surrogate and combine the pair. */
Packit Service 99d1c0
            ch2 = k5_input_get_uint16_le(&in);
Packit Service 99d1c0
            if (!IS_LOW_SURROGATE(ch2))
Packit Service 99d1c0
                goto invalid;
Packit Service 99d1c0
            ch = COMPOSE(ch1, ch2);
Packit Service 99d1c0
        } else {
Packit Service 99d1c0
            ch = ch1;
Packit Service 99d1c0
        }
Packit Service 99d1c0
Packit Service 99d1c0
        chlen = krb5int_ucs4_to_utf8(ch, NULL);
Packit Service 99d1c0
        p = k5_buf_get_space(&buf, chlen);
Packit Service 99d1c0
        if (p == NULL)
Packit Service 99d1c0
            return ENOMEM;
Packit Service 99d1c0
        (void)krb5int_ucs4_to_utf8(ch, p);
Packit Service 99d1c0
    }
Packit Service 99d1c0
Packit Service 99d1c0
    if (in.status)
Packit Service 99d1c0
        goto invalid;
Packit Service 99d1c0
Packit Service 99d1c0
    *utf8_out = buf.data;
Packit Service 99d1c0
    return 0;
Packit Service 99d1c0
Packit Service 99d1c0
invalid:
Packit Service 99d1c0
    k5_buf_free(&buf;;
Packit Service 99d1c0
    return EINVAL;
Packit Service 99d1c0
}