Blob Blame History Raw
/*
    Copyright (C) 2011  ABRT team
    Copyright (C) 2011  RedHat Inc

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "internal_libreport.h"

/*
 * Checks that src is well-formed UTF-8 and free of unwanted control chars.
 *
 * Every offending byte is replaced by its value written as two uppercase
 * hex digits in square brackets, e.g. a stray 0x80 byte becomes "[80]".
 * A byte is offending when:
 *  - it is below 32 and its bit (1 << byte) is set in
 *    control_chars_to_sanitize;
 *  - it is a continuation byte (10xxxxxx) where a lead byte is expected;
 *  - it is a lead byte which is not followed by as many continuation bytes
 *    as it announces;
 *  - it leads an overlong sequence, i.e. one which encodes a value that
 *    fits into a shorter sequence (C0 80 for NUL, E0 82 80 for U+0080...).
 * Only the lead byte of a rejected sequence is escaped at once; the bytes
 * which follow it are then examined on their own.
 *
 * The 5- and 6-byte forms, surrogates and values above U+10FFFF are still
 * accepted, exactly as before.
 *
 * src must be a NUL-terminated string; it is never modified.
 *
 * Returns NULL when nothing had to be replaced (the usual case). Otherwise
 * returns a new NUL-terminated string obtained from xstrndup()/xrealloc()
 * which holds the sanitized copy; the caller owns it. The conversion is
 * also reported through log_info().
 */
char *sanitize_utf8(const char *src, uint32_t control_chars_to_sanitize)
{
    const char *initial_src = src;
    char *sanitized = NULL;
    unsigned sanitized_pos = 0;

    while (*src)
    {
        int bytes = 0;

        unsigned c = (unsigned char) *src;
        if (c <= 0x7f)
        {
            if (c < 32 && (((uint32_t)1 << c) & control_chars_to_sanitize))
                goto bad_byte;
            bytes = 1;
            goto good_byte;
        }

        /* Unicode -> utf8: */
        /* 80-7FF -> 110yyyxx 10xxxxxx */
        /* 800-FFFF -> 1110yyyy 10yyyyxx 10xxxxxx */
        /* 10000-1FFFFF -> 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx */
        /* 200000-3FFFFFF -> 111110tt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
        /* 4000000-FFFFFFFF -> 111111tt 10tttttt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */

        /* Count the leading 1 bits of the lead byte (capped at 6):
         * this is the length of the sequence it announces.
         */
        do {
            c <<= 1;
            bytes++;
        } while ((c & 0x80) && bytes < 6);
        if (bytes == 1)
        {
            /* A bare "continuation" byte. Say, 80 */
            goto bad_byte;
        }

        /* Drop the length prefix: what remains are the payload bits
         * of the lead byte.
         */
        c = (uint8_t)(c) >> bytes;
        {
            const char *pp = src;
            int cnt = bytes;
            while (--cnt)
            {
                /* A terminating NUL is not a continuation byte either,
                 * so this never reads past the end of the string.
                 */
                unsigned ch = (unsigned char) *++pp;
                if ((ch & 0xc0) != 0x80) /* Missing "continuation" byte. Example: e0 80 */
                {
                    goto bad_byte;
                }
                c = (c << 6) + (ch & 0x3f);
            }
        }

        /* Reject overlong encodings: the decoded value must be too big
         * for any shorter sequence.
         * Example: 11000000 10000000 converts to NUL,
         * 11110000 10000000 10000100 10000000 converts to 0x100
         * (correct encoding: 11000100 10000000).
         */
        {
            /* Smallest value which needs a sequence of [index] bytes */
            static const unsigned min_code_point[7] = {
                0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000
            };
            if (c < min_code_point[bytes]) /* bytes is 2..6 here */
            {
                goto bad_byte;
            }
        }

 good_byte:
        if (sanitized)
        {
            /* Something was already replaced: append this character
             * to the copy. Grow the buffer once for all of its bytes.
             */
            sanitized = (char*) xrealloc(sanitized, sanitized_pos + bytes + 1);
            while (--bytes >= 0)
            {
                sanitized[sanitized_pos++] = *src++;
            }
            sanitized[sanitized_pos] = '\0';
        }
        else
        {
            /* Nothing bad seen so far: no copy exists yet, just skip */
            src += bytes;
        }
        continue;

 bad_byte:
        if (!sanitized)
        {
            /* First bad byte: start the copy with the good prefix */
            sanitized_pos = src - initial_src;
            sanitized = xstrndup(initial_src, sanitized_pos);
        }
        /* Room for "[XX]" and the terminating NUL */
        sanitized = (char*) xrealloc(sanitized, sanitized_pos + 5);
        sanitized[sanitized_pos++] = '[';
        c = (unsigned char) *src++;
        sanitized[sanitized_pos++] = "0123456789ABCDEF"[c >> 4];
        sanitized[sanitized_pos++] = "0123456789ABCDEF"[c & 0xf];
        sanitized[sanitized_pos++] = ']';
        sanitized[sanitized_pos] = '\0';
    }

    if (sanitized)
        log_info("note: bad utf8, converted '%s' -> '%s'", initial_src, sanitized);

    return sanitized; /* usually NULL: the whole string is ok */
}