/* * Copyright © 2014 Canonical Limited * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, see . * * Author: Ryan Lortie */ #include #include "gstrfuncs.h" #include #include #include #include struct mapping_entry { guint16 src; guint16 ascii; }; struct mapping_range { guint16 start; guint16 length; }; struct locale_entry { guint8 name_offset; guint8 item_id; }; #include "gtranslit-data.h" #define get_src_char(array, encoded, index) ((encoded & 0x8000) ? (array)[((encoded) & 0xfff) + index] : encoded) #define get_length(encoded) ((encoded & 0x8000) ? ((encoded & 0x7000) >> 12) : 1) #if G_BYTE_ORDER == G_BIG_ENDIAN #define get_ascii_item(array, encoded) ((encoded & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) (((char *) &(encoded)) + 1)) #else #define get_ascii_item(array, encoded) ((encoded & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) &(encoded)) #endif static const gchar * lookup_in_item (guint item_id, const gunichar *key, gint *result_len, gint *key_consumed); static gint compare_mapping_entry (gconstpointer user_data, gconstpointer data) { const struct mapping_entry *entry = data; const gunichar *key = user_data; gunichar src_0; G_STATIC_ASSERT(MAX_KEY_SIZE == 2); src_0 = get_src_char (src_table, entry->src, 0); if (key[0] > src_0) return 1; else if (key[0] < src_0) return -1; if (get_length (entry->src) > 1) { gunichar src_1; src_1 = get_src_char (src_table, entry->src, 1); if (key[1] > src_1) return 1; else if (key[1] < src_1) return -1; } else if (key[1]) return 1; return 0; } static const gchar * lookup_in_mapping (const struct mapping_entry *mapping, gint mapping_size, const gunichar *key, gint *result_len, gint *key_consumed) { const struct mapping_entry *hit; hit = bsearch (key, mapping, mapping_size, sizeof (struct mapping_entry), compare_mapping_entry); if (hit == NULL) return NULL; *key_consumed = get_length (hit->src); *result_len = get_length (hit->ascii); return get_ascii_item(ascii_table, hit->ascii); } static const gchar * lookup_in_chain (const guint8 *chain, const gunichar *key, gint *result_len, gint *key_consumed) { const gchar *result; while (*chain != 0xff) { result = lookup_in_item (*chain, key, result_len, key_consumed); if (result) return result; chain++; } return NULL; } static const gchar * lookup_in_item (guint item_id, const gunichar *key, gint *result_len, gint *key_consumed) { if (item_id & 0x80) { const guint8 *chain = chains_table + chain_starts[item_id & 0x7f]; return lookup_in_chain (chain, key, result_len, key_consumed); } else { const struct mapping_range *range = &mapping_ranges[item_id]; return lookup_in_mapping (mappings_table + range->start, range->length, key, result_len, key_consumed); } } static gint compare_locale_entry (gconstpointer user_data, gconstpointer data) { const struct locale_entry *entry = data; const gchar *key = user_data; return strcmp (key, &locale_names[entry->name_offset]); } static gboolean lookup_item_id_for_one_locale (const gchar *key, guint *item_id) { const struct locale_entry *hit; hit = bsearch (key, locale_index, G_N_ELEMENTS (locale_index), sizeof (struct locale_entry), compare_locale_entry); if (hit == NULL) return FALSE; *item_id = hit->item_id; return TRUE; } static guint lookup_item_id_for_locale (const gchar *locale) { gchar key[MAX_LOCALE_NAME + 1]; const gchar *language; guint language_len; const gchar *territory = NULL; guint territory_len = 0; const gchar *modifier = NULL; guint modifier_len = 0; const gchar *next_char; guint id; /* As per POSIX, a valid locale looks like: * * language[_territory][.codeset][@modifier] */ language = locale; language_len = strcspn (language, "_.@"); next_char = language + language_len; if (*next_char == '_') { territory = next_char; territory_len = strcspn (territory + 1, "_.@") + 1; next_char = territory + territory_len; } if (*next_char == '.') { const gchar *codeset; guint codeset_len; codeset = next_char; codeset_len = strcspn (codeset + 1, "_.@") + 1; next_char = codeset + codeset_len; } if (*next_char == '@') { modifier = next_char; modifier_len = strcspn (modifier + 1, "_.@") + 1; next_char = modifier + modifier_len; } /* What madness is this? */ if (language_len == 0 || *next_char) return default_item_id; /* We are not interested in codeset. * * For this locale: * * aa_BB@cc * * try in this order: * * Note: we have no locales of the form aa_BB@cc in the database. * * 1. aa@cc * 2. aa_BB * 3. aa */ /* 1. */ if (modifier_len && language_len + modifier_len <= MAX_LOCALE_NAME) { memcpy (key, language, language_len); memcpy (key + language_len, modifier, modifier_len); key[language_len + modifier_len] = '\0'; if (lookup_item_id_for_one_locale (key, &id)) return id; } /* 2. */ if (territory_len && language_len + territory_len <= MAX_LOCALE_NAME) { memcpy (key, language, language_len); memcpy (key + language_len, territory, territory_len); key[language_len + territory_len] = '\0'; if (lookup_item_id_for_one_locale (key, &id)) return id; } /* 3. */ if (language_len <= MAX_LOCALE_NAME) { memcpy (key, language, language_len); key[language_len] = '\0'; if (lookup_item_id_for_one_locale (key, &id)) return id; } return default_item_id; } static guint get_default_item_id (void) { static guint item_id; static gboolean done; /* Doesn't need to be locked -- no harm in doing it twice. */ if (!done) { const gchar *locale; locale = setlocale (LC_CTYPE, NULL); item_id = lookup_item_id_for_locale (locale); done = TRUE; } return item_id; } /** * g_str_to_ascii: * @str: a string, in UTF-8 * @from_locale: (nullable): the source locale, if known * * Transliterate @str to plain ASCII. * * For best results, @str should be in composed normalised form. * * This function performs a reasonably good set of character * replacements. The particular set of replacements that is done may * change by version or even by runtime environment. * * If the source language of @str is known, it can used to improve the * accuracy of the translation by passing it as @from_locale. It should * be a valid POSIX locale string (of the form * "language[_territory][.codeset][@modifier]"). * * If @from_locale is %NULL then the current locale is used. * * If you want to do translation for no specific locale, and you want it * to be done independently of the currently locale, specify "C" for * @from_locale. * * Returns: a string in plain ASCII * * Since: 2.40 **/ gchar * g_str_to_ascii (const gchar *str, const gchar *from_locale) { GString *result; guint item_id; g_return_val_if_fail (str != NULL, NULL); if (g_str_is_ascii (str)) return g_strdup (str); if (from_locale) item_id = lookup_item_id_for_locale (from_locale); else item_id = get_default_item_id (); result = g_string_sized_new (strlen (str)); while (*str) { /* We only need to transliterate non-ASCII values... */ if (*str & 0x80) { gunichar key[MAX_KEY_SIZE]; const gchar *r; gint consumed; gint r_len; gunichar c; G_STATIC_ASSERT(MAX_KEY_SIZE == 2); c = g_utf8_get_char (str); /* This is where it gets evil... * * We know that MAX_KEY_SIZE is 2. We also know that we * only want to try another character if it's non-ascii. */ str = g_utf8_next_char (str); key[0] = c; if (*str & 0x80) key[1] = g_utf8_get_char (str); else key[1] = 0; r = lookup_in_item (item_id, key, &r_len, &consumed); /* If we failed to map two characters, try again with one. * * gconv behaviour is a bit weird here -- it seems to * depend in the randomness of the binary search and the * size of the input buffer as to what result we get here. * * Doing it this way is more work, but should be * more-correct. */ if (r == NULL && key[1]) { key[1] = 0; r = lookup_in_item (item_id, key, &r_len, &consumed); } if (r != NULL) { g_string_append_len (result, r, r_len); if (consumed == 2) /* If it took both then skip again */ str = g_utf8_next_char (str); } else /* no match found */ g_string_append_c (result, '?'); } else if (*str & 0x80) /* Out-of-range non-ASCII case */ { g_string_append_c (result, '?'); str = g_utf8_next_char (str); } else /* ASCII case */ g_string_append_c (result, *str++); } return g_string_free (result, FALSE); }