/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ /* * soup-tld.c * * Copyright (C) 2012 Igalia S.L. */ #ifdef HAVE_CONFIG_H #include #endif #include #include #include "soup-tld.h" #include "soup.h" #include "soup-tld-private.h" /** * SECTION:soup-tld * @short_description: Top-Level Domain Utilities * * These functions can be used to parse hostnames to attempt to determine * what part of the name belongs to the domain owner, and what part is * simply a "public suffix" such as ".com". */ static void soup_tld_ensure_rules_hash_table (void); static const char *soup_tld_get_base_domain_internal (const char *hostname, guint additional_domains, GError **error); static GHashTable *rules = NULL; static SoupTLDEntry tld_entries[] = { #include "tld_data.inc" }; /* Stores the entries data in a hash table to ease and speed up * searches. */ static void soup_tld_ensure_rules_hash_table (void) { static gsize init = 0; if (g_once_init_enter (&init)) { int i; rules = g_hash_table_new (g_str_hash, g_str_equal); for (i = 0; i < G_N_ELEMENTS (tld_entries); ++i) g_hash_table_insert (rules, tld_entries[i].domain, &(tld_entries[i].flags)); g_once_init_leave (&init, 1); } } /** * soup_tld_get_base_domain: * @hostname: a hostname * @error: return location for a #GError, or %NULL to ignore * errors. See #SoupTLDError for the available error codes * * Finds the base domain for a given @hostname. The base domain is * composed by the top level domain (such as .org, .com, .co.uk, etc) * plus the second level domain, for example for myhost.mydomain.com * it will return mydomain.com. * * Note that %NULL will be returned for private URLs (those not ending * with any well known TLD) because choosing a base domain for them * would be totally arbitrary. * * Prior to libsoup 2.46, this function required that @hostname be in * UTF-8 if it was an IDN. From 2.46 on, the name can be in either * UTF-8 or ASCII format (and the return value will be in the same * format). * * Returns: a pointer to the start of the base domain in @hostname. If * an error occurs, %NULL will be returned and @error set. * * Since: 2.40 **/ const char * soup_tld_get_base_domain (const char *hostname, GError **error) { g_return_val_if_fail (hostname, NULL); return soup_tld_get_base_domain_internal (hostname, 1, error); } /** * soup_tld_domain_is_public_suffix: * @domain: a domain name * * Looks whether the @domain passed as argument is a public domain * suffix (.org, .com, .co.uk, etc) or not. * * Prior to libsoup 2.46, this function required that @domain be in * UTF-8 if it was an IDN. From 2.46 on, the name can be in either * UTF-8 or ASCII format. * * Returns: %TRUE if it is a public domain, %FALSE otherwise. * * Since: 2.40 **/ gboolean soup_tld_domain_is_public_suffix (const char *domain) { const char *base_domain; GError *error = NULL; g_return_val_if_fail (domain, FALSE); /* Skip the leading '.' if present */ if (*domain == '.' && !*(++domain)) g_return_val_if_reached (FALSE); base_domain = soup_tld_get_base_domain_internal (domain, 0, &error); if (g_strcmp0 (domain, base_domain)) { g_clear_error (&error); return FALSE; } if (g_error_matches (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_NO_BASE_DOMAIN)) { g_error_free (error); return FALSE; } if (g_error_matches (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_IS_IP_ADDRESS) || g_error_matches (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_INVALID_HOSTNAME)) { g_error_free (error); g_return_val_if_reached (FALSE); } g_clear_error (&error); return TRUE; } /** * SOUP_TLD_ERROR: * * The #GError domain for soup-tld-related errors. * * Since: 2.40 */ /** * SoupTLDError: * @SOUP_TLD_ERROR_INVALID_HOSTNAME: A hostname was syntactically * invalid. * @SOUP_TLD_ERROR_IS_IP_ADDRESS: The passed-in "hostname" was * actually an IP address (and thus has no base domain or * public suffix). * @SOUP_TLD_ERROR_NOT_ENOUGH_DOMAINS: The passed-in hostname * did not have enough components. Eg, calling * soup_tld_get_base_domain() on "co.uk". * @SOUP_TLD_ERROR_NO_BASE_DOMAIN: The passed-in hostname has * no recognized public suffix. * * Error codes for %SOUP_TLD_ERROR. * * Since: 2.40 */ GQuark soup_tld_error_quark (void) { static GQuark error; if (!error) error = g_quark_from_static_string ("soup_tld_error_quark"); return error; } static const char * soup_tld_get_base_domain_internal (const char *hostname, guint additional_domains, GError **error) { char *prev_domain, *cur_domain, *next_dot; gint add_domains; const char *orig_hostname = NULL, *tld; char *utf8_hostname = NULL; soup_tld_ensure_rules_hash_table (); if (g_hostname_is_ip_address (hostname)) { g_set_error_literal (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_IS_IP_ADDRESS, _("Hostname is an IP address")); return NULL; } if (g_hostname_is_ascii_encoded (hostname)) { orig_hostname = hostname; hostname = utf8_hostname = g_hostname_to_unicode (hostname); if (!hostname) { g_set_error_literal (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_INVALID_HOSTNAME, _("Invalid hostname")); return NULL; } } cur_domain = (char *) hostname; tld = cur_domain; prev_domain = NULL; /* Process matching rules from longest to shortest. Logic * based on Mozilla's implementation of nsEffectiveTLDService. */ while (TRUE) { char *orig_domain; gboolean domain_found; int *flags; char *normalized_domain = NULL; int domain_length; /* Valid hostnames neither start with a dot nor have more than one * dot together. */ if (*cur_domain == '.') { g_set_error_literal (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_INVALID_HOSTNAME, _("Invalid hostname")); g_free (utf8_hostname); return NULL; } next_dot = strchr (cur_domain, '.'); /* Discard trailing dot if any before looking up. */ domain_length = strlen (cur_domain); if (cur_domain[domain_length - 1] == '.') normalized_domain = g_strndup (cur_domain, domain_length - 1); domain_found = g_hash_table_lookup_extended (rules, normalized_domain ? normalized_domain : cur_domain, (gpointer *) &orig_domain, (gpointer *) &flags); g_free (normalized_domain); normalized_domain = NULL; /* We compare the keys just to be sure that we haven't hit a collision */ if (domain_found && !strncmp (orig_domain, cur_domain, strlen (orig_domain))) { if (*flags & SOUP_TLD_RULE_MATCH_ALL) { /* If we match a *. rule and there were no previous exceptions * nor previous domains then treat it as an exact match. */ tld = prev_domain ? prev_domain : cur_domain; break; } else if (*flags == SOUP_TLD_RULE_NORMAL) { tld = cur_domain; break; } else if (*flags & SOUP_TLD_RULE_EXCEPTION) { tld = next_dot + 1; break; } } /* If we hit the top and haven't matched yet, then it * has no public suffix. */ if (!next_dot) { g_set_error_literal (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_NO_BASE_DOMAIN, _("Hostname has no base domain")); g_free (utf8_hostname); return NULL; } prev_domain = cur_domain; cur_domain = next_dot + 1; } if (orig_hostname) { int dots; const char *p; /* Count the number of dots that appear after tld in * utf8_hostname, and then find the corresponding spot * in orig_hostname; */ for (p = tld, dots = 0; *p; p++) { if (*p == '.') dots++; } for (p = orig_hostname + strlen (orig_hostname); p > orig_hostname; p--) { if (*(p - 1) == '.') { if (dots) dots--; else break; } } /* It's not possible for utf8_hostname to have had * more dots than orig_hostname. */ g_assert (dots == 0); tld = p; g_free (utf8_hostname); hostname = orig_hostname; } /* Include the additional number of domains requested. */ add_domains = additional_domains; while (tld != hostname) { if (*(--tld) == '.' && (!(add_domains--))) { ++add_domains; ++tld; break; } } /* If additional_domains > 0 then we haven't found enough additional domains. */ if (add_domains) { g_set_error_literal (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_NOT_ENOUGH_DOMAINS, _("Not enough domains")); return NULL; } return tld; }