Blob Blame History Raw
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
 * soup-tld.c
 *
 * Copyright (C) 2012 Igalia S.L.
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <string.h>

#include <glib/gi18n-lib.h>

#include "soup-tld.h"
#include "soup.h"
#include "soup-tld-private.h"

/**
 * SECTION:soup-tld
 * @short_description: Top-Level Domain Utilities
 *
 * These functions can be used to parse hostnames to attempt to determine
 * what part of the name belongs to the domain owner, and what part is
 * simply a "public suffix" such as ".com".
 */

/* Forward declarations of the file-local helpers defined further down. */
static void soup_tld_ensure_rules_hash_table (void);
static const char *soup_tld_get_base_domain_internal (const char *hostname,
						      guint       additional_domains,
						      GError    **error);

/* Lookup table from a rule's domain string to a pointer to that rule's
 * flags. It is created and filled from tld_entries the first time
 * soup_tld_ensure_rules_hash_table() runs.
 */
static GHashTable *rules = NULL;
/* Public suffix rules; the initializer list is pulled in verbatim from
 * tld_data.inc at compile time.
 */
static SoupTLDEntry tld_entries[] = {
#include "tld_data.inc"
};

/* Loads every entry of tld_entries into the 'rules' hash table so that
 * lookups by domain string are fast.
 *
 * The work is guarded by g_once_init_enter()/g_once_init_leave(), so
 * the table is built a single time no matter how often (or from how
 * many threads) this function is called.
 */
static void
soup_tld_ensure_rules_hash_table (void)
{
	static gsize init = 0;

	if (g_once_init_enter (&init)) {
		/* G_N_ELEMENTS() yields an unsigned size, so index with
		 * an unsigned type to avoid a signed/unsigned comparison.
		 */
		gsize i;

		/* Keys and values point straight into tld_entries, hence
		 * no destroy notifiers are installed.
		 */
		rules = g_hash_table_new (g_str_hash, g_str_equal);
		for (i = 0; i < G_N_ELEMENTS (tld_entries); ++i)
			g_hash_table_insert (rules, tld_entries[i].domain,
					     &(tld_entries[i].flags));
		g_once_init_leave (&init, 1);
	}
}

/**
 * soup_tld_get_base_domain:
 * @hostname: a hostname
 * @error: return location for a #GError, or %NULL to ignore
 *   errors. See #SoupTLDError for the available error codes
 *
 * Looks up the base domain of @hostname, that is, its top level
 * domain (such as .org, .com, .co.uk, etc) together with the second
 * level domain right before it. For example, for myhost.mydomain.com
 * the result is mydomain.com.
 *
 * Private URLs (those that do not end with any well known TLD) yield
 * %NULL, since picking a base domain for them would be totally
 * arbitrary.
 *
 * Before libsoup 2.46 an IDN @hostname had to be passed in UTF-8.
 * Starting with 2.46 both UTF-8 and ASCII formats are accepted, and
 * the return value uses the same format as the input.
 *
 * Returns: a pointer to the position inside @hostname where the base
 * domain starts. On error, %NULL is returned and @error is set.
 *
 * Since: 2.40
 **/
const char *
soup_tld_get_base_domain (const char *hostname, GError **error)
{
	/* A base domain is the public suffix plus exactly one more label. */
	const guint extra_labels = 1;

	g_return_val_if_fail (hostname, NULL);

	return soup_tld_get_base_domain_internal (hostname, extra_labels, error);
}

/**
 * soup_tld_domain_is_public_suffix:
 * @domain: a domain name
 *
 * Looks whether the @domain passed as argument is a public domain
 * suffix (.org, .com, .co.uk, etc) or not.
 *
 * Prior to libsoup 2.46, this function required that @domain be in
 * UTF-8 if it was an IDN. From 2.46 on, the name can be in either
 * UTF-8 or ASCII format.
 *
 * Returns: %TRUE if it is a public domain, %FALSE otherwise
 * (including when @domain is an IP address, is not a valid hostname,
 * or has no recognized public suffix).
 *
 * Since: 2.40
 **/
gboolean
soup_tld_domain_is_public_suffix (const char *domain)
{
	const char *base_domain;

	g_return_val_if_fail (domain, FALSE);

	/* Skip the leading '.' if present; a lone "." is rejected. */
	if (*domain == '.' && !*(++domain))
		g_return_val_if_reached (FALSE);

	/* Ask for the bare suffix (zero additional labels). The helper
	 * only sets an error when it returns NULL, and a NULL result can
	 * never compare equal to the non-NULL @domain, so every failure
	 * simply maps to FALSE here and the error details are not needed.
	 */
	base_domain = soup_tld_get_base_domain_internal (domain, 0, NULL);

	/* @domain is a public suffix only if the suffix found spans the
	 * whole string.
	 */
	return g_strcmp0 (domain, base_domain) == 0;
}

/**
 * SOUP_TLD_ERROR:
 *
 * The #GError domain for soup-tld-related errors.
 *
 * Since: 2.40
 */
/**
 * SoupTLDError:
 * @SOUP_TLD_ERROR_INVALID_HOSTNAME: A hostname was syntactically
 *   invalid.
 * @SOUP_TLD_ERROR_IS_IP_ADDRESS: The passed-in "hostname" was
 *   actually an IP address (and thus has no base domain or
 *   public suffix).
 * @SOUP_TLD_ERROR_NOT_ENOUGH_DOMAINS: The passed-in hostname
 *   did not have enough components. E.g., calling
 *   soup_tld_get_base_domain() on <literal>"co.uk"</literal>.
 * @SOUP_TLD_ERROR_NO_BASE_DOMAIN: The passed-in hostname has
 *   no recognized public suffix.
 *
 * Error codes for %SOUP_TLD_ERROR.
 *
 * Since: 2.40
 */

/* Returns the GQuark used as the error domain for soup-tld errors.
 * The quark is looked up on the first call and cached afterwards.
 */
GQuark
soup_tld_error_quark (void)
{
	static GQuark cached_quark;

	if (cached_quark)
		return cached_quark;

	cached_quark = g_quark_from_static_string ("soup_tld_error_quark");
	return cached_quark;
}

/* Shared implementation behind soup_tld_get_base_domain() and
 * soup_tld_domain_is_public_suffix().
 *
 * @hostname: hostname to inspect, either UTF-8 or ASCII-encoded
 * @additional_domains: how many labels to the left of the matched
 *   public suffix must be included in the result
 * @error: return location for a #GError, or NULL
 *
 * The hostname is scanned one label at a time, from the full name
 * down to its last label, and each candidate is looked up in the
 * rules table. Once a rule matches, the suffix is extended to the
 * left by @additional_domains labels.
 *
 * Returns: a pointer into the string the caller passed as @hostname
 * (never into the temporary UTF-8 copy), or NULL with @error set.
 */
static const char *
soup_tld_get_base_domain_internal (const char *hostname, guint additional_domains, GError **error)
{
	char *prev_domain, *cur_domain, *next_dot;
	gint add_domains;
	const char *orig_hostname = NULL, *tld;
	char *utf8_hostname = NULL;

	soup_tld_ensure_rules_hash_table ();

	if (g_hostname_is_ip_address (hostname)) {
		g_set_error_literal (error, SOUP_TLD_ERROR,
				     SOUP_TLD_ERROR_IS_IP_ADDRESS,
				     _("Hostname is an IP address"));
		return NULL;
	}

	/* Rules are matched against the Unicode form, so convert
	 * ASCII-encoded names and remember the original so that the
	 * result can be mapped back onto it.
	 */
	if (g_hostname_is_ascii_encoded (hostname)) {
		orig_hostname = hostname;
		hostname = utf8_hostname = g_hostname_to_unicode (hostname);
		if (!hostname) {
			g_set_error_literal (error, SOUP_TLD_ERROR,
					     SOUP_TLD_ERROR_INVALID_HOSTNAME,
					     _("Invalid hostname"));
			return NULL;
		}
	}

	cur_domain = (char *) hostname;
	tld = cur_domain;
	prev_domain = NULL;
	/* Process matching rules from longest to shortest. Logic
	 * based on Mozilla's implementation of nsEffectiveTLDService.
	 */
	while (TRUE) {
		char *orig_domain;
		gboolean domain_found;
		int *flags;
		char *normalized_domain = NULL;
		gsize domain_length;

		/* Valid hostnames neither start with a dot nor have more than one
		 * dot together.
		 */
		if (*cur_domain == '.') {
			g_set_error_literal (error, SOUP_TLD_ERROR,
					     SOUP_TLD_ERROR_INVALID_HOSTNAME,
					     _("Invalid hostname"));
			g_free (utf8_hostname);
			return NULL;
		}

		next_dot = strchr (cur_domain, '.');

		/* Discard trailing dot if any before looking up.
		 *
		 * cur_domain is empty when the hostname itself is "" or
		 * when the previous label was followed by a trailing
		 * dot (e.g. "localhost."). Indexing [length - 1] would
		 * then read before the candidate and could end up
		 * asking g_strndup() for (gsize) -1 bytes, so only
		 * inspect the last character of a non-empty candidate.
		 */
		domain_length = strlen (cur_domain);
		if (domain_length > 0 && cur_domain[domain_length - 1] == '.')
			normalized_domain = g_strndup (cur_domain, domain_length - 1);
		domain_found = g_hash_table_lookup_extended (rules, normalized_domain ? normalized_domain : cur_domain, (gpointer *) &orig_domain, (gpointer *) &flags);
		g_free (normalized_domain);
		normalized_domain = NULL;

		/* We compare the keys just to be sure that we haven't hit a collision.
		 * Only the rule's own length is compared, so a trailing dot left in
		 * cur_domain does not prevent the match.
		 */
		if (domain_found && !strncmp (orig_domain, cur_domain, strlen (orig_domain))) {
			if (*flags & SOUP_TLD_RULE_MATCH_ALL) {
				/* If we match a *. rule and there were no previous exceptions
				 * nor previous domains then treat it as an exact match.
				 */
				tld = prev_domain ? prev_domain : cur_domain;
				break;
			} else if (*flags == SOUP_TLD_RULE_NORMAL) {
				tld = cur_domain;
				break;
			} else if ((*flags & SOUP_TLD_RULE_EXCEPTION) && next_dot) {
				/* An exception rule makes everything after its first
				 * label the suffix. The next_dot check guards against
				 * computing NULL + 1 should a dot-less exception ever
				 * match; in that case we fall through to the
				 * "no base domain" error below.
				 */
				tld = next_dot + 1;
				break;
			}
		}

		/* If we hit the top and haven't matched yet, then it
		 * has no public suffix.
		 */
		if (!next_dot) {
			g_set_error_literal (error, SOUP_TLD_ERROR,
					     SOUP_TLD_ERROR_NO_BASE_DOMAIN,
					     _("Hostname has no base domain"));
			g_free (utf8_hostname);
			return NULL;
		}

		prev_domain = cur_domain;
		cur_domain = next_dot + 1;
	}

	if (orig_hostname) {
		int dots;
		const char *p;

		/* Count the number of dots that appear after tld in
		 * utf8_hostname, and then find the corresponding spot
		 * in orig_hostname;
		 */
		for (p = tld, dots = 0; *p; p++) {
			if (*p == '.')
				dots++;
		}

		/* Walk orig_hostname backwards, consuming one counted dot
		 * at a time; p stops just past the first extra dot (or at
		 * the start of the string).
		 */
		for (p = orig_hostname + strlen (orig_hostname); p > orig_hostname; p--) {
			if (*(p - 1) == '.') {
				if (dots)
					dots--;
				else
					break;
			}
		}
		/* It's not possible for utf8_hostname to have had
		 * more dots than orig_hostname.
		 */
		g_assert (dots == 0);

		/* From here on only the caller's original string is used. */
		tld = p;
		g_free (utf8_hostname);
		hostname = orig_hostname;
	}

	/* Include the additional number of domains requested: step
	 * backwards from the suffix, and stop right after the dot that
	 * is found once all requested labels have been passed (or at
	 * the start of the hostname).
	 */
	add_domains = additional_domains;
	while (tld != hostname) {
		if (*(--tld) == '.' && (!(add_domains--))) {
			++add_domains;
			++tld;
			break;
		}
	}

	/* If add_domains > 0 then we haven't found enough additional domains. */
	if (add_domains) {
		g_set_error_literal (error, SOUP_TLD_ERROR,
				     SOUP_TLD_ERROR_NOT_ENOUGH_DOMAINS,
				     _("Not enough domains"));
		return NULL;
	}

	return tld;
}