Blame libsoup/soup-tld.c

Packit Service ca3877
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
Packit Service ca3877
/*
Packit Service ca3877
 * soup-tld.c
Packit Service ca3877
 *
Packit Service ca3877
 * Copyright (C) 2012 Igalia S.L.
Packit Service ca3877
 */
Packit Service ca3877
Packit Service ca3877
#ifdef HAVE_CONFIG_H
Packit Service ca3877
#include <config.h>
Packit Service ca3877
#endif
Packit Service ca3877
Packit Service ca3877
#include <string.h>
Packit Service ca3877
Packit Service ca3877
#include <glib/gi18n-lib.h>
Packit Service ca3877
Packit Service ca3877
#include "soup-tld.h"
Packit Service ca3877
#include "soup.h"
Packit Service ca3877
#include "soup-tld-private.h"
Packit Service ca3877
Packit Service ca3877
/**
 * SECTION:soup-tld
 * @short_description: Top-Level Domain Utilities
 *
 * These functions can be used to parse hostnames to attempt to determine
 * what part of the name belongs to the domain owner, and what part is
 * simply a "public suffix" such as ".com".
 */
Packit Service ca3877
Packit Service ca3877
static void soup_tld_ensure_rules_hash_table (void);
Packit Service ca3877
static const char *soup_tld_get_base_domain_internal (const char *hostname,
Packit Service ca3877
						      guint       additional_domains,
Packit Service ca3877
						      GError    **error);
Packit Service ca3877
Packit Service ca3877
static GHashTable *rules = NULL;
Packit Service ca3877
static SoupTLDEntry tld_entries[] = {
Packit Service ca3877
#include "tld_data.inc"
Packit Service ca3877
};
Packit Service ca3877
Packit Service ca3877
/* Stores the entries data in a hash table to ease and speed up
Packit Service ca3877
 * searches.
Packit Service ca3877
 */
Packit Service ca3877
static void
Packit Service ca3877
soup_tld_ensure_rules_hash_table (void)
Packit Service ca3877
{
Packit Service ca3877
	static gsize init = 0;
Packit Service ca3877
Packit Service ca3877
	if (g_once_init_enter (&init)) {
Packit Service ca3877
		int i;
Packit Service ca3877
Packit Service ca3877
		rules = g_hash_table_new (g_str_hash, g_str_equal);
Packit Service ca3877
		for (i = 0; i < G_N_ELEMENTS (tld_entries); ++i)
Packit Service ca3877
			g_hash_table_insert (rules, tld_entries[i].domain,
Packit Service ca3877
					     &(tld_entries[i].flags));
Packit Service ca3877
		g_once_init_leave (&init, 1);
Packit Service ca3877
	}
Packit Service ca3877
}
Packit Service ca3877
Packit Service ca3877
/**
Packit Service ca3877
 * soup_tld_get_base_domain:
Packit Service ca3877
 * @hostname: a hostname
Packit Service ca3877
 * @error: return location for a #GError, or %NULL to ignore
Packit Service ca3877
 *   errors. See #SoupTLDError for the available error codes
Packit Service ca3877
 *
Packit Service ca3877
 * Finds the base domain for a given @hostname. The base domain is
Packit Service ca3877
 * composed by the top level domain (such as .org, .com, .co.uk, etc)
Packit Service ca3877
 * plus the second level domain, for example for myhost.mydomain.com
Packit Service ca3877
 * it will return mydomain.com.
Packit Service ca3877
 *
Packit Service ca3877
 * Note that %NULL will be returned for private URLs (those not ending
Packit Service ca3877
 * with any well known TLD) because choosing a base domain for them
Packit Service ca3877
 * would be totally arbitrary.
Packit Service ca3877
 *
Packit Service ca3877
 * Prior to libsoup 2.46, this function required that @hostname be in
Packit Service ca3877
 * UTF-8 if it was an IDN. From 2.46 on, the name can be in either
Packit Service ca3877
 * UTF-8 or ASCII format (and the return value will be in the same
Packit Service ca3877
 * format).
Packit Service ca3877
 *
Packit Service ca3877
 * Returns: a pointer to the start of the base domain in @hostname. If
Packit Service ca3877
 * an error occurs, %NULL will be returned and @error set.
Packit Service ca3877
 *
Packit Service ca3877
 * Since: 2.40
Packit Service ca3877
 **/
Packit Service ca3877
const char *
Packit Service ca3877
soup_tld_get_base_domain (const char *hostname, GError **error)
Packit Service ca3877
{
Packit Service ca3877
	g_return_val_if_fail (hostname, NULL);
Packit Service ca3877
Packit Service ca3877
	return soup_tld_get_base_domain_internal (hostname, 1, error);
Packit Service ca3877
}
Packit Service ca3877
Packit Service ca3877
/**
Packit Service ca3877
 * soup_tld_domain_is_public_suffix:
Packit Service ca3877
 * @domain: a domain name
Packit Service ca3877
 *
Packit Service ca3877
 * Looks whether the @domain passed as argument is a public domain
Packit Service ca3877
 * suffix (.org, .com, .co.uk, etc) or not.
Packit Service ca3877
 *
Packit Service ca3877
 * Prior to libsoup 2.46, this function required that @domain be in
Packit Service ca3877
 * UTF-8 if it was an IDN. From 2.46 on, the name can be in either
Packit Service ca3877
 * UTF-8 or ASCII format.
Packit Service ca3877
 *
Packit Service ca3877
 * Returns: %TRUE if it is a public domain, %FALSE otherwise.
Packit Service ca3877
 *
Packit Service ca3877
 * Since: 2.40
Packit Service ca3877
 **/
Packit Service ca3877
gboolean
Packit Service ca3877
soup_tld_domain_is_public_suffix (const char *domain)
Packit Service ca3877
{
Packit Service ca3877
	const char *base_domain;
Packit Service ca3877
	GError *error = NULL;
Packit Service ca3877
Packit Service ca3877
	g_return_val_if_fail (domain, FALSE);
Packit Service ca3877
Packit Service ca3877
	/* Skip the leading '.' if present */
Packit Service ca3877
	if (*domain == '.' && !*(++domain))
Packit Service ca3877
		g_return_val_if_reached (FALSE);
Packit Service ca3877
Packit Service ca3877
	base_domain = soup_tld_get_base_domain_internal (domain, 0, &error);
Packit Service ca3877
	if (g_strcmp0 (domain, base_domain)) {
Packit Service ca3877
		g_clear_error (&error);
Packit Service ca3877
		return FALSE;
Packit Service ca3877
	}
Packit Service ca3877
Packit Service ca3877
	if (g_error_matches (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_NO_BASE_DOMAIN)) {
Packit Service ca3877
		g_error_free (error);
Packit Service ca3877
		return FALSE;
Packit Service ca3877
	}
Packit Service ca3877
Packit Service ca3877
	if (g_error_matches (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_IS_IP_ADDRESS) ||
Packit Service ca3877
	    g_error_matches (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_INVALID_HOSTNAME)) {
Packit Service ca3877
		g_error_free (error);
Packit Service ca3877
		g_return_val_if_reached (FALSE);
Packit Service ca3877
	}
Packit Service ca3877
Packit Service ca3877
	g_clear_error (&error);
Packit Service ca3877
Packit Service ca3877
	return TRUE;
Packit Service ca3877
}
Packit Service ca3877
Packit Service ca3877
/**
 * SOUP_TLD_ERROR:
 *
 * The #GError domain for soup-tld-related errors.
 *
 * Since: 2.40
 */
Packit Service ca3877
/**
 * SoupTLDError:
 * @SOUP_TLD_ERROR_INVALID_HOSTNAME: A hostname was syntactically
 *   invalid.
 * @SOUP_TLD_ERROR_IS_IP_ADDRESS: The passed-in "hostname" was
 *   actually an IP address (and thus has no base domain or
 *   public suffix).
 * @SOUP_TLD_ERROR_NOT_ENOUGH_DOMAINS: The passed-in hostname
 *   did not have enough components. E.g., calling
 *   soup_tld_get_base_domain() on <literal>"co.uk"</literal>.
 * @SOUP_TLD_ERROR_NO_BASE_DOMAIN: The passed-in hostname has
 *   no recognized public suffix.
 *
 * Error codes for %SOUP_TLD_ERROR.
 *
 * Since: 2.40
 */
Packit Service ca3877
Packit Service ca3877
/* Returns the #GQuark identifying the soup-tld error domain. The quark
 * is looked up on first use and cached in a function-local static.
 */
GQuark
soup_tld_error_quark (void)
{
	static GQuark cached_quark;

	if (cached_quark)
		return cached_quark;

	cached_quark = g_quark_from_static_string ("soup_tld_error_quark");
	return cached_quark;
}
Packit Service ca3877
Packit Service ca3877
static const char *
Packit Service ca3877
soup_tld_get_base_domain_internal (const char *hostname, guint additional_domains, GError **error)
Packit Service ca3877
{
Packit Service ca3877
	char *prev_domain, *cur_domain, *next_dot;
Packit Service ca3877
	gint add_domains;
Packit Service ca3877
	const char *orig_hostname = NULL, *tld;
Packit Service ca3877
	char *utf8_hostname = NULL;
Packit Service ca3877
Packit Service ca3877
	soup_tld_ensure_rules_hash_table ();
Packit Service ca3877
Packit Service ca3877
	if (g_hostname_is_ip_address (hostname)) {
Packit Service ca3877
		g_set_error_literal (error, SOUP_TLD_ERROR,
Packit Service ca3877
				     SOUP_TLD_ERROR_IS_IP_ADDRESS,
Packit Service ca3877
				     _("Hostname is an IP address"));
Packit Service ca3877
		return NULL;
Packit Service ca3877
	}
Packit Service ca3877
Packit Service ca3877
	if (g_hostname_is_ascii_encoded (hostname)) {
Packit Service ca3877
		orig_hostname = hostname;
Packit Service ca3877
		hostname = utf8_hostname = g_hostname_to_unicode (hostname);
Packit Service ca3877
		if (!hostname) {
Packit Service ca3877
			g_set_error_literal (error, SOUP_TLD_ERROR,
Packit Service ca3877
					     SOUP_TLD_ERROR_INVALID_HOSTNAME,
Packit Service ca3877
					     _("Invalid hostname"));
Packit Service ca3877
			return NULL;
Packit Service ca3877
		}
Packit Service ca3877
	}
Packit Service ca3877
Packit Service ca3877
	cur_domain = (char *) hostname;
Packit Service ca3877
	tld = cur_domain;
Packit Service ca3877
	prev_domain = NULL;
Packit Service ca3877
	/* Process matching rules from longest to shortest. Logic
Packit Service ca3877
	 * based on Mozilla's implementation of nsEffectiveTLDService.
Packit Service ca3877
	 */
Packit Service ca3877
	while (TRUE) {
Packit Service ca3877
		char *orig_domain;
Packit Service ca3877
		gboolean domain_found;
Packit Service ca3877
		int *flags;
Packit Service ca3877
		char *normalized_domain = NULL;
Packit Service ca3877
		int domain_length;
Packit Service ca3877
Packit Service ca3877
		/* Valid hostnames neither start with a dot nor have more than one
Packit Service ca3877
		 * dot together.
Packit Service ca3877
		 */
Packit Service ca3877
		if (*cur_domain == '.') {
Packit Service ca3877
			g_set_error_literal (error, SOUP_TLD_ERROR,
Packit Service ca3877
					     SOUP_TLD_ERROR_INVALID_HOSTNAME,
Packit Service ca3877
					     _("Invalid hostname"));
Packit Service ca3877
			g_free (utf8_hostname);
Packit Service ca3877
			return NULL;
Packit Service ca3877
		}
Packit Service ca3877
Packit Service ca3877
		next_dot = strchr (cur_domain, '.');
Packit Service ca3877
Packit Service ca3877
		/* Discard trailing dot if any before looking up. */
Packit Service ca3877
		domain_length = strlen (cur_domain);
Packit Service ca3877
		if (cur_domain[domain_length - 1] == '.')
Packit Service ca3877
			normalized_domain = g_strndup (cur_domain, domain_length - 1);
Packit Service ca3877
		domain_found = g_hash_table_lookup_extended (rules, normalized_domain ? normalized_domain : cur_domain, (gpointer *) &orig_domain, (gpointer *) &flags);
Packit Service ca3877
		g_free (normalized_domain);
Packit Service ca3877
		normalized_domain = NULL;
Packit Service ca3877
Packit Service ca3877
		/* We compare the keys just to be sure that we haven't hit a collision */
Packit Service ca3877
		if (domain_found && !strncmp (orig_domain, cur_domain, strlen (orig_domain))) {
Packit Service ca3877
			if (*flags & SOUP_TLD_RULE_MATCH_ALL) {
Packit Service ca3877
				/* If we match a *. rule and there were no previous exceptions
Packit Service ca3877
				 * nor previous domains then treat it as an exact match.
Packit Service ca3877
				 */
Packit Service ca3877
				tld = prev_domain ? prev_domain : cur_domain;
Packit Service ca3877
				break;
Packit Service ca3877
			} else if (*flags == SOUP_TLD_RULE_NORMAL) {
Packit Service ca3877
				tld = cur_domain;
Packit Service ca3877
				break;
Packit Service ca3877
			} else if (*flags & SOUP_TLD_RULE_EXCEPTION) {
Packit Service ca3877
				tld = next_dot + 1;
Packit Service ca3877
				break;
Packit Service ca3877
			}
Packit Service ca3877
		}
Packit Service ca3877
Packit Service ca3877
		/* If we hit the top and haven't matched yet, then it
Packit Service ca3877
		 * has no public suffix.
Packit Service ca3877
		 */
Packit Service ca3877
		if (!next_dot) {
Packit Service ca3877
			g_set_error_literal (error, SOUP_TLD_ERROR,
Packit Service ca3877
					     SOUP_TLD_ERROR_NO_BASE_DOMAIN,
Packit Service ca3877
					     _("Hostname has no base domain"));
Packit Service ca3877
			g_free (utf8_hostname);
Packit Service ca3877
			return NULL;
Packit Service ca3877
		}
Packit Service ca3877
Packit Service ca3877
		prev_domain = cur_domain;
Packit Service ca3877
		cur_domain = next_dot + 1;
Packit Service ca3877
	}
Packit Service ca3877
Packit Service ca3877
	if (orig_hostname) {
Packit Service ca3877
		int dots;
Packit Service ca3877
		const char *p;
Packit Service ca3877
Packit Service ca3877
		/* Count the number of dots that appear after tld in
Packit Service ca3877
		 * utf8_hostname, and then find the corresponding spot
Packit Service ca3877
		 * in orig_hostname;
Packit Service ca3877
		 */
Packit Service ca3877
		for (p = tld, dots = 0; *p; p++) {
Packit Service ca3877
			if (*p == '.')
Packit Service ca3877
				dots++;
Packit Service ca3877
		}
Packit Service ca3877
Packit Service ca3877
		for (p = orig_hostname + strlen (orig_hostname); p > orig_hostname; p--) {
Packit Service ca3877
			if (*(p - 1) == '.') {
Packit Service ca3877
				if (dots)
Packit Service ca3877
					dots--;
Packit Service ca3877
				else
Packit Service ca3877
					break;
Packit Service ca3877
			}
Packit Service ca3877
		}
Packit Service ca3877
		/* It's not possible for utf8_hostname to have had
Packit Service ca3877
		 * more dots than orig_hostname.
Packit Service ca3877
		 */
Packit Service ca3877
		g_assert (dots == 0);
Packit Service ca3877
Packit Service ca3877
		tld = p;
Packit Service ca3877
		g_free (utf8_hostname);
Packit Service ca3877
		hostname = orig_hostname;
Packit Service ca3877
	}
Packit Service ca3877
Packit Service ca3877
	/* Include the additional number of domains requested. */
Packit Service ca3877
	add_domains = additional_domains;
Packit Service ca3877
	while (tld != hostname) {
Packit Service ca3877
		if (*(--tld) == '.' && (!(add_domains--))) {
Packit Service ca3877
			++add_domains;
Packit Service ca3877
			++tld;
Packit Service ca3877
			break;
Packit Service ca3877
		}
Packit Service ca3877
	}
Packit Service ca3877
Packit Service ca3877
	/* If additional_domains > 0 then we haven't found enough additional domains. */
Packit Service ca3877
	if (add_domains) {
Packit Service ca3877
		g_set_error_literal (error, SOUP_TLD_ERROR,
Packit Service ca3877
				     SOUP_TLD_ERROR_NOT_ENOUGH_DOMAINS,
Packit Service ca3877
				     _("Not enough domains"));
Packit Service ca3877
		return NULL;
Packit Service ca3877
	}
Packit Service ca3877
Packit Service ca3877
	return tld;
Packit Service ca3877
}