|
Packit Service |
ca3877 |
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
|
|
Packit Service |
ca3877 |
/*
|
|
Packit Service |
ca3877 |
* soup-tld.c
|
|
Packit Service |
ca3877 |
*
|
|
Packit Service |
ca3877 |
* Copyright (C) 2012 Igalia S.L.
|
|
Packit Service |
ca3877 |
*/
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
#ifdef HAVE_CONFIG_H
|
|
Packit Service |
ca3877 |
#include <config.h>
|
|
Packit Service |
ca3877 |
#endif
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
#include <string.h>
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
#include <glib/gi18n-lib.h>
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
#include "soup-tld.h"
|
|
Packit Service |
ca3877 |
#include "soup.h"
|
|
Packit Service |
ca3877 |
#include "soup-tld-private.h"
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
/**
|
|
Packit Service |
ca3877 |
* SECTION:soup-tld
|
|
Packit Service |
ca3877 |
* @short_description: Top-Level Domain Utilities
|
|
Packit Service |
ca3877 |
*
|
|
Packit Service |
ca3877 |
* These functions can be used to parse hostnames to attempt to determine
|
|
Packit Service |
ca3877 |
* what part of the name belongs to the domain owner, and what part is
|
|
Packit Service |
ca3877 |
* simply a "public suffix" such as ".com".
|
|
Packit Service |
ca3877 |
*/
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
static void soup_tld_ensure_rules_hash_table (void);
|
|
Packit Service |
ca3877 |
static const char *soup_tld_get_base_domain_internal (const char *hostname,
|
|
Packit Service |
ca3877 |
guint additional_domains,
|
|
Packit Service |
ca3877 |
GError **error);
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
static GHashTable *rules = NULL;
|
|
Packit Service |
ca3877 |
static SoupTLDEntry tld_entries[] = {
|
|
Packit Service |
ca3877 |
#include "tld_data.inc"
|
|
Packit Service |
ca3877 |
};
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
/* Stores the entries data in a hash table to ease and speed up
|
|
Packit Service |
ca3877 |
* searches.
|
|
Packit Service |
ca3877 |
*/
|
|
Packit Service |
ca3877 |
static void
|
|
Packit Service |
ca3877 |
soup_tld_ensure_rules_hash_table (void)
|
|
Packit Service |
ca3877 |
{
|
|
Packit Service |
ca3877 |
static gsize init = 0;
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
if (g_once_init_enter (&init)) {
|
|
Packit Service |
ca3877 |
int i;
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
rules = g_hash_table_new (g_str_hash, g_str_equal);
|
|
Packit Service |
ca3877 |
for (i = 0; i < G_N_ELEMENTS (tld_entries); ++i)
|
|
Packit Service |
ca3877 |
g_hash_table_insert (rules, tld_entries[i].domain,
|
|
Packit Service |
ca3877 |
&(tld_entries[i].flags));
|
|
Packit Service |
ca3877 |
g_once_init_leave (&init, 1);
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
/**
|
|
Packit Service |
ca3877 |
* soup_tld_get_base_domain:
|
|
Packit Service |
ca3877 |
* @hostname: a hostname
|
|
Packit Service |
ca3877 |
* @error: return location for a #GError, or %NULL to ignore
|
|
Packit Service |
ca3877 |
* errors. See #SoupTLDError for the available error codes
|
|
Packit Service |
ca3877 |
*
|
|
Packit Service |
ca3877 |
* Finds the base domain for a given @hostname. The base domain is
|
|
Packit Service |
ca3877 |
* composed by the top level domain (such as .org, .com, .co.uk, etc)
|
|
Packit Service |
ca3877 |
* plus the second level domain, for example for myhost.mydomain.com
|
|
Packit Service |
ca3877 |
* it will return mydomain.com.
|
|
Packit Service |
ca3877 |
*
|
|
Packit Service |
ca3877 |
* Note that %NULL will be returned for private URLs (those not ending
|
|
Packit Service |
ca3877 |
* with any well known TLD) because choosing a base domain for them
|
|
Packit Service |
ca3877 |
* would be totally arbitrary.
|
|
Packit Service |
ca3877 |
*
|
|
Packit Service |
ca3877 |
* Prior to libsoup 2.46, this function required that @hostname be in
|
|
Packit Service |
ca3877 |
* UTF-8 if it was an IDN. From 2.46 on, the name can be in either
|
|
Packit Service |
ca3877 |
* UTF-8 or ASCII format (and the return value will be in the same
|
|
Packit Service |
ca3877 |
* format).
|
|
Packit Service |
ca3877 |
*
|
|
Packit Service |
ca3877 |
* Returns: a pointer to the start of the base domain in @hostname. If
|
|
Packit Service |
ca3877 |
* an error occurs, %NULL will be returned and @error set.
|
|
Packit Service |
ca3877 |
*
|
|
Packit Service |
ca3877 |
* Since: 2.40
|
|
Packit Service |
ca3877 |
**/
|
|
Packit Service |
ca3877 |
const char *
|
|
Packit Service |
ca3877 |
soup_tld_get_base_domain (const char *hostname, GError **error)
|
|
Packit Service |
ca3877 |
{
|
|
Packit Service |
ca3877 |
g_return_val_if_fail (hostname, NULL);
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
return soup_tld_get_base_domain_internal (hostname, 1, error);
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
/**
|
|
Packit Service |
ca3877 |
* soup_tld_domain_is_public_suffix:
|
|
Packit Service |
ca3877 |
* @domain: a domain name
|
|
Packit Service |
ca3877 |
*
|
|
Packit Service |
ca3877 |
* Looks whether the @domain passed as argument is a public domain
|
|
Packit Service |
ca3877 |
* suffix (.org, .com, .co.uk, etc) or not.
|
|
Packit Service |
ca3877 |
*
|
|
Packit Service |
ca3877 |
* Prior to libsoup 2.46, this function required that @domain be in
|
|
Packit Service |
ca3877 |
* UTF-8 if it was an IDN. From 2.46 on, the name can be in either
|
|
Packit Service |
ca3877 |
* UTF-8 or ASCII format.
|
|
Packit Service |
ca3877 |
*
|
|
Packit Service |
ca3877 |
* Returns: %TRUE if it is a public domain, %FALSE otherwise.
|
|
Packit Service |
ca3877 |
*
|
|
Packit Service |
ca3877 |
* Since: 2.40
|
|
Packit Service |
ca3877 |
**/
|
|
Packit Service |
ca3877 |
gboolean
|
|
Packit Service |
ca3877 |
soup_tld_domain_is_public_suffix (const char *domain)
|
|
Packit Service |
ca3877 |
{
|
|
Packit Service |
ca3877 |
const char *base_domain;
|
|
Packit Service |
ca3877 |
GError *error = NULL;
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
g_return_val_if_fail (domain, FALSE);
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
/* Skip the leading '.' if present */
|
|
Packit Service |
ca3877 |
if (*domain == '.' && !*(++domain))
|
|
Packit Service |
ca3877 |
g_return_val_if_reached (FALSE);
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
base_domain = soup_tld_get_base_domain_internal (domain, 0, &error);
|
|
Packit Service |
ca3877 |
if (g_strcmp0 (domain, base_domain)) {
|
|
Packit Service |
ca3877 |
g_clear_error (&error);
|
|
Packit Service |
ca3877 |
return FALSE;
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
if (g_error_matches (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_NO_BASE_DOMAIN)) {
|
|
Packit Service |
ca3877 |
g_error_free (error);
|
|
Packit Service |
ca3877 |
return FALSE;
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
if (g_error_matches (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_IS_IP_ADDRESS) ||
|
|
Packit Service |
ca3877 |
g_error_matches (error, SOUP_TLD_ERROR, SOUP_TLD_ERROR_INVALID_HOSTNAME)) {
|
|
Packit Service |
ca3877 |
g_error_free (error);
|
|
Packit Service |
ca3877 |
g_return_val_if_reached (FALSE);
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
g_clear_error (&error);
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
return TRUE;
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
/**
|
|
Packit Service |
ca3877 |
* SOUP_TLD_ERROR:
|
|
Packit Service |
ca3877 |
*
|
|
Packit Service |
ca3877 |
* The #GError domain for soup-tld-related errors.
|
|
Packit Service |
ca3877 |
*
|
|
Packit Service |
ca3877 |
* Since: 2.40
|
|
Packit Service |
ca3877 |
*/
|
|
Packit Service |
ca3877 |
/**
|
|
Packit Service |
ca3877 |
* SoupTLDError:
|
|
Packit Service |
ca3877 |
* @SOUP_TLD_ERROR_INVALID_HOSTNAME: A hostname was syntactically
|
|
Packit Service |
ca3877 |
* invalid.
|
|
Packit Service |
ca3877 |
* @SOUP_TLD_ERROR_IS_IP_ADDRESS: The passed-in "hostname" was
|
|
Packit Service |
ca3877 |
* actually an IP address (and thus has no base domain or
|
|
Packit Service |
ca3877 |
* public suffix).
|
|
Packit Service |
ca3877 |
* @SOUP_TLD_ERROR_NOT_ENOUGH_DOMAINS: The passed-in hostname
|
|
Packit Service |
ca3877 |
* did not have enough components. Eg, calling
|
|
Packit Service |
ca3877 |
* soup_tld_get_base_domain() on <literal>"co.uk"</literal>.
|
|
Packit Service |
ca3877 |
* @SOUP_TLD_ERROR_NO_BASE_DOMAIN: The passed-in hostname has
|
|
Packit Service |
ca3877 |
* no recognized public suffix.
|
|
Packit Service |
ca3877 |
*
|
|
Packit Service |
ca3877 |
* Error codes for %SOUP_TLD_ERROR.
|
|
Packit Service |
ca3877 |
*
|
|
Packit Service |
ca3877 |
* Since: 2.40
|
|
Packit Service |
ca3877 |
*/
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
GQuark
|
|
Packit Service |
ca3877 |
soup_tld_error_quark (void)
|
|
Packit Service |
ca3877 |
{
|
|
Packit Service |
ca3877 |
static GQuark error;
|
|
Packit Service |
ca3877 |
if (!error)
|
|
Packit Service |
ca3877 |
error = g_quark_from_static_string ("soup_tld_error_quark");
|
|
Packit Service |
ca3877 |
return error;
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
static const char *
|
|
Packit Service |
ca3877 |
soup_tld_get_base_domain_internal (const char *hostname, guint additional_domains, GError **error)
|
|
Packit Service |
ca3877 |
{
|
|
Packit Service |
ca3877 |
char *prev_domain, *cur_domain, *next_dot;
|
|
Packit Service |
ca3877 |
gint add_domains;
|
|
Packit Service |
ca3877 |
const char *orig_hostname = NULL, *tld;
|
|
Packit Service |
ca3877 |
char *utf8_hostname = NULL;
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
soup_tld_ensure_rules_hash_table ();
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
if (g_hostname_is_ip_address (hostname)) {
|
|
Packit Service |
ca3877 |
g_set_error_literal (error, SOUP_TLD_ERROR,
|
|
Packit Service |
ca3877 |
SOUP_TLD_ERROR_IS_IP_ADDRESS,
|
|
Packit Service |
ca3877 |
_("Hostname is an IP address"));
|
|
Packit Service |
ca3877 |
return NULL;
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
if (g_hostname_is_ascii_encoded (hostname)) {
|
|
Packit Service |
ca3877 |
orig_hostname = hostname;
|
|
Packit Service |
ca3877 |
hostname = utf8_hostname = g_hostname_to_unicode (hostname);
|
|
Packit Service |
ca3877 |
if (!hostname) {
|
|
Packit Service |
ca3877 |
g_set_error_literal (error, SOUP_TLD_ERROR,
|
|
Packit Service |
ca3877 |
SOUP_TLD_ERROR_INVALID_HOSTNAME,
|
|
Packit Service |
ca3877 |
_("Invalid hostname"));
|
|
Packit Service |
ca3877 |
return NULL;
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
cur_domain = (char *) hostname;
|
|
Packit Service |
ca3877 |
tld = cur_domain;
|
|
Packit Service |
ca3877 |
prev_domain = NULL;
|
|
Packit Service |
ca3877 |
/* Process matching rules from longest to shortest. Logic
|
|
Packit Service |
ca3877 |
* based on Mozilla's implementation of nsEffectiveTLDService.
|
|
Packit Service |
ca3877 |
*/
|
|
Packit Service |
ca3877 |
while (TRUE) {
|
|
Packit Service |
ca3877 |
char *orig_domain;
|
|
Packit Service |
ca3877 |
gboolean domain_found;
|
|
Packit Service |
ca3877 |
int *flags;
|
|
Packit Service |
ca3877 |
char *normalized_domain = NULL;
|
|
Packit Service |
ca3877 |
int domain_length;
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
/* Valid hostnames neither start with a dot nor have more than one
|
|
Packit Service |
ca3877 |
* dot together.
|
|
Packit Service |
ca3877 |
*/
|
|
Packit Service |
ca3877 |
if (*cur_domain == '.') {
|
|
Packit Service |
ca3877 |
g_set_error_literal (error, SOUP_TLD_ERROR,
|
|
Packit Service |
ca3877 |
SOUP_TLD_ERROR_INVALID_HOSTNAME,
|
|
Packit Service |
ca3877 |
_("Invalid hostname"));
|
|
Packit Service |
ca3877 |
g_free (utf8_hostname);
|
|
Packit Service |
ca3877 |
return NULL;
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
next_dot = strchr (cur_domain, '.');
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
/* Discard trailing dot if any before looking up. */
|
|
Packit Service |
ca3877 |
domain_length = strlen (cur_domain);
|
|
Packit Service |
ca3877 |
if (cur_domain[domain_length - 1] == '.')
|
|
Packit Service |
ca3877 |
normalized_domain = g_strndup (cur_domain, domain_length - 1);
|
|
Packit Service |
ca3877 |
domain_found = g_hash_table_lookup_extended (rules, normalized_domain ? normalized_domain : cur_domain, (gpointer *) &orig_domain, (gpointer *) &flags);
|
|
Packit Service |
ca3877 |
g_free (normalized_domain);
|
|
Packit Service |
ca3877 |
normalized_domain = NULL;
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
/* We compare the keys just to be sure that we haven't hit a collision */
|
|
Packit Service |
ca3877 |
if (domain_found && !strncmp (orig_domain, cur_domain, strlen (orig_domain))) {
|
|
Packit Service |
ca3877 |
if (*flags & SOUP_TLD_RULE_MATCH_ALL) {
|
|
Packit Service |
ca3877 |
/* If we match a *. rule and there were no previous exceptions
|
|
Packit Service |
ca3877 |
* nor previous domains then treat it as an exact match.
|
|
Packit Service |
ca3877 |
*/
|
|
Packit Service |
ca3877 |
tld = prev_domain ? prev_domain : cur_domain;
|
|
Packit Service |
ca3877 |
break;
|
|
Packit Service |
ca3877 |
} else if (*flags == SOUP_TLD_RULE_NORMAL) {
|
|
Packit Service |
ca3877 |
tld = cur_domain;
|
|
Packit Service |
ca3877 |
break;
|
|
Packit Service |
ca3877 |
} else if (*flags & SOUP_TLD_RULE_EXCEPTION) {
|
|
Packit Service |
ca3877 |
tld = next_dot + 1;
|
|
Packit Service |
ca3877 |
break;
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
/* If we hit the top and haven't matched yet, then it
|
|
Packit Service |
ca3877 |
* has no public suffix.
|
|
Packit Service |
ca3877 |
*/
|
|
Packit Service |
ca3877 |
if (!next_dot) {
|
|
Packit Service |
ca3877 |
g_set_error_literal (error, SOUP_TLD_ERROR,
|
|
Packit Service |
ca3877 |
SOUP_TLD_ERROR_NO_BASE_DOMAIN,
|
|
Packit Service |
ca3877 |
_("Hostname has no base domain"));
|
|
Packit Service |
ca3877 |
g_free (utf8_hostname);
|
|
Packit Service |
ca3877 |
return NULL;
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
prev_domain = cur_domain;
|
|
Packit Service |
ca3877 |
cur_domain = next_dot + 1;
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
if (orig_hostname) {
|
|
Packit Service |
ca3877 |
int dots;
|
|
Packit Service |
ca3877 |
const char *p;
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
/* Count the number of dots that appear after tld in
|
|
Packit Service |
ca3877 |
* utf8_hostname, and then find the corresponding spot
|
|
Packit Service |
ca3877 |
* in orig_hostname;
|
|
Packit Service |
ca3877 |
*/
|
|
Packit Service |
ca3877 |
for (p = tld, dots = 0; *p; p++) {
|
|
Packit Service |
ca3877 |
if (*p == '.')
|
|
Packit Service |
ca3877 |
dots++;
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
for (p = orig_hostname + strlen (orig_hostname); p > orig_hostname; p--) {
|
|
Packit Service |
ca3877 |
if (*(p - 1) == '.') {
|
|
Packit Service |
ca3877 |
if (dots)
|
|
Packit Service |
ca3877 |
dots--;
|
|
Packit Service |
ca3877 |
else
|
|
Packit Service |
ca3877 |
break;
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
/* It's not possible for utf8_hostname to have had
|
|
Packit Service |
ca3877 |
* more dots than orig_hostname.
|
|
Packit Service |
ca3877 |
*/
|
|
Packit Service |
ca3877 |
g_assert (dots == 0);
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
tld = p;
|
|
Packit Service |
ca3877 |
g_free (utf8_hostname);
|
|
Packit Service |
ca3877 |
hostname = orig_hostname;
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
/* Include the additional number of domains requested. */
|
|
Packit Service |
ca3877 |
add_domains = additional_domains;
|
|
Packit Service |
ca3877 |
while (tld != hostname) {
|
|
Packit Service |
ca3877 |
if (*(--tld) == '.' && (!(add_domains--))) {
|
|
Packit Service |
ca3877 |
++add_domains;
|
|
Packit Service |
ca3877 |
++tld;
|
|
Packit Service |
ca3877 |
break;
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
/* If additional_domains > 0 then we haven't found enough additional domains. */
|
|
Packit Service |
ca3877 |
if (add_domains) {
|
|
Packit Service |
ca3877 |
g_set_error_literal (error, SOUP_TLD_ERROR,
|
|
Packit Service |
ca3877 |
SOUP_TLD_ERROR_NOT_ENOUGH_DOMAINS,
|
|
Packit Service |
ca3877 |
_("Not enough domains"));
|
|
Packit Service |
ca3877 |
return NULL;
|
|
Packit Service |
ca3877 |
}
|
|
Packit Service |
ca3877 |
|
|
Packit Service |
ca3877 |
return tld;
|
|
Packit Service |
ca3877 |
}
|