Blame glib/gpattern.c

Packit ae235b
/* GLIB - Library of useful routines for C programming
Packit ae235b
 * Copyright (C) 1995-1997, 1999  Peter Mattis, Red Hat, Inc.
Packit ae235b
 *
Packit ae235b
 * This library is free software; you can redistribute it and/or
Packit ae235b
 * modify it under the terms of the GNU Lesser General Public
Packit ae235b
 * License as published by the Free Software Foundation; either
Packit ae235b
 * version 2.1 of the License, or (at your option) any later version.
Packit ae235b
 *
Packit ae235b
 * This library is distributed in the hope that it will be useful,
Packit ae235b
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Packit ae235b
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Packit ae235b
 * Lesser General Public License for more details.
Packit ae235b
 *
Packit ae235b
 * You should have received a copy of the GNU Lesser General Public
Packit ae235b
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
Packit ae235b
 */
Packit ae235b
Packit ae235b
#include "config.h"
Packit ae235b
Packit ae235b
#include <string.h>
Packit ae235b
Packit ae235b
#include "gpattern.h"
Packit ae235b
Packit ae235b
#include "gmacros.h"
Packit ae235b
#include "gmessages.h"
Packit ae235b
#include "gmem.h"
Packit ae235b
#include "gunicode.h"
Packit ae235b
#include "gutils.h" 
Packit ae235b
Packit ae235b
/**
 * SECTION:patterns
 * @title: Glob-style pattern matching
 * @short_description: matches strings against patterns containing '*'
 *                     (wildcard) and '?' (joker)
 *
 * The g_pattern_match* functions match a string against a pattern
 * containing '*' and '?' wildcards, with semantics similar to those of
 * the standard glob() function: '*' matches an arbitrary, possibly
 * empty, string and '?' matches any single character.
 *
 * Note that, in contrast to glob(), the '/' character can be matched by
 * the wildcards, there are no '[...]' character ranges, and '*' and '?'
 * cannot be escaped to include them literally in a pattern.
 *
 * When multiple strings must be matched against the same pattern, it
 * is better to compile the pattern to a #GPatternSpec using
 * g_pattern_spec_new() and use g_pattern_match_string() instead of
 * g_pattern_match_simple(). This avoids the overhead of repeated
 * pattern compilation.
 **/
Packit ae235b
Packit ae235b
/**
Packit ae235b
 * GPatternSpec:
Packit ae235b
 *
Packit ae235b
 * A GPatternSpec struct is the 'compiled' form of a pattern. This
Packit ae235b
 * structure is opaque and its fields cannot be accessed directly.
Packit ae235b
 */
Packit ae235b
Packit ae235b
/* keep enum and structure of gpattern.c and patterntest.c in sync */
/*
 * GMatchType:
 *
 * Matching strategy chosen for a compiled pattern by g_pattern_spec_new()
 * and dispatched on by g_pattern_match(). The example next to each value
 * shows the shape of pattern that selects it. The numeric values follow
 * declaration order, starting at 0.
 */
typedef enum
{
  G_MATCH_ALL,       /* "*A?A*"  general match, pattern scanned front to back */
  G_MATCH_ALL_TAIL,  /* "*A?AA"  general match, done on the reversed pattern and string */
  G_MATCH_HEAD,      /* "AAAA*"  literal prefix followed by a single wildcard */
  G_MATCH_TAIL,      /* "*AAAA"  single wildcard followed by a literal suffix */
  G_MATCH_EXACT,     /* "AAAAA"  no wildcard and no joker: plain string comparison */
  G_MATCH_LAST       /* number of match types; not a valid strategy itself */
} GMatchType;
Packit ae235b
Packit ae235b
struct _GPatternSpec
Packit ae235b
{
Packit ae235b
  GMatchType match_type;
Packit ae235b
  guint      pattern_length;
Packit ae235b
  guint      min_length;
Packit ae235b
  guint      max_length;
Packit ae235b
  gchar     *pattern;
Packit ae235b
};
Packit ae235b
Packit ae235b
Packit ae235b
/* --- functions --- */
Packit ae235b
static inline gboolean
Packit ae235b
g_pattern_ph_match (const gchar *match_pattern,
Packit ae235b
		    const gchar *match_string,
Packit ae235b
		    gboolean    *wildcard_reached_p)
Packit ae235b
{
Packit ae235b
  const gchar *pattern, *string;
Packit ae235b
  gchar ch;
Packit ae235b
Packit ae235b
  pattern = match_pattern;
Packit ae235b
  string = match_string;
Packit ae235b
Packit ae235b
  ch = *pattern;
Packit ae235b
  pattern++;
Packit ae235b
  while (ch)
Packit ae235b
    {
Packit ae235b
      switch (ch)
Packit ae235b
	{
Packit ae235b
	case '?':
Packit ae235b
	  if (!*string)
Packit ae235b
	    return FALSE;
Packit ae235b
	  string = g_utf8_next_char (string);
Packit ae235b
	  break;
Packit ae235b
Packit ae235b
	case '*':
Packit ae235b
	  *wildcard_reached_p = TRUE;
Packit ae235b
	  do
Packit ae235b
	    {
Packit ae235b
	      ch = *pattern;
Packit ae235b
	      pattern++;
Packit ae235b
	      if (ch == '?')
Packit ae235b
		{
Packit ae235b
		  if (!*string)
Packit ae235b
		    return FALSE;
Packit ae235b
		  string = g_utf8_next_char (string);
Packit ae235b
		}
Packit ae235b
	    }
Packit ae235b
	  while (ch == '*' || ch == '?');
Packit ae235b
	  if (!ch)
Packit ae235b
	    return TRUE;
Packit ae235b
	  do
Packit ae235b
	    {
Packit ae235b
              gboolean next_wildcard_reached = FALSE;
Packit ae235b
	      while (ch != *string)
Packit ae235b
		{
Packit ae235b
		  if (!*string)
Packit ae235b
		    return FALSE;
Packit ae235b
		  string = g_utf8_next_char (string);
Packit ae235b
		}
Packit ae235b
	      string++;
Packit ae235b
	      if (g_pattern_ph_match (pattern, string, &next_wildcard_reached))
Packit ae235b
		return TRUE;
Packit ae235b
              if (next_wildcard_reached)
Packit ae235b
                /* the forthcoming pattern substring up to the next wildcard has
Packit ae235b
                 * been matched, but a mismatch occoured for the rest of the
Packit ae235b
                 * pattern, following the next wildcard.
Packit ae235b
                 * there's no need to advance the current match position any
Packit ae235b
                 * further if the rest pattern will not match.
Packit ae235b
                 */
Packit ae235b
		return FALSE;
Packit ae235b
	    }
Packit ae235b
	  while (*string);
Packit ae235b
	  break;
Packit ae235b
Packit ae235b
	default:
Packit ae235b
	  if (ch == *string)
Packit ae235b
	    string++;
Packit ae235b
	  else
Packit ae235b
	    return FALSE;
Packit ae235b
	  break;
Packit ae235b
	}
Packit ae235b
Packit ae235b
      ch = *pattern;
Packit ae235b
      pattern++;
Packit ae235b
    }
Packit ae235b
Packit ae235b
  return *string == 0;
Packit ae235b
}
Packit ae235b
Packit ae235b
/**
Packit ae235b
 * g_pattern_match:
Packit ae235b
 * @pspec: a #GPatternSpec
Packit ae235b
 * @string_length: the length of @string (in bytes, i.e. strlen(),
Packit ae235b
 *     not g_utf8_strlen())
Packit ae235b
 * @string: the UTF-8 encoded string to match
Packit ae235b
 * @string_reversed: (nullable): the reverse of @string or %NULL
Packit ae235b
 *
Packit ae235b
 * Matches a string against a compiled pattern. Passing the correct
Packit ae235b
 * length of the string given is mandatory. The reversed string can be
Packit ae235b
 * omitted by passing %NULL, this is more efficient if the reversed
Packit ae235b
 * version of the string to be matched is not at hand, as
Packit ae235b
 * g_pattern_match() will only construct it if the compiled pattern
Packit ae235b
 * requires reverse matches.
Packit ae235b
 *
Packit ae235b
 * Note that, if the user code will (possibly) match a string against a
Packit ae235b
 * multitude of patterns containing wildcards, chances are high that
Packit ae235b
 * some patterns will require a reversed string. In this case, it's
Packit ae235b
 * more efficient to provide the reversed string to avoid multiple
Packit ae235b
 * constructions thereof in the various calls to g_pattern_match().
Packit ae235b
 *
Packit ae235b
 * Note also that the reverse of a UTF-8 encoded string can in general
Packit ae235b
 * not be obtained by g_strreverse(). This works only if the string
Packit ae235b
 * does not contain any multibyte characters. GLib offers the
Packit ae235b
 * g_utf8_strreverse() function to reverse UTF-8 encoded strings.
Packit ae235b
 *
Packit ae235b
 * Returns: %TRUE if @string matches @pspec
Packit ae235b
 **/
Packit ae235b
gboolean
Packit ae235b
g_pattern_match (GPatternSpec *pspec,
Packit ae235b
		 guint         string_length,
Packit ae235b
		 const gchar  *string,
Packit ae235b
		 const gchar  *string_reversed)
Packit ae235b
{
Packit ae235b
  g_return_val_if_fail (pspec != NULL, FALSE);
Packit ae235b
  g_return_val_if_fail (string != NULL, FALSE);
Packit ae235b
Packit ae235b
  if (string_length < pspec->min_length ||
Packit ae235b
      string_length > pspec->max_length)
Packit ae235b
    return FALSE;
Packit ae235b
Packit ae235b
  switch (pspec->match_type)
Packit ae235b
    {
Packit ae235b
      gboolean dummy;
Packit ae235b
    case G_MATCH_ALL:
Packit ae235b
      return g_pattern_ph_match (pspec->pattern, string, &dummy);
Packit ae235b
    case G_MATCH_ALL_TAIL:
Packit ae235b
      if (string_reversed)
Packit ae235b
	return g_pattern_ph_match (pspec->pattern, string_reversed, &dummy);
Packit ae235b
      else
Packit ae235b
	{
Packit ae235b
          gboolean result;
Packit ae235b
          gchar *tmp;
Packit ae235b
	  tmp = g_utf8_strreverse (string, string_length);
Packit ae235b
	  result = g_pattern_ph_match (pspec->pattern, tmp, &dummy);
Packit ae235b
	  g_free (tmp);
Packit ae235b
	  return result;
Packit ae235b
	}
Packit ae235b
    case G_MATCH_HEAD:
Packit ae235b
      if (pspec->pattern_length == string_length)
Packit ae235b
	return strcmp (pspec->pattern, string) == 0;
Packit ae235b
      else if (pspec->pattern_length)
Packit ae235b
	return strncmp (pspec->pattern, string, pspec->pattern_length) == 0;
Packit ae235b
      else
Packit ae235b
	return TRUE;
Packit ae235b
    case G_MATCH_TAIL:
Packit ae235b
      if (pspec->pattern_length)
Packit ae235b
        return strcmp (pspec->pattern, string + (string_length - pspec->pattern_length)) == 0;
Packit ae235b
      else
Packit ae235b
	return TRUE;
Packit ae235b
    case G_MATCH_EXACT:
Packit ae235b
      if (pspec->pattern_length != string_length)
Packit ae235b
        return FALSE;
Packit ae235b
      else
Packit ae235b
        return strcmp (pspec->pattern, string) == 0;
Packit ae235b
    default:
Packit ae235b
      g_return_val_if_fail (pspec->match_type < G_MATCH_LAST, FALSE);
Packit ae235b
      return FALSE;
Packit ae235b
    }
Packit ae235b
}
Packit ae235b
Packit ae235b
/**
Packit ae235b
 * g_pattern_spec_new:
Packit ae235b
 * @pattern: a zero-terminated UTF-8 encoded string
Packit ae235b
 *
Packit ae235b
 * Compiles a pattern to a #GPatternSpec.
Packit ae235b
 *
Packit ae235b
 * Returns: a newly-allocated #GPatternSpec
Packit ae235b
 **/
Packit ae235b
GPatternSpec*
Packit ae235b
g_pattern_spec_new (const gchar *pattern)
Packit ae235b
{
Packit ae235b
  GPatternSpec *pspec;
Packit ae235b
  gboolean seen_joker = FALSE, seen_wildcard = FALSE, more_wildcards = FALSE;
Packit ae235b
  gint hw_pos = -1, tw_pos = -1, hj_pos = -1, tj_pos = -1;
Packit ae235b
  gboolean follows_wildcard = FALSE;
Packit ae235b
  guint pending_jokers = 0;
Packit ae235b
  const gchar *s;
Packit ae235b
  gchar *d;
Packit ae235b
  guint i;
Packit ae235b
  
Packit ae235b
  g_return_val_if_fail (pattern != NULL, NULL);
Packit ae235b
Packit ae235b
  /* canonicalize pattern and collect necessary stats */
Packit ae235b
  pspec = g_new (GPatternSpec, 1);
Packit ae235b
  pspec->pattern_length = strlen (pattern);
Packit ae235b
  pspec->min_length = 0;
Packit ae235b
  pspec->max_length = 0;
Packit ae235b
  pspec->pattern = g_new (gchar, pspec->pattern_length + 1);
Packit ae235b
  d = pspec->pattern;
Packit ae235b
  for (i = 0, s = pattern; *s != 0; s++)
Packit ae235b
    {
Packit ae235b
      switch (*s)
Packit ae235b
	{
Packit ae235b
	case '*':
Packit ae235b
	  if (follows_wildcard)	/* compress multiple wildcards */
Packit ae235b
	    {
Packit ae235b
	      pspec->pattern_length--;
Packit ae235b
	      continue;
Packit ae235b
	    }
Packit ae235b
	  follows_wildcard = TRUE;
Packit ae235b
	  if (hw_pos < 0)
Packit ae235b
	    hw_pos = i;
Packit ae235b
	  tw_pos = i;
Packit ae235b
	  break;
Packit ae235b
	case '?':
Packit ae235b
	  pending_jokers++;
Packit ae235b
	  pspec->min_length++;
Packit ae235b
	  pspec->max_length += 4; /* maximum UTF-8 character length */
Packit ae235b
	  continue;
Packit ae235b
	default:
Packit ae235b
	  for (; pending_jokers; pending_jokers--, i++) {
Packit ae235b
	    *d++ = '?';
Packit ae235b
  	    if (hj_pos < 0)
Packit ae235b
	     hj_pos = i;
Packit ae235b
	    tj_pos = i;
Packit ae235b
	  }
Packit ae235b
	  follows_wildcard = FALSE;
Packit ae235b
	  pspec->min_length++;
Packit ae235b
	  pspec->max_length++;
Packit ae235b
	  break;
Packit ae235b
	}
Packit ae235b
      *d++ = *s;
Packit ae235b
      i++;
Packit ae235b
    }
Packit ae235b
  for (; pending_jokers; pending_jokers--) {
Packit ae235b
    *d++ = '?';
Packit ae235b
    if (hj_pos < 0)
Packit ae235b
      hj_pos = i;
Packit ae235b
    tj_pos = i;
Packit ae235b
  }
Packit ae235b
  *d++ = 0;
Packit ae235b
  seen_joker = hj_pos >= 0;
Packit ae235b
  seen_wildcard = hw_pos >= 0;
Packit ae235b
  more_wildcards = seen_wildcard && hw_pos != tw_pos;
Packit ae235b
  if (seen_wildcard)
Packit ae235b
    pspec->max_length = G_MAXUINT;
Packit ae235b
Packit ae235b
  /* special case sole head/tail wildcard or exact matches */
Packit ae235b
  if (!seen_joker && !more_wildcards)
Packit ae235b
    {
Packit ae235b
      if (pspec->pattern[0] == '*')
Packit ae235b
	{
Packit ae235b
	  pspec->match_type = G_MATCH_TAIL;
Packit ae235b
          memmove (pspec->pattern, pspec->pattern + 1, --pspec->pattern_length);
Packit ae235b
	  pspec->pattern[pspec->pattern_length] = 0;
Packit ae235b
	  return pspec;
Packit ae235b
	}
Packit ae235b
      if (pspec->pattern_length > 0 &&
Packit ae235b
	  pspec->pattern[pspec->pattern_length - 1] == '*')
Packit ae235b
	{
Packit ae235b
	  pspec->match_type = G_MATCH_HEAD;
Packit ae235b
	  pspec->pattern[--pspec->pattern_length] = 0;
Packit ae235b
	  return pspec;
Packit ae235b
	}
Packit ae235b
      if (!seen_wildcard)
Packit ae235b
	{
Packit ae235b
	  pspec->match_type = G_MATCH_EXACT;
Packit ae235b
	  return pspec;
Packit ae235b
	}
Packit ae235b
    }
Packit ae235b
Packit ae235b
  /* now just need to distinguish between head or tail match start */
Packit ae235b
  tw_pos = pspec->pattern_length - 1 - tw_pos;	/* last pos to tail distance */
Packit ae235b
  tj_pos = pspec->pattern_length - 1 - tj_pos;	/* last pos to tail distance */
Packit ae235b
  if (seen_wildcard)
Packit ae235b
    pspec->match_type = tw_pos > hw_pos ? G_MATCH_ALL_TAIL : G_MATCH_ALL;
Packit ae235b
  else /* seen_joker */
Packit ae235b
    pspec->match_type = tj_pos > hj_pos ? G_MATCH_ALL_TAIL : G_MATCH_ALL;
Packit ae235b
  if (pspec->match_type == G_MATCH_ALL_TAIL) {
Packit ae235b
    gchar *tmp = pspec->pattern;
Packit ae235b
    pspec->pattern = g_utf8_strreverse (pspec->pattern, pspec->pattern_length);
Packit ae235b
    g_free (tmp);
Packit ae235b
  }
Packit ae235b
  return pspec;
Packit ae235b
}
Packit ae235b
Packit ae235b
/**
Packit ae235b
 * g_pattern_spec_free:
Packit ae235b
 * @pspec: a #GPatternSpec
Packit ae235b
 *
Packit ae235b
 * Frees the memory allocated for the #GPatternSpec.
Packit ae235b
 **/
Packit ae235b
void
Packit ae235b
g_pattern_spec_free (GPatternSpec *pspec)
Packit ae235b
{
Packit ae235b
  g_return_if_fail (pspec != NULL);
Packit ae235b
Packit ae235b
  g_free (pspec->pattern);
Packit ae235b
  g_free (pspec);
Packit ae235b
}
Packit ae235b
Packit ae235b
/**
Packit ae235b
 * g_pattern_spec_equal:
Packit ae235b
 * @pspec1: a #GPatternSpec
Packit ae235b
 * @pspec2: another #GPatternSpec
Packit ae235b
 *
Packit ae235b
 * Compares two compiled pattern specs and returns whether they will
Packit ae235b
 * match the same set of strings.
Packit ae235b
 *
Packit ae235b
 * Returns: Whether the compiled patterns are equal
Packit ae235b
 **/
Packit ae235b
gboolean
Packit ae235b
g_pattern_spec_equal (GPatternSpec *pspec1,
Packit ae235b
		      GPatternSpec *pspec2)
Packit ae235b
{
Packit ae235b
  g_return_val_if_fail (pspec1 != NULL, FALSE);
Packit ae235b
  g_return_val_if_fail (pspec2 != NULL, FALSE);
Packit ae235b
Packit ae235b
  return (pspec1->pattern_length == pspec2->pattern_length &&
Packit ae235b
	  pspec1->match_type == pspec2->match_type &&
Packit ae235b
	  strcmp (pspec1->pattern, pspec2->pattern) == 0);
Packit ae235b
}
Packit ae235b
Packit ae235b
/**
Packit ae235b
 * g_pattern_match_string:
Packit ae235b
 * @pspec: a #GPatternSpec
Packit ae235b
 * @string: the UTF-8 encoded string to match
Packit ae235b
 *
Packit ae235b
 * Matches a string against a compiled pattern. If the string is to be
Packit ae235b
 * matched against more than one pattern, consider using
Packit ae235b
 * g_pattern_match() instead while supplying the reversed string.
Packit ae235b
 *
Packit ae235b
 * Returns: %TRUE if @string matches @pspec
Packit ae235b
 **/
Packit ae235b
gboolean
Packit ae235b
g_pattern_match_string (GPatternSpec *pspec,
Packit ae235b
			const gchar  *string)
Packit ae235b
{
Packit ae235b
  g_return_val_if_fail (pspec != NULL, FALSE);
Packit ae235b
  g_return_val_if_fail (string != NULL, FALSE);
Packit ae235b
Packit ae235b
  return g_pattern_match (pspec, strlen (string), string, NULL);
Packit ae235b
}
Packit ae235b
Packit ae235b
/**
Packit ae235b
 * g_pattern_match_simple:
Packit ae235b
 * @pattern: the UTF-8 encoded pattern
Packit ae235b
 * @string: the UTF-8 encoded string to match
Packit ae235b
 *
Packit ae235b
 * Matches a string against a pattern given as a string. If this
Packit ae235b
 * function is to be called in a loop, it's more efficient to compile
Packit ae235b
 * the pattern once with g_pattern_spec_new() and call
Packit ae235b
 * g_pattern_match_string() repeatedly.
Packit ae235b
 *
Packit ae235b
 * Returns: %TRUE if @string matches @pspec
Packit ae235b
 **/
Packit ae235b
gboolean
Packit ae235b
g_pattern_match_simple (const gchar *pattern,
Packit ae235b
			const gchar *string)
Packit ae235b
{
Packit ae235b
  GPatternSpec *pspec;
Packit ae235b
  gboolean ergo;
Packit ae235b
Packit ae235b
  g_return_val_if_fail (pattern != NULL, FALSE);
Packit ae235b
  g_return_val_if_fail (string != NULL, FALSE);
Packit ae235b
Packit ae235b
  pspec = g_pattern_spec_new (pattern);
Packit ae235b
  ergo = g_pattern_match (pspec, strlen (string), string, NULL);
Packit ae235b
  g_pattern_spec_free (pspec);
Packit ae235b
Packit ae235b
  return ergo;
Packit ae235b
}