/* Copyright (C) 1995 Bjoern Beutel. */
/* Description. =============================================================*/
/* This module contains function to compile and execute pattern matching
* strings (regular expressions). */
/* Includes. ================================================================*/
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <setjmp.h>
#include <glib.h>
#include "basic.h"
#include "patterns.h"
/* Constants. ===============================================================*/
#define SPECIAL_CHARS ".[]-^()*?+|\\" /* Characters with a special meaning. */
/* These are the instructions for matching a pattern.
*
* A pattern is a 0-terminated sequence of CHARs, defined as follows:
* - P[] is the coded pattern string.
* - PI is the current index into P[].
* - PS[] is the stack of indexes into P[], used for backtracking.
* - T[] is the text to be examined.
* - TI is the current index into T[].
* - TS[] is the stack of indexes into T[], used for backtracking.
* - SI is the stack index, used for PIS[] and TIS[].
* - VB[I] and VE[I] store beginning and end, resp., for variable no. I. */
enum
{
/* code 0 is EOS */
PAT_JUMP = 1, /* PI += (byte_t) P[PI+1]; */
PAT_JUMP_NOW, /* SI++; PS[SI] = PI+2; TS[SI] = TI; PI += (byte_t) P[PI+1]; */
PAT_JUMP_LATER, /* SP++; PS[SI] = PI + (byte_t) P[PI]; TS[SI] = I; PI++; */
PAT_MATCH_ANY, /* if T[TI] != EOS then TI++; else fail; */
PAT_MATCH_CLASS, /* if (T[TI] in {P[ PI + 1 ],..,P[ PI + P[PI] ]})
* { TI++; PI += P[PI]+1; } else fail; */
PAT_MATCH_NOT_CLASS, /* if (T[TI] in {P[ PI + 1 ],..,P[ PI + P[PI] ]})
* fail; else { TI++; PI += P[PI]+1; } */
PAT_START_VAR_0, /* VB[0] = I; PI++; */
PAT_START_VAR_1, /* VB[1] = I; PI++; */
PAT_START_VAR_2, /* VB[2] = I; PI++; */
PAT_START_VAR_3, /* VB[3] = I; PI++; */
PAT_START_VAR_4, /* VB[4] = I; PI++; */
PAT_END_VAR_0, /* VE[0] = I; PI++; */
PAT_END_VAR_1, /* VE[1] = I; PI++; */
PAT_END_VAR_2, /* VE[2] = I; PI++; */
PAT_END_VAR_3, /* VE[3] = I; PI++; */
PAT_END_VAR_4, /* VE[4] = I; PI++; */
PAT_COUNT
};
/* Types. ===================================================================*/
typedef struct {string_t string, pattern;} pattern_state_t;
/* Global variables. ========================================================*/
string_t pattern_var[ PATTERN_VAR_MAX ]; /* Pattern variables. */
/* Variables. ===============================================================*/
static pattern_state_t *stack; /* Stack used for backtracking. */
static int_t stack_size;
/* Forwards. ================================================================*/
static void compile_pattern_local( text_t *text,
string_t *string_p,
bool_t *may_be_empty );
/* Functions. ===============================================================*/
static bool_t
is_pattern_char( string_t s )
{
/* The following expression is true for each non-ASCII character. */
return ((*s == '\\' && s[1] != EOS && strchr( SPECIAL_CHARS, s[1] ) != NULL)
|| (ORD(*s) >= PAT_COUNT && strchr( SPECIAL_CHARS, *s ) == NULL));
}
/*---------------------------------------------------------------------------*/
static gunichar
pattern_char( string_t *string_p )
/* See if *STRING_P points to a valid char or to an escape sequence.
* Return the character code. Show an error if not valid. */
{
string_t s;
gunichar c;
s = *string_p;
if (s[0] == '\\' && s[1] != EOS && strchr( SPECIAL_CHARS, s[1] ) != NULL)
{
c = s[1];
s += 2;
}
else if (ORD(*s) >= PAT_COUNT && strchr( SPECIAL_CHARS, s[0] ) == NULL)
{
c = g_unichar_tolower( g_utf8_get_char( s ) );
s = g_utf8_next_char( s );
}
else if (s[0] == EOS)
complain( "Unexpected end of pattern." );
else
complain( "Invalid char \"%u\" in pattern.", g_utf8_get_char(s) );
*string_p = s;
return c;
}
/*---------------------------------------------------------------------------*/
static char_t
offset( int_t offs )
/* Return OFFS as a char. */
{
/* See if OFFS can be stored in a char. */
if ((byte_t) offs != offs || offs == 0)
complain( "Pattern too complex." );
return (byte_t) offs;
}
/*---------------------------------------------------------------------------*/
static void
compile_char_class( text_t *text, string_t *string_p )
/* Compile a character class at *STRING_P.
* Save the resulting pattern in TEXT. */
{
string_t s;
gunichar ca, ce, c; /* First char, last char and current char for ranges. */
int_t patch_index;
s = *string_p;
s++;
if (*s == '^')
{
s++;
add_char_to_text( text, PAT_MATCH_NOT_CLASS );
}
else
add_char_to_text( text, PAT_MATCH_CLASS );
patch_index = text->string_size;
/* Read chars and ranges. */
do
{
if (*s == EOS)
complain( "Missing \"]\" in pattern." );
ca = pattern_char( &s );
if (*s == '-')
{
s++;
ce = pattern_char( &s );
if (ca > ce)
complain( "Invalid range \"%u-%u\" in pattern.", ca, ce );
}
else
ce = ca;
for (c = ca; c <= ce; c++)
add_unichar_to_text( text, c );
} while (*s != ']');
*string_p = ++s;
insert_char_in_text( text, offset( text->string_size - patch_index ),
patch_index );
}
/*---------------------------------------------------------------------------*/
static bool_t
check_greedy( string_t *string_p )
{
if (**string_p == '?')
{
(*string_p)++;
return FALSE;
}
else
return TRUE;
}
/*---------------------------------------------------------------------------*/
static void
compile_atom( text_t *text, string_t *string_p, bool_t *may_be_empty )
/* Compile an atom at *STRING_P.
* Save the resulting pattern in TEXT.
* MAY_BE_EMPTY becomes TRUE iff the atom may match an empty string. */
{
string_t s;
int_t start, length;
bool_t may_be_empty_local, greedy;
s = *string_p;
*may_be_empty = TRUE;
while (TRUE)
{
may_be_empty_local = FALSE;
start = text->string_size;
if (*s == '[')
compile_char_class( text, &s );
else if (*s == '.')
{
s++;
add_char_to_text( text, PAT_MATCH_ANY );
}
else if (*s == '(')
{
s++;
compile_pattern_local( text, &s, &may_be_empty_local );
if (*s++ != ')')
complain( "Missing \")\" in pattern." );
}
else if (is_pattern_char( s ))
add_unichar_to_text( text, pattern_char( &s ) );
else
break;
length = text->string_size - start;
/* There may be a postfix operator. */
if (*s == '?')
{
s++;
greedy = check_greedy( &s );
if (greedy)
insert_char_in_text( text, PAT_JUMP_LATER, start );
else
insert_char_in_text( text, PAT_JUMP_NOW, start );
insert_char_in_text( text, offset( 2 + length ), start + 1 );
}
else if (*s == '*')
{
s++;
if (may_be_empty_local)
complain( "Pattern argument for \"*\" may match empty string." );
greedy = check_greedy( &s );
insert_char_in_text( text, PAT_JUMP, start );
insert_char_in_text( text, offset( 2 + length ), start + 1 );
if (greedy)
add_char_to_text( text, PAT_JUMP_NOW );
else
add_char_to_text( text, PAT_JUMP_LATER );
add_char_to_text( text, offset( -length ) );
}
else if (*s == '+')
{
s++;
if (may_be_empty_local)
complain( "Pattern argument for \"+\" may match empty string.");
greedy = check_greedy( &s );
if (greedy)
add_char_to_text( text, PAT_JUMP_NOW );
else
add_char_to_text( text, PAT_JUMP_LATER );
add_char_to_text( text, offset( -length ) );
*may_be_empty = FALSE;
}
else
*may_be_empty &= may_be_empty_local;
}
*string_p = s;
}
/*---------------------------------------------------------------------------*/
static void
compile_pattern_local (text_t *text, string_t *string_p, bool_t *may_be_empty)
/* Convert STRING_P to a pattern to be used as input to "match_pattern".
* If PATTERN_VAR_NO != -1, mark the pattern so the string matching this
* pattern will be stored in PATTERN_VAR[ PATTERN_VAR_NO ].
* The result pattern must be freed after usage. */
{
string_t s;
int_t start, length;
bool_t may_be_empty_local;
s = *string_p;
start = text->string_size;
compile_atom( text, &s, may_be_empty );
length = text->string_size - start;
while (*s == '|')
{
s++;
/* Add jump from start of last alternative to start of this alternative. */
insert_char_in_text( text, PAT_JUMP_LATER, start++ );
insert_char_in_text( text, offset( length + 4 ), start++ );
/* Compile this alternative. */
start = text->string_size;
compile_atom( text, &s, &may_be_empty_local );
length = text->string_size - start;
*may_be_empty |= may_be_empty_local;
/* Add jump from end of last alternative to end of this alternative. */
insert_char_in_text( text, PAT_JUMP, start++ );
if (*s == '|')
insert_char_in_text( text, offset( length + 4 ), start++ );
else
insert_char_in_text( text, offset( length + 2 ), start++ );
}
*string_p = s;
}
/*---------------------------------------------------------------------------*/
string_t
compile_pattern( string_t string, int_t pattern_var_no )
/* Convert STRING to a pattern to be used as input to "match_pattern".
* If PATTERN_VAR_NO != -1, mark the pattern so the string matching this
* pattern will be stored in PATTERN_VAR[ PATTERN_VAR_NO ].
* The result pattern must be freed after usage. */
{
text_t *text;
bool_t may_be_empty;
text = new_text();
if (pattern_var_no != -1)
add_char_to_text( text, PAT_START_VAR_0 + pattern_var_no );
compile_pattern_local( text, &string, &may_be_empty );
if (*string != EOS)
complain( "Illegal char \"%u\" in pattern.", g_utf8_get_char( string ) );
if (pattern_var_no != -1)
add_char_to_text( text, PAT_END_VAR_0 + pattern_var_no );
return text_to_string( &text );
}
/*---------------------------------------------------------------------------*/
bool_t
match_pattern( string_t string, string_t pattern )
/* Test whether STRING matches PATTERN (a string of chars compiled with
* "compile_pattern") and set substrings in PATTERN_VAR.
* The substrings can be freed after usage. */
{
struct {string_t start; string_t end;} var[ PATTERN_VAR_MAX ];
int_t sp, i;
bool_t found_mismatch;
string_t index;
gunichar c;
sp = 0;
found_mismatch = FALSE;
/* Clear all variables. */
for (i = 0; i < PATTERN_VAR_MAX; i++)
var[i].start = var[i].end = NULL;
while (! found_mismatch)
{
switch (*pattern)
{
case EOS:
if (*string == EOS)
{
for (i = 0; i < PATTERN_VAR_MAX; i++)
{
if (var[i].start != NULL && var[i].end != NULL)
{
free_mem( &pattern_var[i] );
pattern_var[i] = new_string( var[i].start, var[i].end );
}
}
return TRUE;
}
else
found_mismatch = TRUE;
break;
case PAT_JUMP:
pattern += (byte_t) pattern[1];
break;
case PAT_JUMP_NOW:
if (sp == stack_size)
{
stack_size = renew_vector( &stack, sizeof( pattern_state_t ),
stack_size + 100 );
}
stack[ sp ].string = string;
stack[ sp ].pattern = pattern + 2;
sp++;
pattern += (byte_t) pattern[1];
break;
case PAT_JUMP_LATER:
if (sp == stack_size)
{
stack_size = renew_vector( &stack, sizeof( pattern_state_t ),
stack_size + 100 );
}
stack[ sp ].string = string;
stack[ sp ].pattern = pattern + (byte_t) pattern[1];
sp++;
pattern += 2;
break;
case PAT_MATCH_ANY:
if (*string == EOS)
found_mismatch = TRUE;
else
{
pattern++;
string = g_utf8_next_char( string );
}
break;
case PAT_MATCH_CLASS:
if (*string == EOS)
found_mismatch = TRUE;
else
{
index = pattern + 2;
c = g_unichar_tolower( g_utf8_get_char( string ) );
string = g_utf8_next_char( string );
pattern += (byte_t) pattern[1] + 2;
while (index < pattern && c != g_utf8_get_char( index ))
index = g_utf8_next_char( index );
if (index >= pattern)
found_mismatch = TRUE;
}
break;
case PAT_MATCH_NOT_CLASS:
if (*string == EOS)
found_mismatch = TRUE;
else
{
index = pattern + 2;
c = g_unichar_tolower( g_utf8_get_char( string ) );
string = g_utf8_next_char( string );
pattern += (byte_t) pattern[1] + 2;
while (index < pattern && c != g_utf8_get_char( index ))
index = g_utf8_next_char( index );
if (index < pattern)
found_mismatch = TRUE;
}
break;
case PAT_START_VAR_0:
case PAT_START_VAR_1:
case PAT_START_VAR_2:
case PAT_START_VAR_3:
case PAT_START_VAR_4:
var[ *pattern++ - PAT_START_VAR_0 ].start = string;
break;
case PAT_END_VAR_0:
case PAT_END_VAR_1:
case PAT_END_VAR_2:
case PAT_END_VAR_3:
case PAT_END_VAR_4:
var[ *pattern++ - PAT_END_VAR_0 ].end = string;
break;
default:
if (*string == EOS)
found_mismatch = TRUE;
else
{
c = g_unichar_tolower( g_utf8_get_char( string ) );
string = g_utf8_next_char( string );
if (c != g_utf8_get_char( pattern ))
found_mismatch = TRUE;
else
pattern = g_utf8_next_char( pattern );
}
break;
}
/* If this path was not successful and there is another path, try it. */
if (found_mismatch && sp > 0)
{
sp--;
string = stack[ sp ].string;
pattern = stack[ sp ].pattern;
found_mismatch = FALSE;
}
}
return FALSE;
}
/*---------------------------------------------------------------------------*/
void
terminate_patterns( void )
/* Terminate this module. */
{
int_t i;
for (i = 0; i < PATTERN_VAR_MAX; i++)
free_mem( &pattern_var[i] );
free_mem( &stack );
stack_size = 0;
}
/* End of file. =============================================================*/