/* Copyright (C) 1995 Bjoern Beutel. */

/* Description. =============================================================*/

/* This module supports scanning (lexical analysis) of malaga source files. */

/* Includes. ================================================================*/

#include <string.h>
#include <stdio.h>
#include <errno.h>
#include <setjmp.h>
#include <glib.h>
#include "basic.h"
#include "files.h"
#include "scanner.h"

/* Constants. ===============================================================*/

/* List of all keywords and their token codes.
 * "keyword_code" looks names up in this table with a binary search that
 * compares via "strcmp_no_case", so the entries MUST stay sorted in
 * alphabetical order. "token_as_text" scans the table linearly to map a
 * token code back to its keyword. The table has exactly NUMBER_OF_KEYWORDS
 * entries. */
static struct { string_t name; int_t code; } keywords[ NUMBER_OF_KEYWORDS ] = 
{ 
  { "accept", TOK_ACCEPT },
  { "allo_rule", TOK_ALLO_RULE },
  { "and", TOK_AND },
  { "assert", TOK_ASSERT },
  { "break", TOK_BREAK },
  { "choose", TOK_CHOOSE },
  { "combi_rule", TOK_COMBI_RULE },
  { "continue", TOK_CONTINUE },
  { "default", TOK_DEFAULT },
  { "define", TOK_DEFINE },
  { "else", TOK_ELSE },
  { "elseif", TOK_ELSEIF },
  { "end", TOK_END },
  { "end_rule", TOK_END_RULE },
  { "error", TOK_ERROR },
  { "foreach", TOK_FOREACH },
  { "greater", TOK_GREATER },
  { "greater_equal", TOK_GREATER_EQUAL },
  { "if", TOK_IF },
  { "in", TOK_IN },
  { "include", TOK_INCLUDE },
  { "initial", TOK_INITIAL },
  { "input_filter", TOK_INPUT_FILTER },
  { "less", TOK_LESS },
  { "less_equal", TOK_LESS_EQUAL },
  { "matches", TOK_MATCHES },
  { "not", TOK_NOT },
  { "or", TOK_OR },
  { "output_filter", TOK_OUTPUT_FILTER },
  { "parallel", TOK_PARALLEL },
  { "pruning_rule", TOK_PRUNING_RULE },
  { "repeat", TOK_REPEAT },
  { "require", TOK_REQUIRE },
  { "result", TOK_RESULT },
  { "return", TOK_RETURN },
  { "robust_rule", TOK_ROBUST_RULE },
  { "rules", TOK_RULES },
  { "select", TOK_SELECT },
  { "stop", TOK_STOP },
  { "subrule", TOK_SUBRULE },
  { "then", TOK_THEN },
  { "while", TOK_WHILE }
};

/* Types. ===================================================================*/

/* One level of file inclusion. Nodes of this type are kept in the list
 * SOURCES, with the file currently being read as the first node. */
typedef struct /* A source stream for lexical analysis. */
{ 
  list_node_t *next; /* The next source in the list, i.e. the source that
		      * was current when this one was opened. */
  FILE *stream; /* The input stream for this include level. */
  string_t file_name; /* The name of the input file (the pointer is stored
		       * as passed to "begin_include", not copied). */
  text_t *line; /* The current line, read as a whole from STREAM. */
  string_t next_char_p; /* Pointer to the next char in LINE to be read. */
  int_t column; /* Column that has been read (tab stops every 8 columns). */
  int_t line_number; /* Line counter; starts at 1 and is incremented for
		      * each '\n' that has been read. */
  int_t next_char; /* Saved copy of the global NEXT_CHAR while this source
		    * is backed up by an include or by a scanner string. */
  int_t next_token; /* Saved copy of the global NEXT_TOKEN while this
		     * source is backed up. */
} source_t;

/* Global variables. ========================================================*/

int_t next_token; /* Code of the token read last by "read_next_token":
		   * a TOK_... code, a single character code, or EOF. */
string_t token_name; /* Name read by "read_name"; points into the scanner's
		      * internal token text. Set to NULL when a number or a
		      * string is read. */
char_t *token_string; /* Content of the string token read last; allocated
		       * anew by "read_string" for each string token. */
double token_number; /* Value of the number token read last. */

/* Variables. ===============================================================*/

static list_t sources; /* The list of sources, current source first. */

static string_t scanner_input;
/* If this is not NULL, the scanner reads its input from this string instead
 * of from the current source file. It always points to the next character
 * that has not been read yet. */

static int_t next_char; /* The next unicode char to be read, or EOF. */

static text_t *token_text; /* The text of the next token. */

/* Functions. ===============================================================*/

static void 
read_next_char( void )
/* Read the next char from input into NEXT_CHAR.
 * If end of input stream is reached, return EOF.
 * If no input stream is selected, read input from INPUT_BUFFER.
 * If reading from stream, update column information. */
{ 
  source_t *source;
  int_t c;

  source = (source_t *) sources.first;
  if (scanner_input != NULL) /* Read from a string. */
  { 
    if (*scanner_input == EOS)
      next_char = EOF;
    else
    {
      next_char = g_utf8_get_char( scanner_input );
      scanner_input = g_utf8_next_char( scanner_input );
    }
  } 
  else if (source != NULL) /* Read from a file. */
  { 
    /* Read a new line if current line is empty. */
    if (*source->next_char_p == EOS)
    {
      clear_text( source->line );
      do
      {
	c = getc( source->stream );
	if (c == EOS)
	  complain( "Null byte in \"%s\"", source->file_name );
	else if (c == EOF) 
	{
	  if (ferror( source->stream ))
	  {
	    complain( "Can't read from \"%s\": %s.", 
		      source->file_name, strerror( errno ) );
	  }
	  else 
	    break;
	}
	else
	  ADD_CHAR_TO_TEXT( source->line, c );
      } while (c != '\n');
      
      if (! g_utf8_validate( source->line->buffer, -1, NULL ))
	complain( "Illegal UTF-8 character in \"%s\".", source->file_name );
      source->next_char_p = source->line->buffer;
    }

    if (*source->next_char_p == EOS)
      next_char = EOF;
    else
    {
      /* Get next char from current line. */
      next_char = g_utf8_get_char( source->next_char_p );
      source->next_char_p = g_utf8_next_char( source->next_char_p );
      
      /* Update line and column information. */
      if (next_char == '\t') 
	source->column = (source->column + 8) & ~7;
      else if (next_char == '\n') 
      { 
	source->column = 0;
	source->line_number++;
      }
      else if (next_char == '\r')
	source->column = 0;
      else if (next_char != EOF) 
	source->column++;
    }
  } 
  else 
    next_char = EOF;
}

/*---------------------------------------------------------------------------*/

string_t 
current_file_name( void )
/* Return the name of the file reading from or NULL. */
{
  source_t *source;

  source = (source_t *) sources.first;
  if (source == NULL) 
    return NULL;
  return source->file_name;
}

/*---------------------------------------------------------------------------*/

int_t 
current_line_number( void )
/* Return the line number where the last char has been read or -1. */
{
  source_t *source;

  source = (source_t *) sources.first;
  if (source == NULL) 
    return -1;
  return source->line_number;
}

/*---------------------------------------------------------------------------*/

int_t 
current_column( void )
/* Return the column where the last char has been read or -1. */
{
  source_t *source;

  source = (source_t *) sources.first;
  if (source == NULL) 
    return -1;
  if (source->column == 0) 
    return 0;
  return source->column - 1; /* Let columns start with 0. */
}

/*---------------------------------------------------------------------------*/

void 
set_scanner_input( string_t input )
/* Let the scanner read from the string INPUT until
 * "set_scanner_input( NULL )" is called. INPUT must stay valid until then.
 * When switching to a string, the look-ahead char and token of the current
 * source file (if any) are saved and the first token of INPUT is read.
 * When switching back with NULL, the saved look-ahead is restored. */
{
  source_t *current = (source_t *) sources.first;

  scanner_input = input;

  if (input == NULL)
  {
    /* Resume the source file where it has been left. */
    if (current != NULL) 
    { 
      next_char = current->next_char;
      next_token = current->next_token;
    }
    return;
  }

  /* Remember the look-ahead of the source file before it is overwritten. */
  if (current != NULL) 
  { 
    current->next_char = next_char;
    current->next_token = next_token;
  }

  /* Prime the scanner with the first char and token of the string. */
  read_next_char();
  read_next_token();
}

/*---------------------------------------------------------------------------*/

void 
begin_include( string_t file_name )
/* Open a new level of inclusion and read tokens from file FILE_NAME.
 * The look-ahead char and token of the including source (if any) are saved
 * so that "end_include" can restore them. The pointer FILE_NAME itself is
 * stored in the new source, so it must stay valid while the file is read. */
{
  source_t *including, *included;
  FILE *stream;

  including = (source_t *) sources.first;

  /* Open the file before any scanner state is touched. */
  stream = open_stream( file_name, "r" );

  /* Back up the look-ahead of the including source. */
  if (including != NULL) 
  { 
    including->next_char = next_char;
    including->next_token = next_token;
  }

  /* Put a description of the new source in front of the source list. */
  included = new_node( &sources, sizeof( source_t ), LIST_START );
  included->line = new_text();
  included->next_char_p = included->line->buffer;
  included->stream = stream;
  included->file_name = file_name;
  included->column = 0;
  included->line_number = 1;

  /* Prime the scanner with the first char and token of the new file. */
  read_next_char();
  read_next_token();
}

/*---------------------------------------------------------------------------*/

void 
end_include( void )
/* Stop reading from current source stream and read from former stream.
 * The stream of the current source is closed, its line buffer is freed and
 * its node is removed from the source list. If another source remains, its
 * saved look-ahead char and token are restored.
 * If no source is open, this function does nothing. */
{
  source_t *source;

  source = (source_t *) sources.first;
  if (source == NULL) 
    return; /* No include level is open, so there is nothing to close. */

  close_stream( &source->stream, source->file_name );
  free_text( &source->line );
  free_first_node( &sources );

  /* Continue with the source that included the one just closed. */
  source = (source_t *) sources.first;
  if (source != NULL) 
  { 
    next_char = source->next_char;
    next_token = source->next_token;
  }
}

/*---------------------------------------------------------------------------*/

void 
end_includes( void )
/* Close every source stream that is still open, innermost first. */
{
  for (;;)
  {
    if (sources.first == NULL) 
      break;
    end_include();
  }
}

/*---------------------------------------------------------------------------*/

void 
init_scanner( void )
/* Initialise the scanner.
 * This allocates the text buffer TOKEN_TEXT in which "read_name",
 * "read_number" and "read_string" collect the text of a token. */
{
  token_text = new_text();
}

/*---------------------------------------------------------------------------*/

void 
terminate_scanner( void )
/* Terminate the scanner, even when it's scanning.
 * Stops reading from a scanner string, closes the streams and frees the
 * line buffers of all sources, and frees the token buffers. */
{
  source_t *source;

  scanner_input = NULL;
  /* NOTE(review): FOREACH_FREE is defined elsewhere; presumably it visits
   * every node of SOURCES and frees the node afterwards -- confirm. */
  FOREACH_FREE( source, sources ) 
  {
    /* Unlike "end_include", no file name is passed to "close_stream". */
    close_stream( &source->stream, NULL );
    free_text( &source->line );
  }
  token_name = NULL; /* It pointed into TOKEN_TEXT, which is freed next. */
  free_text( &token_text );
  free_mem( &token_string );
}

/*---------------------------------------------------------------------------*/

static void 
read_name( void )
/* Read a rule name, variable name or keyword, starting at NEXT_CHAR.
 * The name consists of alphanumeric characters, '_', '&' and '|'.
 * Afterwards TOKEN_NAME points to the name, which is stored in TOKEN_TEXT.
 * An empty name is reported as an error. */
{
  token_name = NULL;
  clear_text( token_text );

  /* Collect characters until one is met that can't be part of a name. */
  for (;;)
  { 
    if (next_char == EOF)
      break;
    if (! g_unichar_isalnum( next_char )
        && next_char != '_' && next_char != '&' && next_char != '|')
    {
      break;
    }
    add_unichar_to_text( token_text, next_char );
    read_next_char();
  }

  token_name = token_text->buffer;
  if (*token_name == EOS) 
    complain( "Illegal character in name." );
}

/*---------------------------------------------------------------------------*/

static int_t 
keyword_code( string_t name )
/* Return the token code of the keyword NAME.
 * The comparison is done by "strcmp_no_case".
 * If NAME is not in the keyword table, return TOK_IDENT. */
{
  int_t first, last, probe, order;

  /* Binary search: if NAME is a keyword, 
   * it is one of keywords[ first..last ]. */
  for (first = 0, last = NUMBER_OF_KEYWORDS - 1; first <= last; )
  { 
    probe = first + (last - first) / 2;
    order = strcmp_no_case( name, keywords[ probe ].name );
    if (order == 0) 
      return keywords[ probe ].code;

    if (order < 0) 
      last = probe - 1; /* NAME sorts before the probed keyword. */
    else 
      first = probe + 1; /* NAME sorts after the probed keyword. */
  }
  return TOK_IDENT;
}

/*---------------------------------------------------------------------------*/

static void 
read_number( void )
/* Read a floating point number. Save its value in TOKEN_NUMBER. */
{
  token_name = NULL;
  clear_text( token_text );

  while (next_char >= '0' && next_char <= '9') 
  { 
    add_char_to_text( token_text, next_char );
    read_next_char();
  }
  if (next_char == 'l' || next_char == 'L') 
    read_next_char();
  else if (next_char == 'r' || next_char == 'R') 
  { 
    insert_char_in_text( token_text, '-', 0 );
    read_next_char();
  } 
  else 
  { 
    if (next_char == '.') 
    { 
      add_char_to_text( token_text, next_char );
      read_next_char();
      if (next_char < '0' || next_char >'9') 
	complain( "Missing digits after \".\"." );
      while (next_char >= '0' && next_char <= '9') 
      { 
	add_char_to_text( token_text, next_char );
        read_next_char();
      }
    }
    if (next_char == 'E' || next_char == 'e') 
    { /* Read an exponent. */
      add_char_to_text( token_text, next_char );
      read_next_char();
      if (next_char == '-' || next_char == '+') 
      { 
	add_char_to_text( token_text, next_char );
        read_next_char();
      }
      if (next_char < '0' || next_char > '9') 
	complain( "Missing exponent." );
      while (next_char >= '0' && next_char <= '9') 
      { 
	add_char_to_text( token_text, next_char );
        read_next_char();
      }  
    }
  }
  if (sscanf( token_text->buffer, "%lf", &token_number ) != 1) 
    complain( "Illegal number." );
}

/*---------------------------------------------------------------------------*/

static void 
read_string( void )
/* Read a string that is enclosed in double quotes; NEXT_CHAR must be the
 * opening quote. Save the content in TOKEN_STRING as a newly allocated
 * string; the former TOKEN_STRING is freed. TOKEN_NAME is set to NULL.
 * Within the string, these escape sequences are recognised:
 * - "\\" and "\"" stand for a backslash and a double quote,
 * - a backslash followed by exactly 3 octal digits stands for the character
 *   with that code. The code 0 is rejected, since it would terminate the
 *   string prematurely.
 * A string must not extend over the end of the line. */
{
  int_t i;
  u_int_t code;

  token_name = NULL;
  clear_text( token_text );
  read_next_char(); /* Overread beginning '"'. */
  while (next_char != '\"') 
  { 
    if (next_char == EOF || next_char == '\n') 
      complain( "Unterminated string at end of line." );
    if (next_char != '\\') 
    {
      add_unichar_to_text( token_text, next_char );
      read_next_char();
    }
    else
    { 
      read_next_char(); /* Read over the backslash. */
      if (next_char == '\\' || next_char == '\"')
      {
	add_char_to_text( token_text, next_char );
	read_next_char();
      }
      else if (next_char >= '0'  && next_char <= '7')
      { 
	/* Compute the character code from 3 octal digits. */
	code = 0;
	for (i = 0; i < 3; i++) 
	{ 
	  if (next_char >= '0' && next_char <= '7')
	    code = 8 * code + (next_char - '0');
	  else 
	    complain( "Escape sequence must have 3 octal digits." );
	  read_next_char();
	}
	/* "g_unichar_validate" accepts 0, but a null character can't be
	 * part of a string, so it must be rejected explicitly. */
	if (code == 0 || ! g_unichar_validate( code ))
	  complain( "Escape sequence defines invalid character." );
	add_unichar_to_text( token_text, code );
      }
      else 
	complain( "Illegal escape sequence in string." );
    }
  }
  read_next_char(); /* Read over final '"'. */
  free_mem( &token_string ); /* Free old token string. */
  token_string = new_string( token_text->buffer, NULL );
}

/*---------------------------------------------------------------------------*/

void 
read_next_token( void )
/* Scan the next token, starting at NEXT_CHAR, and store its code in
 * NEXT_TOKEN. Whitespace and comments (from '#' to the end of the line)
 * are skipped. NEXT_TOKEN is set to EOF if the end of input is reached.
 * Depending on the token, its value is found in TOKEN_STRING, TOKEN_NUMBER
 * or TOKEN_NAME. A character that starts no other token is a token of its
 * own, with its character code as token code. */
{
  /* Each pass either skips something or recognises a token and returns. */
  for (;;) 
  { 
    switch (next_char) 
    {
    case EOF:
      next_token = EOF;
      return;

    case ' ': 
    case '\t': 
    case '\n': /* Skip whitespace. */
      read_next_char();
      continue;

    case '\r': /* A carriage return is only accepted in front of '\n'. */
      read_next_char();
      if (next_char != '\n')
        complain( "Carriage return without line feed." );
      read_next_char();
      continue;

    case '#': /* Skip a comment, but not the end of its line. */
      read_next_char(); 
      while (next_char != '\n' && next_char != EOF)
        read_next_char(); 
      continue;

    case '\"': /* A string. */
      read_string();
      next_token = TOK_STRING;
      return;

    case ':': /* One of ":", ":=", ":=+", ":=-", ":=*", ":=/". */
      read_next_char();
      if (next_char != '=') 
      {
        next_token = ':';
        return;
      }
      read_next_char();
      switch (next_char)
      {
      case '+': 
        next_token = TOK_ASSIGN_PLUS; 
        break;
      case '-': 
        next_token = TOK_ASSIGN_MINUS; 
        break;
      case '*': 
        next_token = TOK_ASSIGN_ASTERISK; 
        break;
      case '/': 
        next_token = TOK_ASSIGN_SLASH; 
        break;
      default: /* A plain ":="; the char behind it is not consumed. */
        next_token = TOK_ASSIGN;
        return;
      }
      read_next_char(); /* Consume the operator char behind ":=". */
      return;

    case '/': /* One of "/", "/=", "/~". */
      read_next_char();
      switch (next_char)
      {
      case '=': 
        next_token = TOK_NOT_EQUAL; 
        break;
      case '~': 
        next_token = TOK_NOT_CONGRUENT; 
        break;
      default: /* A plain "/"; the char behind it is not consumed. */
        next_token = '/';
        return;
      }
      read_next_char(); /* Consume the second char of the operator. */
      return;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9': /* A number. */
      read_number();
      next_token = TOK_NUMBER;
      return;

    case '$': /* A variable: '$' followed by a name. */
      read_next_char();
      read_name();
      next_token = TOK_VARIABLE;
      return;

    case '@': /* A constant: '@' followed by a name. */
      read_next_char();
      read_name();
      next_token = TOK_CONSTANT;
      return;

    default: 
      if (! g_unichar_isalpha( next_char ) 
          && next_char != '_' && next_char != '&' && next_char != '|') 
      { 
        /* Any other character is a token of its own. */
        next_token = next_char;
        read_next_char();
        return;
      } 

      /* A name is either a keyword or an identifier. */
      read_name();
      next_token = keyword_code( token_name );
      return;
    }
  }
}

/*---------------------------------------------------------------------------*/

string_t 
token_as_text( int_t token )
/* Return TOKEN as a string readable for humans.
 * The string must be freed after use.
 * A keyword is returned enclosed in double quotes, the token classes are
 * returned as a describing word, and the operator tokens are returned in
 * the form created by "new_string_readable". Any other token is taken to
 * be a single character, which may be a non-ASCII Unicode character. */
{
  int_t i, length;
  char token_buffer[7]; /* Up to 6 bytes of UTF-8 and the final EOS. */

  /* Look if TOKEN is a keyword. */
  for (i = 0; i < NUMBER_OF_KEYWORDS; i++) 
  { 
    if (keywords[i].code == token) 
      return concat_strings( "\"", keywords[i].name, "\"", NULL );
  }
  
  switch (token) 
  {
  case EOF: 
    return new_string( "end of input", NULL ); 
  case TOK_STRING: 
    return new_string( "string", NULL );
  case TOK_IDENT: 
    return new_string( "identifier", NULL );
  case TOK_VARIABLE: 
    return new_string( "variable", NULL );
  case TOK_CONSTANT: 
    return new_string( "constant", NULL );
  case TOK_NUMBER: 
    return new_string( "number", NULL );
  case TOK_ASSIGN: 
    return new_string_readable( ":=", NULL );
  case TOK_ASSIGN_PLUS: 
    return new_string_readable( ":=+", NULL );
  case TOK_ASSIGN_MINUS: 
    return new_string_readable( ":=-", NULL );
  case TOK_ASSIGN_ASTERISK: 
    return new_string_readable( ":=*", NULL );
  case TOK_ASSIGN_SLASH: 
    return new_string_readable( ":=/", NULL );
  case TOK_NOT_EQUAL: 
    return new_string_readable( "/=", NULL );
  case TOK_NOT_CONGRUENT: 
    return new_string_readable( "/~", NULL );
  default:
    /* The scanner uses the Unicode code point of a character as its token
     * code, so a non-ASCII character must be encoded in UTF-8 instead of
     * being truncated to a single byte. */
    if (token > 0x7f && g_unichar_validate( token ))
      length = g_unichar_to_utf8( token, token_buffer );
    else
    {
      token_buffer[0] = token;
      length = 1;
    }
    token_buffer[ length ] = EOS;
    return new_string_readable( token_buffer, NULL );
  }
}

/*---------------------------------------------------------------------------*/

void 
test_token( int_t token )
/* Check that the next token is TOKEN; report an error if it is not.
 * The error message names the expected and the actual token. The strings
 * created by "token_as_text" for the message are not freed here. */
{
  if (next_token == token) 
    return; /* The expected token has been found. */

  complain( "Expected %s, not %s.", 
            token_as_text( token ), token_as_text( next_token ) );
}

/*---------------------------------------------------------------------------*/

void 
parse_token( int_t token )
/* Test if TOKEN is the next token and read next token.
 * If the next token is not TOKEN, "test_token" reports an error. */
{
  test_token( token ); /* Reports an error on a mismatch. */
  read_next_token(); /* Consume TOKEN and scan the token behind it. */
}

/* End of file. =============================================================*/