/* Copyright (C) 1995 Bjoern Beutel. */ /* Description. =============================================================*/ /* This module supports scanning (lexical analysis) of malaga source files. */ /* Includes. ================================================================*/ #include #include #include #include #include #include "basic.h" #include "files.h" #include "scanner.h" /* Constants. ===============================================================*/ /* List of all keywords and their token codes. * (This list must be maintained in alphabetical order.) */ static struct { string_t name; int_t code; } keywords[ NUMBER_OF_KEYWORDS ] = { { "accept", TOK_ACCEPT }, { "allo_rule", TOK_ALLO_RULE }, { "and", TOK_AND }, { "assert", TOK_ASSERT }, { "break", TOK_BREAK }, { "choose", TOK_CHOOSE }, { "combi_rule", TOK_COMBI_RULE }, { "continue", TOK_CONTINUE }, { "default", TOK_DEFAULT }, { "define", TOK_DEFINE }, { "else", TOK_ELSE }, { "elseif", TOK_ELSEIF }, { "end", TOK_END }, { "end_rule", TOK_END_RULE }, { "error", TOK_ERROR }, { "foreach", TOK_FOREACH }, { "greater", TOK_GREATER }, { "greater_equal", TOK_GREATER_EQUAL }, { "if", TOK_IF }, { "in", TOK_IN }, { "include", TOK_INCLUDE }, { "initial", TOK_INITIAL }, { "input_filter", TOK_INPUT_FILTER }, { "less", TOK_LESS }, { "less_equal", TOK_LESS_EQUAL }, { "matches", TOK_MATCHES }, { "not", TOK_NOT }, { "or", TOK_OR }, { "output_filter", TOK_OUTPUT_FILTER }, { "parallel", TOK_PARALLEL }, { "pruning_rule", TOK_PRUNING_RULE }, { "repeat", TOK_REPEAT }, { "require", TOK_REQUIRE }, { "result", TOK_RESULT }, { "return", TOK_RETURN }, { "robust_rule", TOK_ROBUST_RULE }, { "rules", TOK_RULES }, { "select", TOK_SELECT }, { "stop", TOK_STOP }, { "subrule", TOK_SUBRULE }, { "then", TOK_THEN }, { "while", TOK_WHILE } }; /* Types. ===================================================================*/ typedef struct /* A source stream for lexical analysis. */ { list_node_t *next; /* The next (including) source stream. */ FILE *stream; /* The input stream for this include level. */ string_t file_name; /* The name of the input file. */ text_t *line; /* The current line. */ string_t next_char_p; /* Pointer to the next char in LINE to be read. */ int_t column; /* Column that has been read. */ int_t line_number; /* Number of the line that has been read. */ int_t next_char; /* Buffer NEXT_CHAR if this source is backed up. */ int_t next_token; /* Buffer NEXT_TOKEN if this source is backed up. */ } source_t; /* Global variables. ========================================================*/ int_t next_token; string_t token_name; char_t *token_string; double token_number; /* Variables. ===============================================================*/ static list_t sources; /* The list of sources, current source first. */ static string_t scanner_input; /* If no file is included, the scanner reads its input from SCANNER_INPUT. */ static int_t next_char; /* The next unicode char to be read. */ static text_t *token_text; /* The text of the next token. */ /* Functions. ===============================================================*/ static void read_next_char( void ) /* Read the next char from input into NEXT_CHAR. * If end of input stream is reached, return EOF. * If no input stream is selected, read input from INPUT_BUFFER. * If reading from stream, update column information. */ { source_t *source; int_t c; source = (source_t *) sources.first; if (scanner_input != NULL) /* Read from a string. */ { if (*scanner_input == EOS) next_char = EOF; else { next_char = g_utf8_get_char( scanner_input ); scanner_input = g_utf8_next_char( scanner_input ); } } else if (source != NULL) /* Read from a file. */ { /* Read a new line if current line is empty. */ if (*source->next_char_p == EOS) { clear_text( source->line ); do { c = getc( source->stream ); if (c == EOS) complain( "Null byte in \"%s\"", source->file_name ); else if (c == EOF) { if (ferror( source->stream )) { complain( "Can't read from \"%s\": %s.", source->file_name, strerror( errno ) ); } else break; } else ADD_CHAR_TO_TEXT( source->line, c ); } while (c != '\n'); if (! g_utf8_validate( source->line->buffer, -1, NULL )) complain( "Illegal UTF-8 character in \"%s\".", source->file_name ); source->next_char_p = source->line->buffer; } if (*source->next_char_p == EOS) next_char = EOF; else { /* Get next char from current line. */ next_char = g_utf8_get_char( source->next_char_p ); source->next_char_p = g_utf8_next_char( source->next_char_p ); /* Update line and column information. */ if (next_char == '\t') source->column = (source->column + 8) & ~7; else if (next_char == '\n') { source->column = 0; source->line_number++; } else if (next_char == '\r') source->column = 0; else if (next_char != EOF) source->column++; } } else next_char = EOF; } /*---------------------------------------------------------------------------*/ string_t current_file_name( void ) /* Return the name of the file reading from or NULL. */ { source_t *source; source = (source_t *) sources.first; if (source == NULL) return NULL; return source->file_name; } /*---------------------------------------------------------------------------*/ int_t current_line_number( void ) /* Return the line number where the last char has been read or -1. */ { source_t *source; source = (source_t *) sources.first; if (source == NULL) return -1; return source->line_number; } /*---------------------------------------------------------------------------*/ int_t current_column( void ) /* Return the column where the last char has been read or -1. */ { source_t *source; source = (source_t *) sources.first; if (source == NULL) return -1; if (source->column == 0) return 0; return source->column - 1; /* Let columns start with 0. */ } /*---------------------------------------------------------------------------*/ void set_scanner_input( string_t input ) /* Make the scanner use INPUT as scanner input * until "set_scanner_input( NULL )" is called. * INPUT must remain valid until then. */ { source_t *source; source = (source_t *) sources.first; scanner_input = input; if (input != NULL) { if (source != NULL) { source->next_char = next_char; source->next_token = next_token; } read_next_char(); read_next_token(); } else if (source != NULL) { next_char = source->next_char; next_token = source->next_token; } } /*---------------------------------------------------------------------------*/ void begin_include( string_t file_name ) /* Open a new level of inclusion and read tokens from file FILE_NAME. */ { FILE *stream; source_t *source; source = (source_t *) sources.first; stream = open_stream( file_name, "r" ); /* Next char of old source should be read later. */ if (source != NULL) { source->next_char = next_char; source->next_token = next_token; } /* Create new source description. */ source = new_node( &sources, sizeof( source_t ), LIST_START ); source->line = new_text(); source->next_char_p = source->line->buffer; source->file_name = file_name; source->line_number = 1; source->column = 0; source->stream = stream; read_next_char(); read_next_token(); } /*---------------------------------------------------------------------------*/ void end_include( void ) /* Stop reading from current source stream and read from former stream. */ { source_t *source; source = (source_t *) sources.first; close_stream( &source->stream, source->file_name ); free_text( &source->line ); free_first_node( &sources ); if (sources.first != NULL) { source = (source_t *) sources.first; next_char = source->next_char; next_token = source->next_token; } } /*---------------------------------------------------------------------------*/ void end_includes( void ) /* Stop reading from all nested source streams. */ { while (sources.first != NULL) end_include(); } /*---------------------------------------------------------------------------*/ void init_scanner( void ) /* Initialise the scanner. */ { token_text = new_text(); } /*---------------------------------------------------------------------------*/ void terminate_scanner( void ) /* Terminate the scanner, even when it's scanning. */ { source_t *source; scanner_input = NULL; FOREACH_FREE( source, sources ) { close_stream( &source->stream, NULL ); free_text( &source->line ); } token_name = NULL; free_text( &token_text ); free_mem( &token_string ); } /*---------------------------------------------------------------------------*/ static void read_name( void ) /* Read rule name, variable, or keyword into TOKEN_NAME. */ { token_name = NULL; clear_text( token_text ); while (next_char != EOF && (g_unichar_isalnum( next_char ) || next_char == '_' || next_char == '&' || next_char == '|')) { add_unichar_to_text( token_text, next_char ); read_next_char(); } token_name = token_text->buffer; if (*token_name == EOS) complain( "Illegal character in name." ); } /*---------------------------------------------------------------------------*/ static int_t keyword_code( string_t name ) /* Look up NAME in the keyword table and return its token value. * If NAME is no keyword, return TOK_IDENT. */ { int_t lower, upper, middle, result; /* We do a binary search on the keywords. * A keyword must be in the range of keywords[ lower..upper ]. */ lower = 0; upper = NUMBER_OF_KEYWORDS - 1; while (lower <= upper) { middle = (lower + upper) / 2; result = strcmp_no_case( name, keywords[ middle ].name ); if (result < 0) upper = middle - 1; else if (result > 0) lower = middle + 1; else return keywords[ middle ].code; } return TOK_IDENT; } /*---------------------------------------------------------------------------*/ static void read_number( void ) /* Read a floating point number. Save its value in TOKEN_NUMBER. */ { token_name = NULL; clear_text( token_text ); while (next_char >= '0' && next_char <= '9') { add_char_to_text( token_text, next_char ); read_next_char(); } if (next_char == 'l' || next_char == 'L') read_next_char(); else if (next_char == 'r' || next_char == 'R') { insert_char_in_text( token_text, '-', 0 ); read_next_char(); } else { if (next_char == '.') { add_char_to_text( token_text, next_char ); read_next_char(); if (next_char < '0' || next_char >'9') complain( "Missing digits after \".\"." ); while (next_char >= '0' && next_char <= '9') { add_char_to_text( token_text, next_char ); read_next_char(); } } if (next_char == 'E' || next_char == 'e') { /* Read an exponent. */ add_char_to_text( token_text, next_char ); read_next_char(); if (next_char == '-' || next_char == '+') { add_char_to_text( token_text, next_char ); read_next_char(); } if (next_char < '0' || next_char > '9') complain( "Missing exponent." ); while (next_char >= '0' && next_char <= '9') { add_char_to_text( token_text, next_char ); read_next_char(); } } } if (sscanf( token_text->buffer, "%lf", &token_number ) != 1) complain( "Illegal number." ); } /*---------------------------------------------------------------------------*/ static void read_string( void ) /* Read a string. Save its value in TOKEN_STRING. */ { int_t i; u_int_t code; token_name = NULL; clear_text( token_text ); read_next_char(); /* Overread beginning '"'. */ while (next_char != '\"') { if (next_char == EOF || next_char == '\n') complain( "Unterminated string at end of line." ); if (next_char != '\\') { add_unichar_to_text( token_text, next_char ); read_next_char(); } else { read_next_char(); if (next_char == '\\' || next_char == '\"') { add_char_to_text( token_text, next_char ); read_next_char(); } else if (next_char >= '0' && next_char <= '7') { code = 0; for (i = 0; i < 3; i++) { if (next_char >= '0' && next_char <= '7') code = 8 * code + (next_char - '0'); else complain( "Escape sequence must have 3 octal digits." ); read_next_char(); } if (! g_unichar_validate( code )) complain( "Escape sequence defines invalid character." ); add_unichar_to_text( token_text, code ); } else complain( "Illegal escape sequence in string." ); } } read_next_char(); /* Read over final '"'. */ free_mem( &token_string ); /* Free old token string. */ token_string = new_string( token_text->buffer, NULL ); } /*---------------------------------------------------------------------------*/ void read_next_token( void ) /* Read the next token from current source into NEXT_TOKEN. * If end of input stream is reached, return EOF. */ { /* Read chars until a token has been recognised. */ while (TRUE) { switch (next_char) { case EOF: next_token = EOF; return; case ' ': case '\t': case '\n': /* Read over whitespace. */ read_next_char(); break; case '\r': read_next_char(); if (next_char != '\n') complain( "Carriage return without line feed." ); read_next_char(); break; case '#': /* Read over a comment. */ do { read_next_char(); } while (next_char != '\n' && next_char != EOF); break; case '\"': /* Read a string. */ read_string(); next_token = TOK_STRING; return; case ':': /* Read a ":", ":=", ":=+", ":=-", ":=*", ":=/". */ read_next_char(); if (next_char == '=') { read_next_char(); if (next_char == '+') { next_token = TOK_ASSIGN_PLUS; read_next_char(); } else if (next_char == '-') { next_token = TOK_ASSIGN_MINUS; read_next_char(); } else if (next_char == '*') { next_token = TOK_ASSIGN_ASTERISK; read_next_char(); } else if (next_char == '/') { next_token = TOK_ASSIGN_SLASH; read_next_char(); } else next_token = TOK_ASSIGN; } else next_token = ':'; return; case '/': /* Read a "/", a "/=" or a "/~". */ read_next_char(); if (next_char == '=') { next_token = TOK_NOT_EQUAL; read_next_char(); } else if (next_char == '~') { next_token = TOK_NOT_CONGRUENT; read_next_char(); } else next_token = '/'; return; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': /* Read a number. */ read_number(); next_token = TOK_NUMBER; return; case '$': read_next_char(); read_name(); next_token = TOK_VARIABLE; return; case '@': read_next_char(); read_name(); next_token = TOK_CONSTANT; return; default: if (g_unichar_isalpha( next_char ) || next_char == '_' || next_char == '&' || next_char == '|') { read_name(); next_token = keyword_code( token_name ); return; } else { next_token = next_char; read_next_char(); return; } } } } /*---------------------------------------------------------------------------*/ string_t token_as_text( int_t token ) /* Return TOKEN as a string readable for humans. * The string must be freed after use. */ { int_t i; char token_buffer[2]; /* Look if TOKEN is a keyword. */ for (i = 0; i < NUMBER_OF_KEYWORDS; i++) { if (keywords[i].code == token) return concat_strings( "\"", keywords[i].name, "\"", NULL ); } switch (token) { case EOF: return new_string( "end of input", NULL ); case TOK_STRING: return new_string( "string", NULL ); case TOK_IDENT: return new_string( "identifier", NULL ); case TOK_VARIABLE: return new_string( "variable", NULL ); case TOK_CONSTANT: return new_string( "constant", NULL ); case TOK_NUMBER: return new_string( "number", NULL ); case TOK_ASSIGN: return new_string_readable( ":=", NULL ); case TOK_ASSIGN_PLUS: return new_string_readable( ":=+", NULL ); case TOK_ASSIGN_MINUS: return new_string_readable( ":=-", NULL ); case TOK_ASSIGN_ASTERISK: return new_string_readable( ":=*", NULL ); case TOK_ASSIGN_SLASH: return new_string_readable( ":=/", NULL ); case TOK_NOT_EQUAL: return new_string_readable( "/=", NULL ); case TOK_NOT_CONGRUENT: return new_string_readable( "/~", NULL ); default: token_buffer[0] = token; token_buffer[1] = EOS; return new_string_readable( token_buffer, NULL ); } } /*---------------------------------------------------------------------------*/ void test_token( int_t token ) /* Test if TOKEN is the next token. If it's not, report an error. */ { if (next_token != token) { complain( "Expected %s, not %s.", token_as_text( token ), token_as_text( next_token ) ); } } /*---------------------------------------------------------------------------*/ void parse_token( int_t token ) /* Test if TOKEN is the next token and read next token. */ { test_token( token ); read_next_token(); } /* End of file. =============================================================*/