/* xgettext sh backend.
   Copyright (C) 2003, 2005-2009, 2015 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2003.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

/* Specification.  */
#include "x-sh.h"

#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "message.h"
#include "xgettext.h"
#include "error.h"
#include "xalloc.h"
#include "hash.h"
#include "gettext.h"

#define _(s) gettext(s)

#define SIZEOF(a) (sizeof(a) / sizeof(a[0]))


/* The sh syntax is defined in POSIX:2001, see
     http://www.opengroup.org/onlinepubs/007904975/utilities/xcu_chap02.html
   Summary of sh syntax:
   - Input is broken into words, which are then subject to
     - tilde expansion ~...
     - command substitution `...`
     - variable substitution $var
     - arithmetic substitution $((...))
     - field splitting at whitespace (IFS)
     - wildcard pattern expansion *?
     - quote removal
   - Strings are enclosed in "..."; command substitution, variable
     substitution and arithmetic substitution are performed here as well.
   - '...' is a string without substitutions.
   - The list of resulting words is split into commands by semicolon and
     newline.
   - '#' at the beginning of a word introduces a comment until end of line.
   The parser is implemented in bash-2.05b/parse.y.  */


/* ====================== Keyword set customization.  ====================== */

/* If true extract all strings.  */
static bool extract_all = false;

/* Table of keywords, filled through x_sh_keyword (which also initializes
   it lazily on first use).  */
static hash_table keywords;
/* True as long as the built-in default keywords still have to be installed;
   cleared by x_sh_keyword (NULL) and by init_keywords.  */
static bool default_keywords = true;


/* Switch the extractor into "extract all strings" mode by setting the
   file-level extract_all flag.  */
void
x_sh_extract_all ()
{
  extract_all = true;
}


/* Register one keyword specification.
   NAME == NULL means: do not install the built-in default keywords.
   Otherwise NAME is handed to split_keywordspec; the identifier part
   [NAME, end) is inserted into the keywords table together with its call
   shape, unless a colon occurs inside that identifier part.  */
void
x_sh_keyword (const char *name)
{
  const char *id_end;
  const char *first_colon;
  struct callshape shape;

  if (name == NULL)
    {
      default_keywords = false;
      return;
    }

  /* Create the table on first use.  */
  if (keywords.table == NULL)
    hash_init (&keywords, 100);

  split_keywordspec (name, &id_end, &shape);

  /* The characters between name and id_end should form a valid C
     identifier.  A colon in that range means an invalid parse in
     split_keywordspec(); such a specification is silently dropped.  */
  first_colon = strchr (name, ':');
  if (first_colon != NULL && first_colon < id_end)
    return;

  insert_keyword_callshape (&keywords, name, id_end - name, &shape);
}

/* Complete the keywords table: install the built-in keywords once, unless
   default_keywords has been cleared beforehand.
   Called after argument processing, before each file is processed.  */
static void
init_keywords ()
{
  if (!default_keywords)
    return;

  /* When adding new keywords here, also update the documentation in
     xgettext.texi!  */
  x_sh_keyword ("gettext");
  x_sh_keyword ("ngettext:1,2");
  x_sh_keyword ("eval_gettext");
  x_sh_keyword ("eval_ngettext:1,2");

  /* Make subsequent calls no-ops.  */
  default_keywords = false;
}

/* Record the format string flags of the built-in sh keywords, by passing
   each specification to xgettext_record_flag in turn.  */
void
init_flag_table_sh ()
{
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-sh-format",
      "ngettext:1:pass-sh-format",
      "ngettext:2:pass-sh-format",
      "eval_gettext:1:sh-format",
      "eval_ngettext:1:sh-format",
      "eval_ngettext:2:sh-format"
    };
  size_t i;

  for (i = 0; i < sizeof (flag_specs) / sizeof (flag_specs[0]); i++)
    xgettext_record_flag (flag_specs[i]);
}


/* ======================== Reading of characters.  ======================== */

/* Real filename, used in error messages about the input file.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.
   line_number follows the read position: do_getc and the phaseN_getc
   functions increment it when they deliver a '\n', the corresponding
   ungetc functions decrement it again.  */
static char *logical_file_name;
static int line_number;

/* The input file stream.  */
static FILE *fp;


/* Fetch the next character from the input file.  */
static int
do_getc ()
{
  int c = getc (fp);

  if (c == EOF)
    {
      if (ferror (fp))
        error (EXIT_FAILURE, errno, _("\
error while reading \"%s\""), real_file_name);
    }
  else if (c == '\n')
   line_number++;

  return c;
}

/* Return the most recently fetched character C (which must not be EOF) to
   the stream, undoing the line count adjustment made for a newline.  */
static void
do_ungetc (int c)
{
  const bool was_newline = (c == '\n');

  if (was_newline)
    line_number -= 1;
  ungetc (c, fp);
}


/* Remove backslash followed by newline from the input stream.  */

/* Pushback stack of phase 1 (last pushed = first read).
   2 characters of pushback are needed: when read_word parses $'...' and
   finds "\x" not followed by a hexadecimal digit, it pushes back the
   character after the 'x' and then the 'x' itself.  With room for only one
   character, the second phase1_ungetc call would abort().  */
static int phase1_pushback[2];
static int phase1_pushback_length;

/* Return the next character with backslash-newline sequences removed,
   or EOF.  Pushed back characters are delivered first and are not
   processed again.  Keeps line_number up to date.  */
static int
phase1_getc ()
{
  int c;

  if (phase1_pushback_length)
    {
      c = phase1_pushback[--phase1_pushback_length];
      /* phase1_ungetc decremented line_number for a newline; redo it.  */
      if (c == '\n')
        ++line_number;
      return c;
    }
  for (;;)
    {
      c = do_getc ();
      if (c != '\\')
        return c;
      /* Look at the character after the backslash.  */
      c = do_getc ();
      if (c != '\n')
        {
          /* Not a line continuation: the backslash is delivered, and the
             lookahead character goes back to the stream.  */
          if (c != EOF)
            do_ungetc (c);
          return '\\';
        }
      /* Backslash-newline: drop both characters and read on.  */
    }
}

/* Push back the character C, so that the next phase1_getc returns it.
   EOF is ignored.  Supports 2 characters of pushback; aborts when the
   pushback stack is already full.  */
static void
phase1_ungetc (int c)
{
  switch (c)
    {
    case EOF:
      break;

    case '\n':
      --line_number;
      /* FALLTHROUGH */

    default:
      if (phase1_pushback_length == SIZEOF (phase1_pushback))
        abort ();
      phase1_pushback[phase1_pushback_length++] = c;
      break;
    }
}


/* ========================== Reading of tokens.  ========================== */


/* A token consists of a sequence of characters.
   The characters are kept in a growable heap buffer (see init_token,
   grow_token, free_token); the buffer is not NUL-terminated.  */
struct token
{
  int allocated;                /* number of allocated chars */
  int charcount;                /* number of used chars */
  char *chars;                  /* the token's constituents */
};

/* Set up *TP as an empty token with room for 10 characters.  */
static inline void
init_token (struct token *tp)
{
  tp->charcount = 0;
  tp->allocated = 10;
  tp->chars = XNMALLOC (tp->allocated, char);
}

/* Free the memory pointed to by a 'struct token'.
   Only the character buffer is released; the 'struct token' itself is not
   freed here.  */
static inline void
free_token (struct token *tp)
{
  free (tp->chars);
}

/* Make sure one more character can be stored in *TP: when the buffer is
   full, double its capacity.  */
static inline void
grow_token (struct token *tp)
{
  int doubled;

  /* Still room left: nothing to do.  */
  if (tp->charcount != tp->allocated)
    return;

  doubled = 2 * tp->allocated;
  tp->allocated = doubled;
  tp->chars = (char *) xrealloc (tp->chars, doubled * sizeof (char));
}

/* Return a freshly allocated, NUL-terminated copy of the characters of
   *TP.  */
static char *
string_of_token (const struct token *tp)
{
  const int len = tp->charcount;
  char *copy = XNMALLOC (len + 1, char);

  memcpy (copy, tp->chars, len);
  copy[len] = '\0';
  return copy;
}


/* ========================= Accumulating messages ========================= */


/* The message list that extracted messages are added to: it is passed as
   first argument to remember_a_message.  */
static message_list_ty *mlp;


/* ========================= Accumulating comments ========================= */


/* Growable buffer holding the text of the comment line being read.  */
static char *buffer;
/* Allocated size of buffer, in bytes.  */
static size_t bufmax;
/* Number of bytes of buffer currently in use.  */
static size_t buflen;

/* Begin accumulating a new comment line: forget the previous contents of
   the buffer (the allocation is kept for reuse).  */
static inline void
comment_start ()
{
  buflen = 0;
}

/* Append the character C to the comment line buffer, enlarging the buffer
   when it is full.  */
static inline void
comment_add (int c)
{
  if (bufmax <= buflen)
    {
      size_t enlarged = 2 * bufmax + 10;

      buffer = xrealloc (buffer, enlarged);
      bufmax = enlarged;
    }
  buffer[buflen] = c;
  buflen++;
}

/* Finish the current comment line: strip trailing spaces and tabs,
   NUL-terminate the buffer and hand it to savable_comment_add.  */
static inline void
comment_line_end ()
{
  /* Drop trailing blanks.  */
  for (; buflen >= 1; --buflen)
    {
      char last = buffer[buflen - 1];

      if (last != ' ' && last != '\t')
        break;
    }

  /* Make room for the terminating NUL.  */
  if (bufmax <= buflen)
    {
      size_t enlarged = 2 * bufmax + 10;

      buffer = xrealloc (buffer, enlarged);
      bufmax = enlarged;
    }
  buffer[buflen] = '\0';

  savable_comment_add (buffer);
}


/* These are for tracking whether comments count as immediately before
   keyword.
   read_word stores the current line_number in last_comment_line when it
   meets a '#' comment, and in last_non_comment_line when it returns most
   kinds of non-separator words.  At a newline, read_word resets the saved
   comments if last_non_comment_line > last_comment_line.  */
static int last_comment_line;
static int last_non_comment_line;


/* ========================= Debackslashification ========================== */

/* This state tracks the effect of backquotes, double-quotes and single-quotes
   on the parsing of backslashes.  We make a single pass through the input
   file, keeping the state up to date.  This is much faster than accumulating
   strings and processing them with explicit debackslashification, like the
   shell does it.  */

/* The number of nested `...` or "`...`" constructs.  Assumed to be <= 32.
   NOTE(review): this value is used as a shift count on 'unsigned int'
   values (in saw_opening_backquote, saw_closing_backquote and phase2_getc);
   the nesting depth that is actually safe depends on the width of
   'unsigned int' - confirm.  */
static unsigned int nested_backquotes;

/* A bit mask indicating which of the currently open `...` or "`...`"
   constructs is with double-quotes: "`...`".
   A bit value of 1 stands for "`...`", a bit value of 0 stands for `...`.
   Bit position 0 designates the outermost backquotes nesting,
   bit position 1 the second-outermost backquotes nesting,
   ...
   bit position (nested_backquotes-1) the innermost backquotes nesting.  */
static unsigned int open_doublequotes_mask;

/* A bit indicating whether a double-quote is currently open inside the
   innermost backquotes nesting.  */
static bool open_doublequote;

/* A bit indicating whether a single-quote is currently open inside the
   innermost backquotes nesting.  */
static bool open_singlequote;

/* The expected terminator of the currently open single-quote.
   Usually '\'', but can be '"' for i18n-quotes.
   Only meaningful while open_singlequote is true.  */
static char open_singlequote_terminator;


/* Functions to update the state.  */

/* Record an opening backquote: enter one more nesting level, remembering
   in open_doublequotes_mask whether it was opened inside double-quotes.
   Must not be called while a single-quote is open.  */
static inline void
saw_opening_backquote ()
{
  const unsigned int level = nested_backquotes;

  if (open_singlequote)
    abort ();

  if (open_doublequote)
    {
      /* "`...`": mark the new level and start it outside any quotes.  */
      open_doublequotes_mask |= (unsigned int) 1 << level;
      open_doublequote = false;
    }
  nested_backquotes = level + 1;
}

/* Record a closing backquote: leave the innermost nesting level and
   restore the double-quote state that was in effect when it was opened.  */
static inline void
saw_closing_backquote ()
{
  const unsigned int level = nested_backquotes - 1;
  const unsigned int level_bit = (unsigned int) 1 << level;

  nested_backquotes = level;
  /* Were we inside "..." when this level was opened?  */
  open_doublequote = (open_doublequotes_mask & level_bit) != 0;
  /* Keep only the bits of the levels that remain open.  */
  open_doublequotes_mask &= level_bit - 1;
  open_singlequote = false; /* just for safety */
}

/* Record an opening double-quote.  No quote of either kind may be open.  */
static inline void
saw_opening_doublequote ()
{
  if (open_singlequote)
    abort ();
  if (open_doublequote)
    abort ();
  open_doublequote = true;
}

/* Record a closing double-quote.  A double-quote, and no single-quote,
   must be open.  */
static inline void
saw_closing_doublequote ()
{
  if (open_singlequote)
    abort ();
  if (!open_doublequote)
    abort ();
  open_doublequote = false;
}

/* Record an opening single-quote, to be terminated by '\''.  No quote of
   either kind may be open.  */
static inline void
saw_opening_singlequote ()
{
  if (open_doublequote)
    abort ();
  if (open_singlequote)
    abort ();
  open_singlequote_terminator = '\'';
  open_singlequote = true;
}

/* Record a closing single-quote.  A single-quote, and no double-quote,
   must be open.  */
static inline void
saw_closing_singlequote ()
{
  if (open_doublequote)
    abort ();
  if (!open_singlequote)
    abort ();
  open_singlequote = false;
}


/* ========================== Reading of commands ========================== */

/* We are only interested in constant strings.  Other words need not be
   represented precisely.
   This is the classification that read_word stores in a word's 'type'
   field.  */
enum word_type
{
  t_string,     /* constant string */
  t_other,      /* other string */
  t_separator,  /* command separator: semicolon or newline */
  t_redirect,   /* redirection: one of < > >| << <<- >> <> <& >& */
  t_backquote,  /* closing '`' pseudo word */
  t_paren,      /* closing ')' pseudo word */
  t_eof         /* EOF marker */
};

/* A word, as produced by read_word.
   Only words of type t_string carry a token; it is heap-allocated and
   released by free_word.  */
struct word
{
  enum word_type type;
  struct token *token;          /* for t_string */
  int line_number_at_start;     /* for t_string */
};

/* Release the memory owned by *WP.  Only t_string words own a token;
   for all other types this is a no-op.  */
static inline void
free_word (struct word *wp)
{
  if (wp->type != t_string)
    return;

  /* First the character buffer, then the token structure itself.  */
  free_token (wp->token);
  free (wp->token);
}

/* Convert a t_string token to a char*.  */
static char *
string_of_word (const struct word *wp)
{
  char *str;
  int n;

  if (!(wp->type == t_string))
    abort ();
  n = wp->token->charcount;
  str = XNMALLOC (n + 1, char);
  memcpy (str, wp->token->chars, n);
  str[n] = '\0';
  return str;
}


/* Whitespace recognition.  */

/* Return true if C is an unquoted space, tab or newline.  */
static inline bool
is_whitespace (int c)
{
  switch (c)
    {
    case ' ':
    case '\t':
    case '\n':
      return true;

    default:
      return false;
    }
}

/* Operator character recognition.  */

/* Return true if C is one of the unquoted characters  | & ; < > ( )
   that can start a shell operator.  */
static inline bool
is_operator_start (int c)
{
  switch (c)
    {
    case '|':
    case '&':
    case ';':
    case '<':
    case '>':
    case '(':
    case ')':
      return true;

    default:
      return false;
    }
}


/* Denotation of a quoted character.
   The distinction between quoted and unquoted character is important only for
   the special, whitespace and operator characters; it is irrelevant for
   alphanumeric characters, '\\' and many others.
   QUOTED (c) lies above the 'unsigned char' range; converting it back to
   'unsigned char' yields c again.  */
#define QUOTED(c) (UCHAR_MAX + 1 + (c))
/* Values in the 'unsigned char' range are implicitly unquoted.  Among these,
   the following are important:
     '"'         opening or closing double quote
     '\''        opening or closing single quote
     '$'         the unknown result of a dollar expansion
     '`'         does not occur - replaced with OPENING_BACKQUOTE or
                 CLOSING_BACKQUOTE
 */
/* Pseudo characters for backquotes.  Their values lie above the range of
   the QUOTED values of 'unsigned char' characters.  */
#define OPENING_BACKQUOTE (2 * (UCHAR_MAX + 1) + '`')
#define CLOSING_BACKQUOTE (3 * (UCHAR_MAX + 1) + '`')

/* 2 characters of pushback are supported.
   In read_word, 2 characters are pending at the same time when an unquoted
   '<' or '>' in the middle of a word is followed by something other than
   '(': first the lookahead character is pushed back, then the '<' or '>'
   that terminates the word.
   NOTE(review): an earlier version of this comment said that 2 characters
   of pushback occur only when the first is an 'x'; in the code visible
   here, the 'x' of $'\x' is pushed back through phase1_ungetc, not through
   phase2_ungetc.  */
static int phase2_pushback[2];
static int phase2_pushback_length;

/* Return the next character, with backslashes removed.
   The result is QUOTED(c) for some unsigned char c, if the next character
   is escaped sufficiently often to make it a regular constituent character,
   or simply an 'unsigned char' if it has its special meaning (of special,
   whitespace or operator character), or OPENING_BACKQUOTE, CLOSING_BACKQUOTE,
   EOF.
   Characters pushed back with phase2_ungetc are returned first, unchanged.
   The function reads the quoting state (nested_backquotes,
   open_doublequotes_mask, open_doublequote, open_singlequote,
   open_singlequote_terminator) but does not modify it:
   It's the caller's responsibility to update the state.  */
static int
phase2_getc ()
{
  int c;

  if (phase2_pushback_length)
    {
      c = phase2_pushback[--phase2_pushback_length];
      /* phase2_ungetc decremented line_number for a newline; redo it.  */
      if (c == '\n')
        ++line_number;
      return c;
    }

  c = phase1_getc ();
  if (c == EOF)
    return c;
  /* A single-quote is literal inside "..." and inside an i18n quote $"...";
     otherwise it opens or closes a '...' string.  */
  if (c == '\'')
    return ((open_doublequote
             || (open_singlequote && open_singlequote_terminator != c))
            ? QUOTED (c)
            : c);
  if (open_singlequote)
    {
      /* Inside '...' (or $"..."), only the terminator is special.  */
      if (c == open_singlequote_terminator)
        return c;
    }
  else
    {
      /* Outside '...', these keep their special meaning, also inside
         "...".  */
      if (c == '"' || c == '$')
        return c;
      if (c == '`')
        return (nested_backquotes > 0 ? CLOSING_BACKQUOTE : OPENING_BACKQUOTE);
    }
  if (c == '\\')
    {
      /* Number of debackslashification passes that are active at the
         current point.  */
      unsigned int debackslashify =
        nested_backquotes + (open_singlequote ? 0 : 1);
      /* Normal number of backslashes that yield a single backslash in the
         final output.  */
      unsigned int expected_count =
        (unsigned int) 1 << debackslashify;
      /* Number of backslashes found.  */
      unsigned int count;

      /* Read further backslashes, up to expected_count in total.  When the
         loop ends through 'break', c is the first non-backslash character
         (possibly EOF).  */
      for (count = 1; count < expected_count; count++)
        {
          c = phase1_getc ();
          if (c != '\\')
            break;
        }
      if (count == expected_count)
        return '\\';

      /* The count of backslashes is > 0 and < expected_count, therefore the
         result depends on c, the first character after the backslashes.
         Note: The formulas below don't necessarily have a logic; they were
         empirically determined such that 1. the xgettext-30 test succeeds,
         2. the behaviour for count == 0 would correspond to the one without
         any backslash.  */
      if (c == '\'')
        {
          if (!open_singlequote && count > (expected_count >> 1))
            {
              phase1_ungetc (c);
              return '\\';
            }
          else
            return ((open_doublequote
                     || (open_singlequote && open_singlequote_terminator != c))
                    ? QUOTED (c)
                    : c);
        }
      else if (c == '"')
        {
          /* Each debackslashification pass converts \\ to \ and \" to ";
             passes corresponding to `...` drop a lone " whereas passes
             corresponding to "`...`" leave it alone.  Therefore, the
             minimum number of backslashes needed to get one double-quote
             in the end is  open_doublequotes_mask + 1.  */
          if (open_singlequote)
            {
              if (count > open_doublequotes_mask)
                {
                  phase1_ungetc (c);
                  return '\\';
                }
              else
                return (open_singlequote_terminator != c ? QUOTED (c) : c);
            }
          else
            {
              if (count > open_doublequotes_mask)
                return QUOTED (c);
              else
                /* Some of the count values <= open_doublequotes_mask are
                   actually invalid here, but we assume a syntactically
                   correct input file anyway.  */
                return c;
            }
        }
      else if (c == '`')
        {
          /* FIXME: This code looks fishy.  */
          if (count == expected_count - 1)
            return c;
          else
            /* Some of the count values < expected_count - 1 are
               actually invalid here, but we assume a syntactically
               correct input file anyway.  */
            if (nested_backquotes > 0 && !open_singlequote
                && count >= (expected_count >> 2))
              return OPENING_BACKQUOTE;
            else
              return CLOSING_BACKQUOTE;
        }
      else if (c == '$')
        {
          if (open_singlequote)
            return QUOTED (c);
          if (count >= (expected_count >> 1))
            return QUOTED (c);
          else
            return c;
        }
      else
        {
          /* When not followed by a quoting character or backslash or dollar,
             a backslash survives a debackslashification pass unmodified.
             Therefore each debackslashification pass performs a
               count := (count + 1) >> 1
             operation.  Therefore the minimum number of backslashes needed
             to get one backslash in the end is  (expected_count >> 1) + 1.  */
          /* Note: count is >= 1 at this point (the loop above starts at 1
             and only increments), so the 'count > 0' tests below are always
             true; the branches for count == 0 only document what would
             happen without any backslash.  */
          if (open_doublequote || open_singlequote)
            {
              if (count > 0)
                {
                  phase1_ungetc (c);
                  return '\\';
                }
              else
                return QUOTED (c);
            }
          else
            {
              if (count > (expected_count >> 1))
                {
                  phase1_ungetc (c);
                  return '\\';
                }
              else if (count > 0)
                return QUOTED (c);
              else
                return c;
            }
        }
    }

  /* Any other character: it is a regular constituent when inside quotes.  */
  return (open_singlequote || open_doublequote ? QUOTED (c) : c);
}

/* Push back the character C, so that the next phase2_getc returns it.
   EOF is ignored.  Supports 2 characters of pushback; aborts when the
   pushback stack is already full.  */
static void
phase2_ungetc (int c)
{
  if (c == EOF)
    return;

  /* phase2_getc will count the newline again when it is re-read.  */
  if (c == '\n')
    --line_number;

  if (phase2_pushback_length == SIZEOF (phase2_pushback))
    abort ();
  phase2_pushback[phase2_pushback_length] = c;
  phase2_pushback_length++;
}


/* Context lookup table.
   NOTE(review): presumably the table filled through xgettext_record_flag
   (see init_flag_table_sh) - confirm where it is assigned.  */
static flag_context_list_table_ty *flag_context_list_table;


/* Forward declaration of local functions.  */
/* read_command_list is called recursively from read_word, for the contents
   of `...`, $(...), <(...) and >(...).  */
static enum word_type read_command_list (int looking_for,
                                         flag_context_ty outer_context);



/* Read the next word.
   'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
   or '\0'.
   The result is stored in *WP: wp->type is always set; for a t_string word,
   wp->token is a freshly allocated token with the word's characters and
   wp->line_number_at_start is the line on which the word started.  For all
   other types, no token is left allocated.
   'context' is passed on to the nested command lists and to
   remember_a_message for $"..." strings.
   Side effects: updates the quoting state, the comment accumulation state
   (last_comment_line, last_non_comment_line, saved comments), and
   line_number through the getc functions.  */
static void
read_word (struct word *wp, int looking_for, flag_context_ty context)
{
  int c;
  bool all_unquoted_digits;

  /* Skip whitespace and comments that precede the word.  An unquoted
     newline is returned as a t_separator word.  */
  do
    {
      c = phase2_getc ();
      if (c == '#')
        {
          /* Skip a comment up to end of line.  */
          last_comment_line = line_number;
          comment_start ();
          for (;;)
            {
              c = phase1_getc ();
              if (c == EOF || c == '\n')
                break;
              /* We skip all leading white space, but not EOLs.  */
              if (!(buflen == 0 && (c == ' ' || c == '\t')))
                comment_add (c);
            }
          comment_line_end ();
        }
      if (c == '\n')
        {
          /* Comments assumed to be grouped with a message must immediately
             precede it, with no non-whitespace token on a line between
             both.  */
          if (last_non_comment_line > last_comment_line)
            savable_comment_reset ();
          wp->type = t_separator;
          return;
        }
    }
  while (is_whitespace (c));

  if (c == EOF)
    {
      wp->type = t_eof;
      return;
    }

  if (c == '<' || c == '>')
    {
      /* Recognize the redirection operators < > >| << <<- >> <> <& >&
         But <( and >) are handled below, not here.  */
      int c2 = phase2_getc ();
      if (c2 != '(')
        {
          if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&')
            {
              if (c == '<' && c2 == '<')
                {
                  int c3 = phase2_getc ();
                  if (c3 != '-')
                    phase2_ungetc (c3);
                }
            }
          else
            phase2_ungetc (c2);
          wp->type = t_redirect;
          return;
        }
      else
        /* The '(' is read again by the next call; c itself is returned as
           a t_other word by the is_operator_start test below.  */
        phase2_ungetc (c2);
    }

  if (looking_for == CLOSING_BACKQUOTE && c == CLOSING_BACKQUOTE)
    {
      saw_closing_backquote ();
      wp->type = t_backquote;
      last_non_comment_line = line_number;
      return;
    }

  if (looking_for == ')' && c == ')')
    {
      wp->type = t_paren;
      last_non_comment_line = line_number;
      return;
    }

  if (is_operator_start (c))
    {
      wp->type = (c == ';' ? t_separator : t_other);
      return;
    }

  /* From here on, we are reading a real word.  It starts out as a constant
     string and is demoted to t_other as soon as a substitution is seen.  */
  wp->type = t_string;
  wp->token = XMALLOC (struct token);
  init_token (wp->token);
  wp->line_number_at_start = line_number;
  all_unquoted_digits = true;

  for (;; c = phase2_getc ())
    {
      if (c == EOF)
        break;

      if (all_unquoted_digits && (c == '<' || c == '>'))
        {
          /* Recognize the redirection operators < > >| << <<- >> <> <& >&
             prefixed with a nonempty sequence of unquoted digits.  */
          int c2 = phase2_getc ();
          if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&')
            {
              if (c == '<' && c2 == '<')
                {
                  int c3 = phase2_getc ();
                  if (c3 != '-')
                    phase2_ungetc (c3);
                }
            }
          else
            phase2_ungetc (c2);

          wp->type = t_redirect;
          free_token (wp->token);
          free (wp->token);

          last_non_comment_line = line_number;

          return;
        }

      all_unquoted_digits = all_unquoted_digits && (c >= '0' && c <= '9');

      if (c == '$')
        {
          int c2;

          /* An unquoted dollar indicates we are not inside '...'.  */
          if (open_singlequote)
            abort ();
          /* After reading a dollar, we know that there is no pushed back
             character from an earlier lookahead.  */
          if (phase2_pushback_length > 0)
            abort ();
          /* Therefore we can use phase1 without interfering with phase2.
             We need to recognize $( outside and inside double-quotes.
             It would be incorrect to do
                c2 = phase2_getc ();
                if (c2 == '(' || c2 == QUOTED ('('))
             because that would also trigger for $\(.  */
          c2 = phase1_getc ();
          if (c2 == '(')
            {
              bool saved_open_doublequote;
              int c3;

              phase1_ungetc (c2);

              /* The entire inner command or arithmetic expression is read
                 ignoring possible surrounding double-quotes.  */
              saved_open_doublequote = open_doublequote;
              open_doublequote = false;

              c2 = phase2_getc ();
              if (c2 != '(')
                abort ();

              c3 = phase2_getc ();
              if (c3 == '(')
                {
                  /* Arithmetic expression (Bash syntax).  Skip until the
                     matching closing parenthesis.  */
                  unsigned int depth = 2;

                  do
                    {
                      c = phase2_getc ();
                      if (c == '(')
                        depth++;
                      else if (c == ')')
                        if (--depth == 0)
                          break;
                    }
                  while (c != EOF);
                }
              else
                {
                  /* Command substitution (Bash syntax).  */
                  phase2_ungetc (c3);
                  read_command_list (')', context);
                }

              open_doublequote = saved_open_doublequote;
            }
          else
            {
              phase1_ungetc (c2);
              c2 = phase2_getc ();

              if (c2 == '\'' && !open_singlequote)
                {
                  /* Bash builtin for string with ANSI-C escape sequences.  */
                  for (;;)
                    {
                      /* We have to use phase1 throughout this loop,
                         because phase2 does debackslashification,
                         which is undesirable when parsing ANSI-C
                         escape sequences.  */
                      c = phase1_getc ();
                      if (c == EOF)
                        break;
                      if (c == '\'')
                        break;
                      if (c == '\\')
                        {
                          c = phase1_getc ();
                          switch (c)
                            {
                            default:
                              /* Unknown escape: keep the backslash and
                                 re-read the character after it.  */
                              phase1_ungetc (c);
                              c = '\\';
                              break;

                            case '\\':
                              break;
                            case '\'':
                              break;
                            case '"':
                              break;

                            case 'a':
                              c = '\a';
                              break;
                            case 'b':
                              c = '\b';
                              break;
                            case 'e':
                            case 'E':
                              c = 0x1b; /* ESC */
                              break;
                            case 'f':
                              c = '\f';
                              break;
                            case 'n':
                              c = '\n';
                              break;
                            case 'r':
                              c = '\r';
                              break;
                            case 't':
                              c = '\t';
                              break;
                            case 'v':
                              c = '\v';
                              break;

                            case 'x':
                              /* \xH or \xHH: one or two hexadecimal
                                 digits.  */
                              c = phase1_getc ();
                              if ((c >= '0' && c <= '9')
                                  || (c >= 'A' && c <= 'F')
                                  || (c >= 'a' && c <= 'f'))
                                {
                                  int n;

                                  if (c >= '0' && c <= '9')
                                    n = c - '0';
                                  else if (c >= 'A' && c <= 'F')
                                    n = 10 + c - 'A';
                                  else if (c >= 'a' && c <= 'f')
                                    n = 10 + c - 'a';
                                  else
                                    abort ();

                                  c = phase1_getc ();
                                  if ((c >= '0' && c <= '9')
                                      || (c >= 'A' && c <= 'F')
                                      || (c >= 'a' && c <= 'f'))
                                    {
                                      if (c >= '0' && c <= '9')
                                        n = n * 16 + c - '0';
                                      else if (c >= 'A' && c <= 'F')
                                        n = n * 16 + 10 + c - 'A';
                                      else if (c >= 'a' && c <= 'f')
                                        n = n * 16 + 10 + c - 'a';
                                      else
                                        abort ();
                                    }
                                  else
                                    phase1_ungetc (c);

                                  c = n;
                                }
                              else
                                {
                                  /* NOTE(review): when c != EOF, these are
                                     two phase1_ungetc calls in a row, so the
                                     pushback buffer of phase1 must have room
                                     for 2 characters; otherwise the second
                                     call aborts.  */
                                  phase1_ungetc (c);
                                  phase1_ungetc ('x');
                                  c = '\\';
                                }
                              break;

                            case '0': case '1': case '2': case '3':
                            case '4': case '5': case '6': case '7':
                              {
                                /* Up to three octal digits.  */
                                int n = c - '0';

                                c = phase1_getc ();
                                if (c >= '0' && c <= '7')
                                  {
                                    n = n * 8 + c - '0';

                                    c = phase1_getc ();
                                    if (c >= '0' && c <= '7')
                                      n = n * 8 + c - '0';
                                    else
                                      phase1_ungetc (c);
                                  }
                                else
                                  phase1_ungetc (c);

                                c = n;
                              }
                              break;
                            }
                        }
                      if (wp->type == t_string)
                        {
                          grow_token (wp->token);
                          wp->token->chars[wp->token->charcount++] =
                            (unsigned char) c;
                        }
                    }
                  /* The result is a literal string.  Don't change wp->type.  */
                  continue;
                }
              else if (c2 == '"' && !open_doublequote)
                {
                  /* Bash builtin for internationalized string.  */
                  lex_pos_ty pos;
                  struct token string;

                  /* The contents are read like a single-quoted string whose
                     terminator is '"'.  */
                  saw_opening_singlequote ();
                  open_singlequote_terminator = '"';
                  pos.file_name = logical_file_name;
                  pos.line_number = line_number;
                  init_token (&string);
                  for (;;)
                    {
                      c = phase2_getc ();
                      if (c == EOF)
                        break;
                      if (c == '"')
                        {
                          saw_closing_singlequote ();
                          break;
                        }
                      grow_token (&string);
                      string.chars[string.charcount++] = (unsigned char) c;
                    }
                  remember_a_message (mlp, NULL, string_of_token (&string),
                                      context, &pos, NULL, savable_comment);
                  free_token (&string);

                  error_with_progname = false;
                  error (0, 0, _("%s:%lu: warning: the syntax $\"...\" is deprecated due to security reasons; use eval_gettext instead"),
                         pos.file_name, (unsigned long) pos.line_number);
                  error_with_progname = true;

                  /* The result at runtime is not constant. Therefore we
                     change wp->type.  */
                }
              else
                phase2_ungetc (c2);
            }
          wp->type = t_other;
          continue;
        }

      if (c == '\'')
        {
          if (!open_singlequote)
            {
              /* Handle an opening single quote.  */
              saw_opening_singlequote ();
            }
          else
            {
              /* Handle a closing single quote.  */
              saw_closing_singlequote ();
            }
          continue;
        }

      if (c == '"')
        {
          if (open_singlequote && open_singlequote_terminator == '"')
            {
              /* Handle a closing i18n quote.  */
              saw_closing_singlequote ();
            }
          else if (!open_doublequote)
            {
              /* Handle an opening double quote.  */
              saw_opening_doublequote ();
            }
          else
            {
              /* Handle a closing double quote.  */
              saw_closing_doublequote ();
            }
          continue;
        }

      if (c == OPENING_BACKQUOTE)
        {
          /* Handle an opening backquote.  */
          saw_opening_backquote ();

          read_command_list (CLOSING_BACKQUOTE, context);

          wp->type = t_other;
          continue;
        }
      if (c == CLOSING_BACKQUOTE)
        break;

      if (c == '<' || c == '>')
        {
          int c2;

          /* An unquoted c indicates we are not inside '...' nor "...".  */
          if (open_singlequote || open_doublequote)
            abort ();

          c2 = phase2_getc ();
          if (c2 == '(')
            {
              /* Process substitution (Bash syntax).  */
              read_command_list (')', context);

              wp->type = t_other;
              continue;
            }
          else
            phase2_ungetc (c2);
        }

      /* An unquoted whitespace or operator character ends the word.  */
      if (!open_singlequote && !open_doublequote
          && (is_whitespace (c) || is_operator_start (c)))
        break;

      if (wp->type == t_string)
        {
          grow_token (wp->token);
          wp->token->chars[wp->token->charcount++] = (unsigned char) c;
        }
    }

  /* Give back the character that terminated the word (a no-op for EOF), so
     that the next call sees it.  */
  phase2_ungetc (c);

  /* Only constant strings keep their token.  */
  if (wp->type != t_string)
    {
      free_token (wp->token);
      free (wp->token);
    }
  last_non_comment_line = line_number;
}


/* Parse a single command, one word at a time.
   'looking_for' is the terminator the enclosing parse is waiting for
   (CLOSING_BACKQUOTE, ')' or '\0'); it is handed unchanged to read_word.
   'outer_context' is the flag context of the enclosing construct; the
   context of each argument word is inherited from it.
   Returns the type of the word that ended the command, i.e. one of
   t_separator, t_backquote, t_paren or t_eof.  */
static enum word_type
read_command (int looking_for, flag_context_ty outer_context)
{
  /* Field splitting at whitespace and wildcard expansion are deliberately
     ignored: every word in the source is assumed to produce exactly one
     word of the resulting command.
     The 'gettext'/'ngettext' command is not required to come first in the
     command, for two reasons: prefixes such as "$verbose" may expand to
     nothing, and finding the real start of a command inside a $(for ...)
     or $(case ...) compound command would take a lot of effort.  */
  int argno = 0;                      /* Number of the current argument.  */
  bool skip_redirect_target = false;  /* Set just after a redirection.  */
  flag_context_list_iterator_ty ctx_iter;
  const struct callshapes *shapes_found = NULL;
  struct arglist_parser *parser = NULL;

  for (;;)
    {
      struct word cur;
      flag_context_ty cur_context;

      /* argno only becomes nonzero after a function position has been
         processed, and that is also where ctx_iter gets its value.  */
      if (argno != 0)
        cur_context =
          inherited_context (outer_context,
                             flag_context_list_iterator_advance (&ctx_iter));
      else
        cur_context = null_context;

      read_word (&cur, looking_for, cur_context);

      /* A terminator word ends the command.  */
      switch (cur.type)
        {
        case t_separator:
        case t_backquote:
        case t_paren:
        case t_eof:
          if (parser != NULL)
            arglist_parser_done (parser, argno);
          return cur.type;

        default:
          break;
        }

      /* In extract-all mode, every string word is recorded as a message.  */
      if (extract_all && cur.type == t_string)
        {
          lex_pos_ty where;

          where.line_number = cur.line_number_at_start;
          where.file_name = logical_file_name;
          remember_a_message (mlp, NULL, string_of_word (&cur),
                              cur_context, &where, NULL, savable_comment);
        }

      if (skip_redirect_target)
        {
          /* This word is the operand of a redirection operator; it does
             not count as an argument.  */
          skip_redirect_target = false;
        }
      else if (cur.type == t_redirect)
        {
          /* Neither the operator nor the word after it is an argument.  */
          skip_redirect_target = true;
        }
      else if (parser == NULL)
        {
          /* Function position.  */
          if (cur.type != t_string)
            ctx_iter = null_context_list_iterator;
          else
            {
              char *name = string_of_word (&cur);
              size_t name_len = strlen (name);
              void *found;

              if (hash_find_entry (&keywords, name, name_len, &found) == 0)
                shapes_found = (const struct callshapes *) found;

              parser = arglist_parser_alloc (mlp, shapes_found);

              ctx_iter =
                flag_context_list_iterator (
                  flag_context_list_table_lookup (flag_context_list_table,
                                                  name, name_len));

              free (name);
            }

          /* The word after the function position is argument number 1.  */
          argno = 1;
        }
      else
        {
          /* Argument position.  */
          if (cur.type == t_string)
            arglist_parser_remember (parser, argno,
                                     string_of_word (&cur),
                                     cur_context,
                                     logical_file_name,
                                     cur.line_number_at_start,
                                     savable_comment);

          if (arglist_parser_decidedp (parser, argno))
            {
              /* The call is decided: stop collecting arguments for this
                 function name.  */
              /* FIXME: What about ctx_iter?  */
              arglist_parser_done (parser, argno);
              parser = NULL;
              shapes_found = NULL;
            }

          argno++;
        }

      free_word (&cur);
    }
}


/* Parse a sequence of commands, one read_command call per command.
   'looking_for' is the parse terminator (CLOSING_BACKQUOTE, ')' or '\0')
   and is forwarded, together with 'outer_context', to read_command.
   Parsing continues for as long as commands end with a separator.
   Returns the type of the first terminating word that is not
   t_separator.  */
static enum word_type
read_command_list (int looking_for, flag_context_ty outer_context)
{
  enum word_type ended_by;

  do
    ended_by = read_command (looking_for, outer_context);
  while (ended_by == t_separator);

  return ended_by;
}


/* Entry point of the sh backend: scan the stream F and store the messages
   found into the first message list of MDLP.
   REAL_FILENAME and LOGICAL_FILENAME are saved in the file-level state for
   the duration of the scan (the logical name as a private copy), and
   FLAG_TABLE becomes the active flag context table.  */
void
extract_sh (FILE *f,
            const char *real_filename, const char *logical_filename,
            flag_context_list_table_ty *flag_table,
            msgdomain_list_ty *mdlp)
{
  /* Where messages go and which flag table applies.  */
  flag_context_list_table = flag_table;
  mlp = mdlp->item[0]->messages;

  /* Input stream and position tracking.  */
  fp = f;
  real_file_name = real_filename;
  logical_file_name = xstrdup (logical_filename);
  line_number = 1;

  /* No comment and no non-comment line has been seen so far.  */
  last_non_comment_line = -1;
  last_comment_line = -1;

  /* Start outside of any quoting or backquote nesting.  */
  open_singlequote = false;
  open_doublequote = false;
  open_doublequotes_mask = 0;
  nested_backquotes = 0;

  init_keywords ();

  /* Consume commands until end of file.  */
  read_command_list ('\0', null_context);

  /* Clear the per-file state again.  */
  line_number = 0;
  logical_file_name = NULL;
  real_file_name = NULL;
  fp = NULL;
}