/*
 * field.c - routines for dealing with fields and record parsing
 */

/*
 * Copyright (C) 1986, 1988, 1989, 1991-2018 the Free Software Foundation, Inc.
 *
 * This file is part of GAWK, the GNU implementation of the
 * AWK Programming Language.
 *
 * GAWK is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * GAWK is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 */

/* A preprocessing directive must start its own source line. */
#include "awk.h"

/*
 * In case that the system doesn't have isblank().
 * Don't bother with autoconf ifdef junk, just force it.
 * See dfa.c and regex_internal.h and regcomp.c. Bleah.
 *
 * Returns nonzero only for a space or a horizontal tab.
 */
static int
is_blank(int c)
{
	return c == ' ' || c == '\t';
}

/*
 * Setfunc --- callback used by the parsers to store one parsed piece.
 * Both implementations in this file (set_field and set_element) take
 * (number, start of text, length in bytes, target node).
 */
typedef void (* Setfunc)(long, char *, long, NODE *);

/* is the API currently overriding the default parsing mechanism? */
static bool api_parser_override = false;

/*
 * parse_field_func_t --- common signature of all the field parsers.
 * As named by the parser definitions in this file, the arguments are:
 * up_to, buf, len, fs, rp, set, n, sep_arr, in_middle.
 * The return value is the number of the last field stored.
 */
typedef long (*parse_field_func_t)(long, char **, int, NODE *,
			Regexp *, Setfunc, NODE *, NODE *, bool);

/* the parser currently in effect */
static parse_field_func_t parse_field;
/*
 * N.B. The normal_parse_field function pointer contains the parse_field value
 * that should be used except when API field parsing is overriding the default
 * field parsing mechanism.
*/
static parse_field_func_t normal_parse_field;

/* The field parsers; every one has the parse_field_func_t signature. */
static long re_parse_field(long, char **, int, NODE *,
			Regexp *, Setfunc, NODE *, NODE *, bool);
static long def_parse_field(long, char **, int, NODE *,
			Regexp *, Setfunc, NODE *, NODE *, bool);
static long null_parse_field(long, char **, int, NODE *,
			Regexp *, Setfunc, NODE *, NODE *, bool);
static long sc_parse_field(long, char **, int, NODE *,
			Regexp *, Setfunc, NODE *, NODE *, bool);
static long fw_parse_field(long, char **, int, NODE *,
			Regexp *, Setfunc, NODE *, NODE *, bool);
/* field width info recorded by the latest set_record() call; NULL if none */
static const awk_fieldwidth_info_t *api_fw = NULL;
static long fpat_parse_field(long, char **, int, NODE *,
			Regexp *, Setfunc, NODE *, NODE *, bool);

/* Helpers defined later in this file. */
static void set_element(long num, char * str, long len, NODE *arr);
static void grow_fields_arr(long num);
static void set_field(long num, char *str, long len, NODE *dummy);
static void purge_record(void);

static char *parse_extent;	/* marks where to restart parse of record */
static long parse_high_water = 0; /* field number that we have parsed so far */
static long nf_high_water = 0;	/* highest index allocated in fields_arr */
static bool resave_fs;		/* set by set_FS(); makes purge_record() refresh save_FS */
static NODE *save_FS;		/*
				 * save current value of FS when line is read,
				 * to be used in deferred parsing
				 */
static awk_fieldwidth_info_t *FIELDWIDTHS = NULL; /* parsed FIELDWIDTHS, built by set_FIELDWIDTHS() */

NODE **fields_arr;		/* array of pointers to the field nodes */
bool field0_valid;		/* $(>0) has not been changed yet */
int default_FS;			/* true when FS == " " */
Regexp *FS_re_yes_case = NULL;	/* FS regexp compiled case-sensitively */
Regexp *FS_re_no_case = NULL;	/* FS regexp compiled ignoring case */
Regexp *FS_regexp = NULL;	/* whichever of the two above is in use */
Regexp *FPAT_re_yes_case = NULL;
Regexp *FPAT_re_no_case = NULL;
Regexp *FPAT_regexp = NULL;
NODE *Null_field = NULL;	/* template copied into every unset field slot */

/*
 * init_fields --- set up the fields array to start with
 *
 * Allocates a one-element fields_arr whose $0 is a copy of Nnull_string
 * flagged NULL_FIELD, remembers the current FS in save_FS, and builds
 * the shared Null_field template.
 */

void
init_fields()
{
	emalloc(fields_arr, NODE **, sizeof(NODE *), "init_fields");

	getnode(fields_arr[0]);
	*fields_arr[0] = *Nnull_string;
	fields_arr[0]->flags |= NULL_FIELD;

	/* parsing (re)starts at the beginning of the empty $0 */
	parse_extent = fields_arr[0]->stptr;
	save_FS = dupnode(FS_node->var_value);

	getnode(Null_field);
	*Null_field =
*Nnull_string; Null_field->valref = 1; Null_field->flags = (STRCUR|STRING|NULL_FIELD); /* do not set MALLOC */ field0_valid = true; } /* grow_fields --- acquire new fields as needed */ static void grow_fields_arr(long num) { int t; NODE *n; erealloc(fields_arr, NODE **, (num + 1) * sizeof(NODE *), "grow_fields_arr"); for (t = nf_high_water + 1; t <= num; t++) { getnode(n); *n = *Null_field; fields_arr[t] = n; } nf_high_water = num; } /* set_field --- set the value of a particular field */ /*ARGSUSED*/ static void set_field(long num, char *str, long len, NODE *dummy ATTRIBUTE_UNUSED) /* just to make interface same as set_element */ { NODE *n; if (num > nf_high_water) grow_fields_arr(num); n = fields_arr[num]; n->stptr = str; n->stlen = len; n->flags = (STRCUR|STRING|USER_INPUT); /* do not set MALLOC */ } /* rebuild_record --- Someone assigned a value to $(something). Fix up $0 to be right */ void rebuild_record() { /* * use explicit unsigned longs for lengths, in case * a size_t isn't big enough. */ unsigned long tlen; NODE *tmp; char *ops; char *cops; long i; assert(NF != -1); tlen = 0; for (i = NF; i > 0; i--) { tmp = fields_arr[i]; tmp = force_string(tmp); tlen += tmp->stlen; } tlen += (NF - 1) * OFSlen; if ((long) tlen < 0) tlen = 0; emalloc(ops, char *, tlen + 1, "rebuild_record"); cops = ops; ops[0] = '\0'; for (i = 1; i <= NF; i++) { free_wstr(fields_arr[i]); tmp = fields_arr[i]; /* copy field */ if (tmp->stlen == 1) *cops++ = tmp->stptr[0]; else if (tmp->stlen != 0) { memcpy(cops, tmp->stptr, tmp->stlen); cops += tmp->stlen; } /* copy OFS */ if (i != NF) { if (OFSlen == 1) *cops++ = *OFS; else if (OFSlen != 0) { memcpy(cops, OFS, OFSlen); cops += OFSlen; } } } tmp = make_str_node(ops, tlen, ALREADY_MALLOCED); /* * Since we are about to unref fields_arr[0], we want to find * any fields that still point into it, and have them point * into the new field zero. This has to be done intelligently, * so that unrefing a field doesn't try to unref into the old $0. 
*/ for (cops = ops, i = 1; i <= NF; i++) { NODE *r = fields_arr[i]; /* * There is no reason to copy malloc'ed fields to point into * the new $0 buffer, although that's how previous versions did * it. It seems faster to leave the malloc'ed fields in place. */ if (r->stlen > 0 && (r->flags & MALLOC) == 0) { NODE *n; getnode(n); *n = *r; if (r->valref > 1) { /* * This can and does happen. It seems clear that * we can't leave r's stptr pointing into the * old $0 buffer that we are about to unref. */ emalloc(r->stptr, char *, r->stlen + 1, "rebuild_record"); memcpy(r->stptr, cops, r->stlen); r->stptr[r->stlen] = '\0'; r->flags |= MALLOC; n->valref = 1; // reset in the new field to start it off correctly! } n->stptr = cops; unref(r); fields_arr[i] = n; assert((n->flags & WSTRCUR) == 0); } cops += fields_arr[i]->stlen + OFSlen; } assert((fields_arr[0]->flags & MALLOC) == 0 ? fields_arr[0]->valref == 1 : true); unref(fields_arr[0]); fields_arr[0] = tmp; field0_valid = true; } /* * set_record: * setup $0, but defer parsing rest of line until reference is made to $(>0) * or to NF. At that point, parse only as much as necessary. * * Manage a private buffer for the contents of $0. Doing so keeps us safe * if `getline var' decides to rearrange the contents of the IOBUF that * $0 might have been pointing into. The cost is the copying of the buffer; * but better correct than fast. */ void set_record(const char *buf, int cnt, const awk_fieldwidth_info_t *fw) { NODE *n; static char *databuf; static unsigned long databuf_size; #define INITIAL_SIZE 512 #define MAX_SIZE ((unsigned long) ~0) /* maximally portable ... */ purge_record(); /* buffer management: */ if (databuf_size == 0) { /* first time */ ezalloc(databuf, char *, INITIAL_SIZE, "set_record"); databuf_size = INITIAL_SIZE; } /* * Make sure there's enough room. Since we sometimes need * to place a sentinel at the end, we make sure * databuf_size is > cnt after allocation. 
*/ if (cnt >= databuf_size) { do { if (databuf_size > MAX_SIZE/2) fatal(_("input record too large")); databuf_size *= 2; } while (cnt >= databuf_size); erealloc(databuf, char *, databuf_size, "set_record"); memset(databuf, '\0', databuf_size); } /* copy the data */ memcpy(databuf, buf, cnt); /* * Add terminating '\0' so that C library routines * will know when to stop. */ databuf[cnt] = '\0'; /* manage field 0: */ assert((fields_arr[0]->flags & MALLOC) == 0 ? fields_arr[0]->valref == 1 : true); unref(fields_arr[0]); getnode(n); n->stptr = databuf; n->stlen = cnt; n->valref = 1; n->type = Node_val; n->stfmt = STFMT_UNUSED; #ifdef HAVE_MPFR n->strndmode = MPFR_round_mode; #endif n->flags = (STRING|STRCUR|USER_INPUT); /* do not set MALLOC */ fields_arr[0] = n; if (fw != api_fw) { if ((api_fw = fw) != NULL) { if (! api_parser_override) { api_parser_override = true; parse_field = fw_parse_field; update_PROCINFO_str("FS", "API"); } } else if (api_parser_override) { api_parser_override = false; parse_field = normal_parse_field; update_PROCINFO_str("FS", current_field_sep_str()); } } #undef INITIAL_SIZE #undef MAX_SIZE } /* reset_record --- start over again with current $0 */ void reset_record() { fields_arr[0] = force_string(fields_arr[0]); purge_record(); } static void purge_record() { int i; NF = -1; for (i = 1; i <= parse_high_water; i++) { NODE *n; NODE *r = fields_arr[i]; if ((r->flags & MALLOC) == 0 && r->valref > 1) { /* This can and does happen. We must copy the string! */ const char *save = r->stptr; emalloc(r->stptr, char *, r->stlen + 1, "purge_record"); memcpy(r->stptr, save, r->stlen); r->stptr[r->stlen] = '\0'; r->flags |= MALLOC; } unref(r); getnode(n); *n = *Null_field; fields_arr[i] = n; } parse_high_water = 0; /* * $0 = $0 should resplit using the current value of FS. 
*/ if (resave_fs) { resave_fs = false; unref(save_FS); save_FS = dupnode(FS_node->var_value); } field0_valid = true; } /* set_NF --- handle what happens to $0 and fields when NF is changed */ void set_NF() { int i; long nf; NODE *n; assert(NF != -1); (void) force_number(NF_node->var_value); nf = get_number_si(NF_node->var_value); if (nf < 0) fatal(_("NF set to negative value")); NF = nf; if (NF > nf_high_water) grow_fields_arr(NF); if (parse_high_water < NF) { for (i = parse_high_water + 1; i >= 0 && i <= NF; i++) { unref(fields_arr[i]); getnode(n); *n = *Null_field; fields_arr[i] = n; } parse_high_water = NF; } else if (parse_high_water > 0) { for (i = NF + 1; i >= 0 && i <= parse_high_water; i++) { unref(fields_arr[i]); getnode(n); *n = *Null_field; fields_arr[i] = n; } parse_high_water = NF; } field0_valid = false; } /* * re_parse_field --- parse fields using a regexp. * * This is called both from get_field() and from do_split() * via (*parse_field)(). This variation is for when FS is a regular * expression -- either user-defined or because RS=="" and FS==" " */ static long re_parse_field(long up_to, /* parse only up to this field number */ char **buf, /* on input: string to parse; on output: point to start next */ int len, NODE *fs ATTRIBUTE_UNUSED, Regexp *rp, Setfunc set, /* routine to set the value of the parsed field */ NODE *n, NODE *sep_arr, /* array of field separators (maybe NULL) */ bool in_middle) { char *scan = *buf; long nf = parse_high_water; char *field; char *end = scan + len; int regex_flags = RE_NEED_START; char *sep; size_t mbclen = 0; mbstate_t mbs; memset(&mbs, 0, sizeof(mbstate_t)); if (in_middle) regex_flags |= RE_NO_BOL; if (up_to == UNLIMITED) nf = 0; if (len == 0) return nf; if (RS_is_null && default_FS) { sep = scan; while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n')) scan++; if (sep_arr != NULL && sep < scan) set_element(nf, sep, (long)(scan - sep), sep_arr); } if (rp == NULL) /* use FS */ rp = FS_regexp; field = 
scan;
	/* keep going while research() reports a match (anything but -1) */
	while (scan < end
	       && research(rp, scan, 0, (end - scan), regex_flags) != -1
	       && nf < up_to) {
		regex_flags |= RE_NO_BOL;
		if (REEND(rp, scan) == RESTART(rp, scan)) {	/* null match */
			/*
			 * An empty match separates nothing: step over one
			 * character (multibyte aware) so the scan advances.
			 */
			if (gawk_mb_cur_max > 1) {
				mbclen = mbrlen(scan, end-scan, &mbs);
				if ((mbclen == 1) || (mbclen == (size_t) -1)
					|| (mbclen == (size_t) -2) || (mbclen == 0)) {
					/* We treat it as a singlebyte character. */
					mbclen = 1;
				}
				scan += mbclen;
			} else
				scan++;
			if (scan == end) {
				/* ran off the end: the rest is the last field */
				(*set)(++nf, field, (long)(scan - field), n);
				up_to = nf;
				break;
			}
			continue;
		}
		/* the field runs from `field' up to the start of the match */
		(*set)(++nf, field,
		       (long)(scan + RESTART(rp, scan) - field), n);
		/* the matched text itself is the separator */
		if (sep_arr != NULL)
			set_element(nf, scan + RESTART(rp, scan),
				(long) (REEND(rp, scan) - RESTART(rp, scan)), sep_arr);
		scan += REEND(rp, scan);
		field = scan;
		if (scan == end)	/* FS at end of record */
			(*set)(++nf, field, 0L, n);
	}
	/* text left over after the last separator forms one more field */
	if (nf != up_to && scan < end) {
		(*set)(++nf, scan, (long)(end - scan), n);
		scan = end;
	}
	*buf = scan;
	return nf;
}

/*
 * def_parse_field --- default field parsing.
 *
 * This is called both from get_field() and from do_split()
 * via (*parse_field)(). This variation is for when FS is a single space
 * character.
 *
 * Returns the number of the last field stored; *buf is advanced to
 * where the next parse should resume.
 */

static long
def_parse_field(long up_to,	/* parse only up to this field number */
	char **buf,	/* on input: string to parse; on output: point to start next */
	int len,
	NODE *fs,
	Regexp *rp ATTRIBUTE_UNUSED,
	Setfunc set,	/* routine to set the value of the parsed field */
	NODE *n,
	NODE *sep_arr,	/* array of field separators (maybe NULL) */
	bool in_middle ATTRIBUTE_UNUSED)
{
	char *scan = *buf;
	long nf = parse_high_water;
	char *field;
	char *end = scan + len;
	char sav;
	char *sep;

	if (up_to == UNLIMITED)
		nf = 0;
	if (len == 0)
		return nf;

	/*
	 * Nasty special case. If FS set to "", return whole record
	 * as first field. This is not worth a separate function.
*/
	if (fs->stlen == 0) {
		(*set)(++nf, *buf, len, n);
		*buf += len;
		return nf;
	}

	/* before doing anything save the char at *end */
	sav = *end;
	/* because it will be destroyed now: */

	/*
	 * The sentinel lets the inner "skip non-blanks" loop below stop at
	 * end without a separate bounds test.
	 */
	*end = ' ';	/* sentinel character */
	sep = scan;
	for (; nf < up_to; scan++) {
		/*
		 * special case: fs is single space, strip leading whitespace
		 */
		while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
			scan++;

		/* record the run of whitespace just skipped as a separator */
		if (sep_arr != NULL && scan > sep)
			set_element(nf, sep, (long) (scan - sep), sep_arr);

		if (scan >= end)
			break;

		field = scan;

		/* the field extends to the next blank, tab, newline or sentinel */
		while (*scan != ' ' && *scan != '\t' && *scan != '\n')
			scan++;

		(*set)(++nf, field, (long)(scan - field), n);

		if (scan == end)
			break;

		sep = scan;
	}

	/* everything done, restore original char at *end */
	*end = sav;

	*buf = scan;
	return nf;
}

/*
 * null_parse_field --- each character is a separate field
 *
 * This is called both from get_field() and from do_split()
 * via (*parse_field)(). This variation is for when FS is the null string.
 *
 * Returns the number of the last field stored; *buf is advanced to
 * where the next parse should resume.
 */
static long
null_parse_field(long up_to,	/* parse only up to this field number */
	char **buf,	/* on input: string to parse; on output: point to start next */
	int len,
	NODE *fs ATTRIBUTE_UNUSED,
	Regexp *rp ATTRIBUTE_UNUSED,
	Setfunc set,	/* routine to set the value of the parsed field */
	NODE *n,
	NODE *sep_arr,	/* array of field separators (maybe NULL) */
	bool in_middle ATTRIBUTE_UNUSED)
{
	char *scan = *buf;
	long nf = parse_high_water;
	char *end = scan + len;

	if (up_to == UNLIMITED)
		nf = 0;
	if (len == 0)
		return nf;

	if (gawk_mb_cur_max > 1) {
		/* multibyte locale: one field per character, not per byte */
		mbstate_t mbs;
		memset(&mbs, 0, sizeof(mbstate_t));
		for (; nf < up_to && scan < end;) {
			size_t mbclen = mbrlen(scan, end-scan, &mbs);
			if ((mbclen == 1) || (mbclen == (size_t) -1)
				|| (mbclen == (size_t) -2) || (mbclen == 0)) {
				/* We treat it as a singlebyte character.
*/
				mbclen = 1;
			}
			/* separators between characters are empty strings */
			if (sep_arr != NULL && nf > 0)
				set_element(nf, scan, 0L, sep_arr);
			(*set)(++nf, scan, mbclen, n);
			scan += mbclen;
		}
	} else {
		/* single-byte locale: one field per byte */
		for (; nf < up_to && scan < end; scan++) {
			if (sep_arr != NULL && nf > 0)
				set_element(nf, scan, 0L, sep_arr);
			(*set)(++nf, scan, 1L, n);
		}
	}

	*buf = scan;
	return nf;
}

/*
 * sc_parse_field --- single character field separator
 *
 * This is called both from get_field() and from do_split()
 * via (*parse_field)(). This variation is for when FS is a single character
 * other than space.
 *
 * Returns the number of the last field stored; *buf is advanced to
 * where the next parse should resume.
 */
static long
sc_parse_field(long up_to,	/* parse only up to this field number */
	char **buf,	/* on input: string to parse; on output: point to start next */
	int len,
	NODE *fs,
	Regexp *rp ATTRIBUTE_UNUSED,
	Setfunc set,	/* routine to set the value of the parsed field */
	NODE *n,
	NODE *sep_arr,	/* array of field separators (maybe NULL) */
	bool in_middle ATTRIBUTE_UNUSED)
{
	char *scan = *buf;
	char fschar;
	long nf = parse_high_water;
	char *field;
	char *end = scan + len;
	char sav;
	size_t mbclen = 0;
	mbstate_t mbs;

	memset(&mbs, 0, sizeof(mbstate_t));

	if (up_to == UNLIMITED)
		nf = 0;
	if (len == 0)
		return nf;

	/* with RS == "" and an empty FS, newline is the separator */
	if (RS_is_null && fs->stlen == 0)
		fschar = '\n';
	else
		fschar = fs->stptr[0];

	/* before doing anything save the char at *end */
	sav = *end;
	/* because it will be destroyed now: */
	/* the sentinel lets the search loops below stop at end by themselves */
	*end = fschar;	/* sentinel character */

	for (; nf < up_to;) {
		field = scan;
		if (gawk_mb_cur_max > 1) {
			/* step character by character through multibyte text */
			while (*scan != fschar) {
				mbclen = mbrlen(scan, end-scan, &mbs);
				if ((mbclen == 1) || (mbclen == (size_t) -1)
					|| (mbclen == (size_t) -2) || (mbclen == 0)) {
					/* We treat it as a singlebyte character.
*/ mbclen = 1; } scan += mbclen; } } else { while (*scan != fschar) scan++; } (*set)(++nf, field, (long)(scan - field), n); if (scan == end) break; if (sep_arr != NULL) set_element(nf, scan, 1L, sep_arr); scan++; if (scan == end) { /* FS at end of record */ (*set)(++nf, field, 0L, n); break; } } /* everything done, restore original char at *end */ *end = sav; *buf = scan; return nf; } /* * calc_mbslen --- calculate the length in bytes of a multi-byte string * containing len characters. */ static size_t calc_mbslen(char *scan, char *end, size_t len, mbstate_t *mbs) { size_t mbclen; char *mbscan = scan; while (len-- > 0 && mbscan < end) { mbclen = mbrlen(mbscan, end - mbscan, mbs); if (!(mbclen > 0 && mbclen <= (size_t)(end - mbscan))) /* * We treat it as a singlebyte character. This should * catch error codes 0, (size_t) -1, and (size_t) -2. */ mbclen = 1; mbscan += mbclen; } return mbscan - scan; } /* * fw_parse_field --- field parsing using FIELDWIDTHS spec * * This is called from get_field() via (*parse_field)(). * This variation is for fields are fixed widths. */ static long fw_parse_field(long up_to, /* parse only up to this field number */ char **buf, /* on input: string to parse; on output: point to start next */ int len, NODE *fs ATTRIBUTE_UNUSED, Regexp *rp ATTRIBUTE_UNUSED, Setfunc set, /* routine to set the value of the parsed field */ NODE *n, NODE *dummy ATTRIBUTE_UNUSED, /* sep_arr not needed here: hence dummy */ bool in_middle ATTRIBUTE_UNUSED) { char *scan = *buf; long nf = parse_high_water; char *end = scan + len; const awk_fieldwidth_info_t *fw; mbstate_t mbs; size_t skiplen; size_t flen; fw = (api_parser_override ? api_fw : FIELDWIDTHS); if (up_to == UNLIMITED) nf = 0; if (len == 0) return nf; if (gawk_mb_cur_max > 1 && fw->use_chars) { /* * Reset the shift state. Arguably, the shift state should * be part of the file state and carried forward at all times, * but nobody has complained so far, so this may not matter * in practice. 
*/ memset(&mbs, 0, sizeof(mbstate_t)); while (nf < up_to && scan < end) { if (nf >= fw->nf) { *buf = end; return nf; } scan += calc_mbslen(scan, end, fw->fields[nf].skip, &mbs); flen = calc_mbslen(scan, end, fw->fields[nf].len, &mbs); (*set)(++nf, scan, (long) flen, n); scan += flen; } } else { while (nf < up_to && scan < end) { if (nf >= fw->nf) { *buf = end; return nf; } skiplen = fw->fields[nf].skip; if (skiplen > end - scan) skiplen = end - scan; scan += skiplen; flen = fw->fields[nf].len; if (flen > end - scan) flen = end - scan; (*set)(++nf, scan, (long) flen, n); scan += flen; } } *buf = scan; return nf; } /* invalidate_field0 --- $0 needs reconstruction */ void invalidate_field0() { field0_valid = false; } /* get_field --- return a particular $n */ /* assign is not NULL if this field is on the LHS of an assign */ NODE ** get_field(long requested, Func_ptr *assign) { bool in_middle = false; /* * if requesting whole line but some other field has been altered, * then the whole line must be rebuilt */ if (requested == 0) { if (! field0_valid) { /* first, parse remainder of input record */ if (NF == -1) { NF = (*parse_field)(UNLIMITED - 1, &parse_extent, fields_arr[0]->stlen - (parse_extent - fields_arr[0]->stptr), save_FS, FS_regexp, set_field, (NODE *) NULL, (NODE *) NULL, in_middle); parse_high_water = NF; } rebuild_record(); } if (assign != NULL) *assign = reset_record; return &fields_arr[0]; } /* assert(requested > 0); */ #if 0 if (assign != NULL) field0_valid = false; /* $0 needs reconstruction */ #else /* * Keep things uniform. Also, mere intention of assigning something * to $n should not make $0 invalid. Makes sense to invalidate $0 * after the actual assignment is performed. Not a real issue in * the interpreter otherwise, but causes problem in the * debugger when watching or printing fields. 
*/ if (assign != NULL) *assign = invalidate_field0; /* $0 needs reconstruction */ #endif if (requested <= parse_high_water) /* already parsed this field */ return &fields_arr[requested]; if (NF == -1) { /* have not yet parsed to end of record */ /* * parse up to requested fields, calling set_field() for each, * saving in parse_extent the point where the parse left off */ if (parse_high_water == 0) /* starting at the beginning */ parse_extent = fields_arr[0]->stptr; else in_middle = true; parse_high_water = (*parse_field)(requested, &parse_extent, fields_arr[0]->stlen - (parse_extent - fields_arr[0]->stptr), save_FS, NULL, set_field, (NODE *) NULL, (NODE *) NULL, in_middle); /* * if we reached the end of the record, set NF to the number of * fields so far. Note that requested might actually refer to * a field that is beyond the end of the record, but we won't * set NF to that value at this point, since this is only a * reference to the field and NF only gets set if the field * is assigned to -- this case is handled below */ if (parse_extent == fields_arr[0]->stptr + fields_arr[0]->stlen) NF = parse_high_water; else if (parse_field == fpat_parse_field) { /* FPAT parsing is weird, isolate the special cases */ char *rec_start = fields_arr[0]->stptr; char *rec_end = fields_arr[0]->stptr + fields_arr[0]->stlen; if ( parse_extent > rec_end || (parse_extent > rec_start && parse_extent < rec_end && requested == UNLIMITED-1)) NF = parse_high_water; else if (parse_extent == rec_start) /* could be no match for FPAT */ NF = 0; } if (requested == UNLIMITED - 1) /* UNLIMITED-1 means set NF */ requested = parse_high_water; } if (parse_high_water < requested) { /* requested beyond end of record */ if (assign != NULL) { /* expand record */ if (requested > nf_high_water) grow_fields_arr(requested); NF = requested; parse_high_water = requested; } else return &Null_field; } return &fields_arr[requested]; } /* set_element --- set an array element, used by do_split() */ static void 
set_element(long num, char *s, long len, NODE *n)
{
	NODE *it;
	NODE **lhs;
	NODE *sub;

	/* the new value: a string built from s/len, flagged as user input */
	it = make_string(s, len);
	it->flags |= USER_INPUT;
	/* the subscript is the number num */
	sub = make_number((AWKNUM) (num));
	lhs = assoc_lookup(n, sub);
	unref(*lhs);
	*lhs = it;
	/* call the array's astore hook, if it has one */
	if (n->astore != NULL)
		(*n->astore)(n, sub);
	unref(sub);
}

/*
 * do_split --- implement split(), semantics are same as for field splitting
 *
 * nargs is the number of arguments on the stack; with four, the last
 * one is the array that receives the separators. Returns a number node
 * holding the value returned by the parser that was used.
 */

NODE *
do_split(int nargs)
{
	NODE *src, *arr, *sep, *fs, *tmp, *sep_arr = NULL;
	char *s;
	long (*parseit)(long, char **, int, NODE *,
			 Regexp *, Setfunc, NODE *, NODE *, bool);
	Regexp *rp = NULL;

	if (nargs == 4) {
		static bool warned = false;

		if (do_traditional || do_posix) {
			fatal(_("split: fourth argument is a gawk extension"));
		}
		sep_arr = POP_PARAM();
		if (sep_arr->type != Node_var_array)
			fatal(_("split: fourth argument is not an array"));
		if ((do_lint || do_lint_old) && ! warned) {
			warned = true;
			lintwarn(_("split: fourth argument is a gawk extension"));
		}
	}

	sep = POP();
	arr = POP_PARAM();
	if (arr->type != Node_var_array)
		fatal(_("split: second argument is not an array"));

	if (sep_arr != NULL) {
		if (sep_arr == arr)
			fatal(_("split: cannot use the same array for second and fourth args"));

		/* These checks need to be done before clearing any of the arrays */
		for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
			if (tmp == arr)
				fatal(_("split: cannot use a subarray of second arg for fourth arg"));
		for (tmp = arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
			if (tmp == sep_arr)
				fatal(_("split: cannot use a subarray of fourth arg for second arg"));
		assoc_clear(sep_arr);
	}
	assoc_clear(arr);

	src = TOP_STRING();
	if (src->stlen == 0) {
		/*
		 * Skip the work if first arg is the null string.
		 */
		tmp = POP_SCALAR();
		DEREF(tmp);
		return make_number((AWKNUM) 0);
	}

	if ((sep->flags & REGEX) != 0)
		sep = sep->typed_re;

	/*
	 * A separator flagged FS_DFLT, while fields are split with FS and RS
	 * is not null, reuses the current field parser and FS.
	 */
	if (   (sep->re_flags & FS_DFLT) != 0
	    && current_field_sep() == Using_FS
	    && !
RS_is_null) {
		parseit = parse_field;
		fs = force_string(FS_node->var_value);
		rp = FS_regexp;
	} else {
		/* otherwise pick a parser from the separator's own text */
		fs = sep->re_exp;

		if (fs->stlen == 0) {
			static bool warned = false;

			/* empty separator: one element per character */
			parseit = null_parse_field;

			if (do_lint && ! warned) {
				warned = true;
				lintwarn(_("split: null string for third arg is a gawk extension"));
			}
		} else if (fs->stlen == 1 && (sep->re_flags & CONSTANT) == 0) {
			/* one character: whitespace splitting for " ", else that character */
			if (fs->stptr[0] == ' ') {
				parseit = def_parse_field;
			} else
				parseit = sc_parse_field;
		} else {
			/* anything else is treated as a regexp */
			parseit = re_parse_field;
			rp = re_update(sep);
		}
	}

	s = src->stptr;
	/* UNLIMITED: split the whole string; set_element stores into arr */
	tmp = make_number((AWKNUM) (*parseit)(UNLIMITED, &s, (int) src->stlen,
					     fs, rp, set_element, arr, sep_arr, false));

	src = POP_SCALAR();	/* really pop off stack */
	DEREF(src);
	return tmp;
}

/*
 * do_patsplit --- implement patsplit(), semantics are same as for field
 * splitting with FPAT.
 *
 * nargs is the number of arguments on the stack; with four, the last
 * one is the array that receives the separators.
 */

NODE *
do_patsplit(int nargs)
{
	NODE *src, *arr, *sep, *fpat, *tmp, *sep_arr = NULL;
	char *s;
	Regexp *rp = NULL;

	if (nargs == 4) {
		sep_arr = POP_PARAM();
		if (sep_arr->type != Node_var_array)
			fatal(_("patsplit: fourth argument is not an array"));
	}
	sep = POP();
	arr = POP_PARAM();
	if (arr->type != Node_var_array)
		fatal(_("patsplit: second argument is not an array"));

	src = TOP_STRING();

	if ((sep->flags & REGEX) != 0)
		sep = sep->typed_re;

	fpat = sep->re_exp;
	if (fpat->stlen == 0)
		fatal(_("patsplit: third argument must be non-null"));

	if (sep_arr != NULL) {
		if (sep_arr == arr)
			fatal(_("patsplit: cannot use the same array for second and fourth args"));

		/* These checks need to be done before clearing any of the arrays */
		for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
			if (tmp == arr)
				fatal(_("patsplit: cannot use a subarray of second arg for fourth arg"));
		for (tmp = arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
			if (tmp == sep_arr)
				fatal(_("patsplit: cannot use a subarray of fourth arg for second arg"));
		assoc_clear(sep_arr);
	}
	assoc_clear(arr);

	if (src->stlen == 0) {
		/*
		 * Skip the work if first arg is the null string.
*/ tmp = make_number((AWKNUM) 0); } else { rp = re_update(sep); s = src->stptr; tmp = make_number((AWKNUM) fpat_parse_field(UNLIMITED, &s, (int) src->stlen, fpat, rp, set_element, arr, sep_arr, false)); } src = POP_SCALAR(); /* really pop off stack */ DEREF(src); return tmp; } /* set_parser --- update the current (non-API) parser */ static void set_parser(parse_field_func_t func) { normal_parse_field = func; if (! api_parser_override && parse_field != func) { parse_field = func; update_PROCINFO_str("FS", current_field_sep_str()); } } /* set_FIELDWIDTHS --- handle an assignment to FIELDWIDTHS */ void set_FIELDWIDTHS() { char *scan; char *end; int i; static int fw_alloc = 4; static bool warned = false; bool fatal_error = false; NODE *tmp; if (do_lint && ! warned) { warned = true; lintwarn(_("`FIELDWIDTHS' is a gawk extension")); } if (do_traditional) /* quick and dirty, does the trick */ return; /* * If changing the way fields are split, obey least-surprise * semantics, and force $0 to be split totally. */ if (fields_arr != NULL) (void) get_field(UNLIMITED - 1, 0); set_parser(fw_parse_field); tmp = force_string(FIELDWIDTHS_node->var_value); scan = tmp->stptr; if (FIELDWIDTHS == NULL) { emalloc(FIELDWIDTHS, awk_fieldwidth_info_t *, awk_fieldwidth_info_size(fw_alloc), "set_FIELDWIDTHS"); FIELDWIDTHS->use_chars = awk_true; } FIELDWIDTHS->nf = 0; for (i = 0; ; i++) { unsigned long int tmp; if (i >= fw_alloc) { fw_alloc *= 2; erealloc(FIELDWIDTHS, awk_fieldwidth_info_t *, awk_fieldwidth_info_size(fw_alloc), "set_FIELDWIDTHS"); } /* Ensure that there is no leading `-' sign. Otherwise, strtoul would accept it and return a bogus result. */ while (is_blank(*scan)) { ++scan; } if (*scan == '-') { fatal_error = true; break; } if (*scan == '\0') break; // Look for skip value. We allow N:M and N:*. /* * Detect an invalid base-10 integer, a valid value that * is followed by something other than a blank or '\0', * or a value that is not in the range [1..UINT_MAX]. 
*/ errno = 0; tmp = strtoul(scan, &end, 10); if (errno == 0 && *end == ':' && (0 < tmp && tmp <= UINT_MAX)) { FIELDWIDTHS->fields[i].skip = tmp; scan = end + 1; if (*scan == '-' || is_blank(*scan)) { fatal_error = true; break; } // try scanning for field width tmp = strtoul(scan, &end, 10); } else FIELDWIDTHS->fields[i].skip = 0; if (errno != 0 || (*end != '\0' && ! is_blank(*end)) || !(0 < tmp && tmp <= UINT_MAX) ) { if (*scan == '*') { for (scan++; is_blank(*scan); scan++) continue; if (*scan != '\0') fatal(_("`*' must be the last designator in FIELDWIDTHS")); FIELDWIDTHS->fields[i].len = UINT_MAX; FIELDWIDTHS->nf = i+1; } else fatal_error = true; break; } FIELDWIDTHS->fields[i].len = tmp; FIELDWIDTHS->nf = i+1; scan = end; /* Skip past any trailing blanks. */ while (is_blank(*scan)) { ++scan; } if (*scan == '\0') break; } if (fatal_error) fatal(_("invalid FIELDWIDTHS value, for field %d, near `%s'"), i + 1, scan); } /* set_FS --- handle things when FS is assigned to */ void set_FS() { char buf[10]; NODE *fs; static NODE *save_fs = NULL; static NODE *save_rs = NULL; bool remake_re = true; /* * If changing the way fields are split, obey least-surprise * semantics, and force $0 to be split totally. */ if (fields_arr != NULL) (void) get_field(UNLIMITED - 1, 0); /* It's possible that only IGNORECASE changed, or FS = FS */ /* * This comparison can't use cmp_nodes(), which pays attention * to IGNORECASE, and that's not what we want. */ if (save_fs && FS_node->var_value->stlen == save_fs->stlen && memcmp(FS_node->var_value->stptr, save_fs->stptr, save_fs->stlen) == 0 && save_rs && RS_node->var_value->stlen == save_rs->stlen && memcmp(RS_node->var_value->stptr, save_rs->stptr, save_rs->stlen) == 0) { if (FS_regexp != NULL) FS_regexp = (IGNORECASE ? 
FS_re_no_case : FS_re_yes_case);

		/* FS = FS */
		if (current_field_sep() == Using_FS) {
			return;
		} else {
			/* another mechanism was in use: reselect the parser, keep the regexps */
			remake_re = false;
			goto choose_fs_function;
		}
	}

	/* FS or RS really changed: remember the new values */
	unref(save_fs);
	save_fs = dupnode(FS_node->var_value);
	unref(save_rs);
	save_rs = dupnode(RS_node->var_value);
	resave_fs = true;	/* purge_record() will refresh save_FS */

	/*
	 * If FS_re_no_case assignment is fatal (make_regexp in remake_re)
	 * FS_regexp will be NULL with a non-null FS_re_yes_case.
	 * refree() handles null argument; no need for `if (FS_regexp != NULL)' below.
	 * Please do not remerge.
	 */
	refree(FS_re_yes_case);
	refree(FS_re_no_case);
	FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;


choose_fs_function:
	/* buf receives a regexp text only for the special cases below */
	buf[0] = '\0';
	default_FS = false;
	fs = force_string(FS_node->var_value);

	if (! do_traditional && fs->stlen == 0) {
		static bool warned = false;

		set_parser(null_parse_field);

		if (do_lint && ! warned) {
			warned = true;
			lintwarn(_("null string for `FS' is a gawk extension"));
		}
	} else if (fs->stlen > 1) {
		if (do_lint_old)
			warning(_("old awk does not support regexps as value of `FS'"));
		set_parser(re_parse_field);
	} else if (RS_is_null) {
		/* we know that fs->stlen <= 1 */
		set_parser(sc_parse_field);
		if (fs->stlen == 1) {
			/* with RS == "", newline separates fields as well as FS */
			if (fs->stptr[0] == ' ') {
				default_FS = true;
				strcpy(buf, "[ \t\n]+");
			} else if (fs->stptr[0] == '\\') {
				/* yet another special case */
				strcpy(buf, "[\\\\\n]");
			} else if (fs->stptr[0] == '\0') {
				/* and yet another special case */
				strcpy(buf, "[\\000\n]");
			} else if (fs->stptr[0] != '\n') {
				sprintf(buf, "[%c\n]", fs->stptr[0]);
			}
		}
	} else {
		set_parser(def_parse_field);

		if (fs->stlen == 1) {
			if (fs->stptr[0] == ' ')
				default_FS = true;
			else if (fs->stptr[0] == '\\')
				/* same special case */
				strcpy(buf, "[\\\\]");
			else
				set_parser(sc_parse_field);
		}
	}
	if (remake_re) {
		refree(FS_re_yes_case);
		refree(FS_re_no_case);
		FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;

		if (buf[0] != '\0') {
			/* a special case filled in buf: compile it, use the regexp parser */
			FS_re_yes_case = make_regexp(buf, strlen(buf), false, true, true);
			FS_re_no_case = make_regexp(buf, strlen(buf), true, true, true);
			FS_regexp = (IGNORECASE ?
FS_re_no_case : FS_re_yes_case); set_parser(re_parse_field); } else if (parse_field == re_parse_field) { FS_re_yes_case = make_regexp(fs->stptr, fs->stlen, false, true, true); FS_re_no_case = make_regexp(fs->stptr, fs->stlen, true, true, true); FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case); } else FS_re_yes_case = FS_re_no_case = FS_regexp = NULL; } /* * For FS = "c", we don't use IGNORECASE. But we must use * re_parse_field to get the character and the newline as * field separators. */ if (fs->stlen == 1 && parse_field == re_parse_field) FS_regexp = FS_re_yes_case; } /* current_field_sep --- return the field separator type */ field_sep_type current_field_sep() { if (api_parser_override) return Using_API; else if (parse_field == fw_parse_field) return Using_FIELDWIDTHS; else if (parse_field == fpat_parse_field) return Using_FPAT; else return Using_FS; } /* current_field_sep_str --- return the field separator type as a string */ const char * current_field_sep_str() { if (api_parser_override) return "API"; else if (parse_field == fw_parse_field) return "FIELDWIDTHS"; else if (parse_field == fpat_parse_field) return "FPAT"; else return "FS"; } /* update_PROCINFO_str --- update PROCINFO[sub] with string value */ void update_PROCINFO_str(const char *subscript, const char *str) { NODE **aptr; NODE *tmp; if (PROCINFO_node == NULL) return; tmp = make_string(subscript, strlen(subscript)); aptr = assoc_lookup(PROCINFO_node, tmp); unref(tmp); unref(*aptr); *aptr = make_string(str, strlen(str)); } /* update_PROCINFO_num --- update PROCINFO[sub] with numeric value */ void update_PROCINFO_num(const char *subscript, AWKNUM val) { NODE **aptr; NODE *tmp; if (PROCINFO_node == NULL) return; tmp = make_string(subscript, strlen(subscript)); aptr = assoc_lookup(PROCINFO_node, tmp); unref(tmp); unref(*aptr); *aptr = make_number(val); } /* set_FPAT --- handle an assignment to FPAT */ void set_FPAT() { static bool warned = false; static NODE *save_fpat = NULL; bool remake_re = 
true; NODE *fpat; if (do_lint && ! warned) { warned = true; lintwarn(_("`FPAT' is a gawk extension")); } if (do_traditional) /* quick and dirty, does the trick */ return; /* * If changing the way fields are split, obey least-suprise * semantics, and force $0 to be split totally. */ if (fields_arr != NULL) (void) get_field(UNLIMITED - 1, 0); /* It's possible that only IGNORECASE changed, or FPAT = FPAT */ /* * This comparison can't use cmp_nodes(), which pays attention * to IGNORECASE, and that's not what we want. */ if (save_fpat && FPAT_node->var_value->stlen == save_fpat->stlen && memcmp(FPAT_node->var_value->stptr, save_fpat->stptr, save_fpat->stlen) == 0) { if (FPAT_regexp != NULL) FPAT_regexp = (IGNORECASE ? FPAT_re_no_case : FPAT_re_yes_case); /* FPAT = FPAT */ if (current_field_sep() == Using_FPAT) { return; } else { remake_re = false; goto set_fpat_function; } } unref(save_fpat); save_fpat = dupnode(FPAT_node->var_value); refree(FPAT_re_yes_case); refree(FPAT_re_no_case); FPAT_re_yes_case = FPAT_re_no_case = FPAT_regexp = NULL; set_fpat_function: fpat = force_string(FPAT_node->var_value); set_parser(fpat_parse_field); if (remake_re) { refree(FPAT_re_yes_case); refree(FPAT_re_no_case); FPAT_re_yes_case = FPAT_re_no_case = FPAT_regexp = NULL; FPAT_re_yes_case = make_regexp(fpat->stptr, fpat->stlen, false, true, true); FPAT_re_no_case = make_regexp(fpat->stptr, fpat->stlen, true, true, true); FPAT_regexp = (IGNORECASE ? FPAT_re_no_case : FPAT_re_yes_case); } } /* * increment_scan --- macro to move scan pointer ahead by one character. * Implementation varies if doing MBS or not. 
*/ #define increment_scan(scanp, len) incr_scan(scanp, len, & mbs) /* incr_scan --- MBS version of increment_scan() */ static void incr_scan(char **scanp, size_t len, mbstate_t *mbs) { size_t mbclen = 0; if (gawk_mb_cur_max > 1) { mbclen = mbrlen(*scanp, len, mbs); if ( (mbclen == 1) || (mbclen == (size_t) -1) || (mbclen == (size_t) -2) || (mbclen == 0)) { /* We treat it as a singlebyte character. */ mbclen = 1; } *scanp += mbclen; } else (*scanp)++; } /* * fpat_parse_field --- parse fields using a regexp. * * This is called both from get_field() and from do_patsplit() * via (*parse_field)(). This variation is for when FPAT is a regular * expression -- use the value to find field contents. * * The FPAT parsing logic is a bit difficult to specify. In particular * to allow null fields at certain locations. To make the code as robust * as possible, an awk reference implementation was written and tested * as a first step, and later recoded in C, preserving its structure as * much as possible. * * # Reference implementation of the FPAT record parsing. * # * # Each loop iteration identifies a (separator[n-1],field[n]) pair. * # Each loop iteration must consume some characters, except for the first field. * # So a null field is only valid as a first field or after a non-null separator. * # A null record has no fields (not a single null field). 
* * function refpatsplit(string, fields, pattern, seps, * parse_start, sep_start, field_start, field_length, field_found, nf) # locals * { * # Local state variables: * # - parse_start: pointer to the first not yet consumed character * # - sep_start: pointer to the beginning of the parsed separator * # - field start: pointer to the beginning of the parsed field * # - field length: length of the parsed field * # - field_found: flag for succesful field match * # - nf: Number of fields found so far * * # Prepare for parsing * parse_start = 1 # first not yet parsed char * nf = 0 # fields found so far * delete fields * delete seps * * # Loop that consumes the whole record * while (parse_start <= length(string)) { # still something to parse * * # first attempt to match the next field * sep_start = parse_start * field_found = match(substr(string, parse_start), pattern) * * # check for an invalid null field and retry one character away * if (nf > 0 && field_found && RSTART==1 && RLENGTH==0) { * parse_start++ * field_found = match(substr(string, parse_start), pattern) * } * * # store the (sep[n-1],field[n]) pair * if (field_found) { * field_start = parse_start + RSTART - 1 * field_length = RLENGTH * seps[nf] = substr(string, sep_start, field_start-sep_start) * fields[++nf] = substr(string, field_start, field_length) * parse_start = field_start + field_length * * # store the final extra sep after the last field * } else { * seps[nf] = substr(string, sep_start) * parse_start = length(string) + 1 * } * } * * return nf * } */ static long fpat_parse_field(long up_to, /* parse only up to this field number */ char **buf, /* on input: string to parse; on output: point to start next */ int len, NODE *fs ATTRIBUTE_UNUSED, Regexp *rp, Setfunc set, /* routine to set the value of the parsed field */ NODE *n, NODE *sep_arr, /* array of field separators (may be NULL) */ bool in_middle) { char *scan = *buf; long nf = parse_high_water; char *start; char *end = scan + len; int regex_flags = 
RE_NEED_START; mbstate_t mbs; char* field_start; bool field_found = false; memset(&mbs, 0, sizeof(mbstate_t)); if (up_to == UNLIMITED) nf = 0; if (len == 0) return nf; if (rp == NULL) /* use FPAT */ rp = FPAT_regexp; while (scan < end && nf < up_to) { /* still something to parse */ /* first attempt to match the next field */ start = scan; field_found = research(rp, scan, 0, (end - scan), regex_flags) != -1; /* check for an invalid null field and retry one character away */ if (nf > 0 && field_found && REEND(rp, scan) == 0) { /* invalid null field */ increment_scan(& scan, end - scan); field_found = research(rp, scan, 0, (end - scan), regex_flags) != -1; } /* store the (sep[n-1],field[n]) pair */ if (field_found) { field_start = scan + RESTART(rp, scan); if (sep_arr != NULL) { /* store the separator */ if (field_start == start) /* match at front */ set_element(nf, start, 0L, sep_arr); else set_element(nf, start, (long) (field_start - start), sep_arr); } /* field is text that matched */ (*set)(++nf, field_start, (long)(REEND(rp, scan) - RESTART(rp, scan)), n); scan += REEND(rp, scan); } else { /* * No match, store the final extra separator after * the last field. */ if (sep_arr != NULL) set_element(nf, start, (long) (end - start), sep_arr); scan = end; } } /* * If the last field extends up to the end of the record, generate * a null trailing separator */ if (sep_arr != NULL && scan == end && field_found) set_element(nf, scan, 0L, sep_arr); *buf = scan; return nf; }