/* * liblognorm - a fast samples-based log normalization library * Copyright 2010-2018 by Rainer Gerhards and Adiscon GmbH. * * Modified by Pavel Levshin (pavel@levshin.spb.ru) in 2013 * * This file is part of liblognorm. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * * A copy of the LGPL v2.1 can be found in the file "COPYING" in this distribution. */
#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <assert.h>
#include <ctype.h>
#include <stdint.h>
#include <libestr.h>
#include "v1_liblognorm.h"
#include "internal.h"
#include "lognorm.h"
#include "v1_parser.h"
#include "v1_samp.h"
#ifdef FEATURE_REGEXP
#include <pcre.h>
#include <errno.h>
#endif
/* some helpers */ static inline int hParseInt(const unsigned char **buf, size_t *lenBuf) { const unsigned char *p = *buf; size_t len = *lenBuf; int i = 0; while(len > 0 && isdigit(*p)) { i = i * 10 + *p - '0'; ++p; --len; } *buf = p; *lenBuf = len; return i; }
/* parsers for the primitive types * * All parsers receive * * @param[in] str the to-be-parsed string * @param[in] strLen length of the to-be-parsed string * @param[in] offs an offset into the string * @param[in] node fieldlist with additional data; for simple * parsers, this sets variable "ed", which just is * string data. * @param[out] parsed bytes * @param[out] value ptr to json object containing parsed data * (can be unused, but if used *value MUST be NULL on entry) * * They will try to parse out "their" object from the string. If they * succeed, they: * * return 0 on success and LN_WRONGPARSER if this parser could * not successfully parse (but all went well otherwise) and something * else in case of an error. */
#define PARSER(ParserName) \
int ln_parse##ParserName(const char *const str, const size_t strLen, \
	size_t *const offs, \
	__attribute__((unused)) const ln_fieldList_t *node, \
	size_t *parsed, \
	__attribute__((unused)) struct json_object **value) \
{ \
	int r = LN_WRONGPARSER; \
	__attribute__((unused)) es_str_t *ed = node->data; \
	*parsed = 0;
#define FAILParser \
	goto parserdone; /* suppress warnings */ \
parserdone: \
	r = 0; \
	goto done; /* suppress warnings */ \
done:
#define ENDFailParser \
	return r; \
}
/** * Utilities to allow constructors of complex parsers to * easily process field-declaration arguments. */
#define FIELD_ARG_SEPERATOR ":"
#define MAX_FIELD_ARGS 10
struct pcons_args_s { int argc; char *argv[MAX_FIELD_ARGS]; }; typedef struct pcons_args_s pcons_args_t; static void free_pcons_args(pcons_args_t** dat_p) { pcons_args_t *dat = *dat_p; *dat_p = NULL; if (!
dat) { return; } while((--(dat->argc)) >= 0) { if (dat->argv[dat->argc] != NULL) free(dat->argv[dat->argc]); } free(dat); } static pcons_args_t* pcons_args(es_str_t *args, int expected_argc) { pcons_args_t *dat = NULL; char* orig_str = NULL; if ((dat = malloc(sizeof(pcons_args_t))) == NULL) goto fail; dat->argc = 0; if (args != NULL) { orig_str = es_str2cstr(args, NULL); char *str = orig_str; while (dat->argc < MAX_FIELD_ARGS) { int i = dat->argc++; char *next = (dat->argc == expected_argc) ? NULL : strstr(str, FIELD_ARG_SEPERATOR); if (next == NULL) { if ((dat->argv[i] = strdup(str)) == NULL) goto fail; break; } else { if ((dat->argv[i] = strndup(str, next - str)) == NULL) goto fail; next++; } str = next; } } goto done; fail: if (dat != NULL) free_pcons_args(&dat); done: if (orig_str != NULL) free(orig_str); return dat; } static const char* pcons_arg(pcons_args_t *dat, int i, const char* dflt_val) { if (i >= dat->argc) return dflt_val; return dat->argv[i]; } static char* pcons_arg_copy(pcons_args_t *dat, int i, const char* dflt_val) { const char *str = pcons_arg(dat, i, dflt_val); return (str == NULL) ? NULL : strdup(str); } static void pcons_unescape_arg(pcons_args_t *dat, int i) { char *arg = (char*) pcons_arg(dat, i, NULL); es_str_t *str = NULL; if (arg != NULL) { str = es_newStrFromCStr(arg, strlen(arg)); if (str != NULL) { es_unescapeStr(str); free(arg); dat->argv[i] = es_str2cstr(str, NULL); es_deleteStr(str); } } } /** * Parse a TIMESTAMP as specified in RFC5424 (subset of RFC3339). */ PARSER(RFC5424Date) const unsigned char *pszTS; /* variables to temporarily hold time information while we parse */ __attribute__((unused)) int year; int month; int day; int hour; /* 24 hour clock */ int minute; int second; __attribute__((unused)) int secfrac; /* fractional seconds (must be 32 bit!) */ __attribute__((unused)) int secfracPrecision; int OffsetHour; /* UTC offset in hours */ int OffsetMinute; /* UTC offset in minutes */ size_t len; size_t orglen; /* end variables to temporarily hold time information while we parse */ pszTS = (unsigned char*) str + *offs; len = orglen = strLen - *offs; year = hParseInt(&pszTS, &len); /* We take the liberty to accept slightly malformed timestamps e.g. in * the format of 2003-9-1T1:0:0. 
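 * (A fully formed example is 2003-09-01T01:00:00.123+02:00; the fractional seconds are optional, but a timezone, either 'Z' or a +/-HH:MM offset, is required.)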
*/ if(len == 0 || *pszTS++ != '-') goto done; --len; month = hParseInt(&pszTS, &len); if(month < 1 || month > 12) goto done; if(len == 0 || *pszTS++ != '-') goto done; --len; day = hParseInt(&pszTS, &len); if(day < 1 || day > 31) goto done; if(len == 0 || *pszTS++ != 'T') goto done; --len; hour = hParseInt(&pszTS, &len); if(hour < 0 || hour > 23) goto done; if(len == 0 || *pszTS++ != ':') goto done; --len; minute = hParseInt(&pszTS, &len); if(minute < 0 || minute > 59) goto done; if(len == 0 || *pszTS++ != ':') goto done; --len; second = hParseInt(&pszTS, &len); if(second < 0 || second > 60) goto done; /* Now let's see if we have secfrac */ if(len > 0 && *pszTS == '.') { --len; const unsigned char *pszStart = ++pszTS; secfrac = hParseInt(&pszTS, &len); secfracPrecision = (int) (pszTS - pszStart); } else { secfracPrecision = 0; secfrac = 0; } /* check the timezone */ if(len == 0) goto done; if(*pszTS == 'Z') { --len; pszTS++; /* eat Z */ } else if((*pszTS == '+') || (*pszTS == '-')) { --len; pszTS++; OffsetHour = hParseInt(&pszTS, &len); if(OffsetHour < 0 || OffsetHour > 23) goto done; if(len == 0 || *pszTS++ != ':') goto done; --len; OffsetMinute = hParseInt(&pszTS, &len); if(OffsetMinute < 0 || OffsetMinute > 59) goto done; } else { /* there MUST be TZ information */ goto done; } if(len > 0) { if(*pszTS != ' ') /* if it is not a space, it can not be a "good" time */ goto done; } /* we had success, so update parse pointer */ *parsed = orglen - len; r = 0; /* success */ done: return r; } /** * Parse a RFC3164 Date. */ PARSER(RFC3164Date) const unsigned char *p; size_t len, orglen; /* variables to temporarily hold time information while we parse */ __attribute__((unused)) int month; int day; #if 0 /* TODO: why does this still exist? */ int year = 0; /* 0 means no year provided */ #endif int hour; /* 24 hour clock */ int minute; int second; p = (unsigned char*) str + *offs; orglen = len = strLen - *offs; /* If we look at the month (Jan, Feb, Mar, Apr, May, Jun, Jul, Aug, Sep, Oct, Nov, Dec), * we may see the following character sequences occur: * * J(an/u(n/l)), Feb, Ma(r/y), A(pr/ug), Sep, Oct, Nov, Dec * * We will use this for parsing, as it probably is the * fastest way to parse it. 
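 * Note that the month names are matched case-insensitively, so "jan", "Jan" and "JAN" are all accepted.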
*/ if(len < 3) goto done; switch(*p++) { case 'j': case 'J': if(*p == 'a' || *p == 'A') { ++p; if(*p == 'n' || *p == 'N') { ++p; month = 1; } else goto done; } else if(*p == 'u' || *p == 'U') { ++p; if(*p == 'n' || *p == 'N') { ++p; month = 6; } else if(*p == 'l' || *p == 'L') { ++p; month = 7; } else goto done; } else goto done; break; case 'f': case 'F': if(*p == 'e' || *p == 'E') { ++p; if(*p == 'b' || *p == 'B') { ++p; month = 2; } else goto done; } else goto done; break; case 'm': case 'M': if(*p == 'a' || *p == 'A') { ++p; if(*p == 'r' || *p == 'R') { ++p; month = 3; } else if(*p == 'y' || *p == 'Y') { ++p; month = 5; } else goto done; } else goto done; break; case 'a': case 'A': if(*p == 'p' || *p == 'P') { ++p; if(*p == 'r' || *p == 'R') { ++p; month = 4; } else goto done; } else if(*p == 'u' || *p == 'U') { ++p; if(*p == 'g' || *p == 'G') { ++p; month = 8; } else goto done; } else goto done; break; case 's': case 'S': if(*p == 'e' || *p == 'E') { ++p; if(*p == 'p' || *p == 'P') { ++p; month = 9; } else goto done; } else goto done; break; case 'o': case 'O': if(*p == 'c' || *p == 'C') { ++p; if(*p == 't' || *p == 'T') { ++p; month = 10; } else goto done; } else goto done; break; case 'n': case 'N': if(*p == 'o' || *p == 'O') { ++p; if(*p == 'v' || *p == 'V') { ++p; month = 11; } else goto done; } else goto done; break; case 'd': case 'D': if(*p == 'e' || *p == 'E') { ++p; if(*p == 'c' || *p == 'C') { ++p; month = 12; } else goto done; } else goto done; break; default: goto done; } len -= 3; /* done month */ if(len == 0 || *p++ != ' ') goto done; --len; /* we accept a slightly malformed timestamp with one-digit days. */ if(*p == ' ') { --len; ++p; } day = hParseInt(&p, &len); if(day < 1 || day > 31) goto done; if(len == 0 || *p++ != ' ') goto done; --len; /* time part */ hour = hParseInt(&p, &len); if(hour > 1970 && hour < 2100) { /* if so, we assume this actually is a year. This is a format found * e.g. in Cisco devices. * year = hour; */ /* re-query the hour, this time it must be valid */ if(len == 0 || *p++ != ' ') goto done; --len; hour = hParseInt(&p, &len); } if(hour < 0 || hour > 23) goto done; if(len == 0 || *p++ != ':') goto done; --len; minute = hParseInt(&p, &len); if(minute < 0 || minute > 59) goto done; if(len == 0 || *p++ != ':') goto done; --len; second = hParseInt(&p, &len); if(second < 0 || second > 60) goto done; /* we provide support for an extra ":" after the date. While this is an * invalid format, it occurs frequently enough (e.g. with Cisco devices) * to permit it as a valid case. -- rgerhards, 2008-09-12 */ if(len > 0 && *p == ':') { ++p; /* just skip past it */ --len; } /* we had success, so update parse pointer */ *parsed = orglen - len; r = 0; /* success */ done: return r; } /** * Parse a Number. * Note that a number is an abstracted concept. We always represent it * as 64 bits (but may later change our mind if performance dictates so). */ PARSER(Number) const char *c; size_t i; assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); c = str; for (i = *offs; i < strLen && isdigit(c[i]); i++); if (i == *offs) goto done; /* success, persist */ *parsed = i - *offs; r = 0; /* success */ done: return r; } /** * Parse a Real-number in floating-pt form. */ PARSER(Float) const char *c; size_t i; assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); c = str; int seen_point = 0; i = *offs; if (c[i] == '-') i++; for (; i < strLen; i++) { if (c[i] == '.') { if (seen_point != 0) break; seen_point = 1; } else if (! 
isdigit(c[i])) { break; } } if (i == *offs) goto done; /* success, persist */ *parsed = i - *offs; r = 0; /* success */ done: return r; }
/** * Parse a hex Number. * A hex number begins with 0x and contains only hex digits until the terminating * whitespace. Note that if a non-hex character is detected inside the number string, * this is NOT considered to be a number. */
PARSER(HexNumber) const char *c; size_t i = *offs; assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); c = str; if(c[i] != '0' || c[i+1] != 'x') goto done; for (i += 2 ; i < strLen && isxdigit(c[i]); i++); if (i == *offs || !isspace(c[i])) goto done; /* success, persist */ *parsed = i - *offs; r = 0; /* success */ done: return r; }
/** * Parse a kernel timestamp. * This is a fixed format, see * https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/kernel/printk/printk.c?id=refs/tags/v4.0#n1011 * This is the code that generates it: * sprintf(buf, "[%5lu.%06lu] ", (unsigned long)ts, rem_nsec / 1000); * We accept up to 12 digits for ts, everything above that for sure is * no timestamp. */
#define LEN_KERNEL_TIMESTAMP 14
PARSER(KernelTimestamp) const char *c; size_t i; assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); c = str; i = *offs; if(c[i] != '[' || i+LEN_KERNEL_TIMESTAMP > strLen || !isdigit(c[i+1]) || !isdigit(c[i+2]) || !isdigit(c[i+3]) || !isdigit(c[i+4]) || !isdigit(c[i+5]) ) goto done; i += 6; for(int j = 0 ; j < 7 && i < strLen && isdigit(c[i]) ; ) ++i, ++j; /* just scan */ if(i >= strLen || c[i] != '.') goto done; ++i; /* skip over '.' */ if( i+7 > strLen || !isdigit(c[i+0]) || !isdigit(c[i+1]) || !isdigit(c[i+2]) || !isdigit(c[i+3]) || !isdigit(c[i+4]) || !isdigit(c[i+5]) || c[i+6] != ']' ) goto done; i += 7; /* success, persist */ *parsed = i - *offs; r = 0; /* success */ done: return r; }
/** * Parse whitespace. * This parses all whitespace until the first non-whitespace character * is found. This is primarily a tool to skip to the next "word" if * the exact number of whitespace characters (and type of whitespace) * is not known. The current parsing position MUST be on a whitespace, * else the parser does not match. * This parser is also a forward-compatibility tool for the upcoming * slsa (simple log structure analyser) tool. */
PARSER(Whitespace) const char *c; size_t i = *offs; assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); c = str; if(!isspace(c[i])) goto done; for (i++ ; i < strLen && isspace(c[i]); i++); /* success, persist */ *parsed = i - *offs; r = 0; /* success */ done: return r; }
/** * Parse a word. * A word is a SP-delimited entity. The parser always works, except if * the offset is positioned on a space upon entry. */
PARSER(Word) const char *c; size_t i; assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); c = str; i = *offs; /* search end of word */ while(i < strLen && c[i] != ' ') i++; if(i == *offs) goto done; /* success, persist */ *parsed = i - *offs; r = 0; /* success */ done: return r; }
/** * Parse everything up to a specific string.
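 * The search string is taken from the field's extra data (ed); everything before its first occurrence is consumed (the search string itself is not included).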
 * swisskid, 2015-01-21 */
PARSER(StringTo) const char *c; char *toFind = NULL; size_t i, j, k, m; int chkstr; assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); assert(ed != NULL); k = es_strlen(ed) - 1; toFind = es_str2cstr(ed, NULL); c = str; i = *offs; chkstr = 0; /* Total hunt for letter */ while(chkstr == 0 && i < strLen ) { i++; if(c[i] == toFind[0]) { /* Found the first letter, now find the rest of the string */ j = 0; m = i; while(m < strLen && j < k ) { m++; j++; if(c[m] != toFind[j]) break; if (j == k) chkstr = 1; } } } if(i == *offs || i == strLen || c[i] != toFind[0]) goto done; /* success, persist */ *parsed = i - *offs; r = 0; /* success */ done: if(toFind != NULL) free(toFind); return r; }
/** * Parse an alphabetic word. * An alpha word is composed of characters for which isalpha returns true. * The parser fails if there is no alpha character at all. */
PARSER(Alpha) const char *c; size_t i; assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); c = str; i = *offs; /* search end of word */ while(i < strLen && isalpha(c[i])) i++; if(i == *offs) { goto done; } /* success, persist */ *parsed = i - *offs; r = 0; /* success */ done: return r; }
/** * Parse everything up to a specific character. * The character must be the only char inside extra data passed to the parser. * It is a program error if strlen(ed) != 1. It is considered a format error if * a) the to-be-parsed buffer is already positioned on the terminator character * b) there is no terminator until the end of the buffer * In those cases, the parser declares itself as not being successful, in all * other cases a string is extracted. */
PARSER(CharTo) const char *c; unsigned char cTerm; size_t i; assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); assert(es_strlen(ed) == 1); cTerm = *(es_getBufAddr(ed)); c = str; i = *offs; /* search end of word */ while(i < strLen && c[i] != cTerm) i++; if(i == *offs || i == strLen || c[i] != cTerm) goto done; /* success, persist */ *parsed = i - *offs; r = 0; /* success */ done: return r; }
/** * Parse everything up to a specific character, or up to the end of string. * The character must be the only char inside extra data passed to the parser. * It is a program error if strlen(ed) != 1. * This parser always returns success. * By nature of the parser, it is required that end of string or the separator * follows this field in the rule. */
PARSER(CharSeparated) const char *c; unsigned char cTerm; size_t i; assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); assert(es_strlen(ed) == 1); cTerm = *(es_getBufAddr(ed)); c = str; i = *offs; /* search end of word */ while(i < strLen && c[i] != cTerm) i++; /* success, persist */ *parsed = i - *offs; r = 0; /* success */ return r; }
/** * Parse yet-to-be-matched portion of string by re-applying * top-level rules again.
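 * Anything the recursive normalization leaves unmatched is reported under the configured remaining-field name ("tail" by default) and is not counted as parsed.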
*/
#define DEFAULT_REMAINING_FIELD_NAME "tail"
struct recursive_parser_data_s { ln_ctx ctx; char* remaining_field; int free_ctx; };
PARSER(Recursive) assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); struct recursive_parser_data_s* pData = (struct recursive_parser_data_s*) node->parser_data; if (pData != NULL) { int remaining_len = strLen - *offs; const char *remaining_str = str + *offs; json_object *unparsed = NULL; CHKN(*value = json_object_new_object()); ln_normalize(pData->ctx, remaining_str, remaining_len, value); if (json_object_object_get_ex(*value, UNPARSED_DATA_KEY, &unparsed)) { json_object_put(*value); *value = NULL; *parsed = 0; } else if (pData->remaining_field != NULL && json_object_object_get_ex(*value, pData->remaining_field, &unparsed)) { *parsed = strLen - *offs - json_object_get_string_len(unparsed); json_object_object_del(*value, pData->remaining_field); } else { *parsed = strLen - *offs; } } r = 0; /* success */ done: return r; }
typedef ln_ctx (ctx_constructor)(ln_ctx, pcons_args_t*, const char*);
static void* _recursive_parser_data_constructor(ln_fieldList_t *node, ln_ctx ctx, int no_of_args, int remaining_field_arg_idx, int free_ctx, ctx_constructor *fn) { int r = LN_BADCONFIG; char* name = NULL; struct recursive_parser_data_s *pData = NULL; pcons_args_t *args = NULL; CHKN(name = es_str2cstr(node->name, NULL)); CHKN(pData = calloc(1, sizeof(struct recursive_parser_data_s))); pData->free_ctx = free_ctx; pData->remaining_field = NULL; CHKN(args = pcons_args(node->raw_data, no_of_args)); CHKN(pData->ctx = fn(ctx, args, name)); CHKN(pData->remaining_field = pcons_arg_copy(args, remaining_field_arg_idx, DEFAULT_REMAINING_FIELD_NAME)); r = 0; done: if (r != 0) { if (name == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for recursive/descent field name"); else if (pData == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for parser-data for field: %s", name); else if (args == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for argument-parsing for field: %s", name); else if (pData->ctx == NULL) ln_dbgprintf(ctx, "recursive/descent normalizer context creation " "failed for field: %s", name); else if (pData->remaining_field == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for remaining-field name for " "recursive/descent field: %s", name); recursive_parser_data_destructor((void**) &pData); } free(name); free_pcons_args(&args); return pData; }
static ln_ctx identity_recursive_parse_ctx_constructor(ln_ctx parent_ctx, __attribute__((unused)) pcons_args_t* args, __attribute__((unused)) const char* field_name) { return parent_ctx; }
void* recursive_parser_data_constructor(ln_fieldList_t *node, ln_ctx ctx) { return _recursive_parser_data_constructor(node, ctx, 1, 0, 0, identity_recursive_parse_ctx_constructor); }
static ln_ctx child_recursive_parse_ctx_constructor(ln_ctx parent_ctx, pcons_args_t* args, const char* field_name) { int r = LN_BADCONFIG; const char* rb = NULL; ln_ctx ctx = NULL; pcons_unescape_arg(args, 0); CHKN(rb = pcons_arg(args, 0, NULL)); CHKN(ctx = ln_v1_inherittedCtx(parent_ctx)); CHKR(ln_v1_loadSamples(ctx, rb)); done: if (r != 0) { if (rb == NULL) ln_dbgprintf(parent_ctx, "file-name for descent rulebase not provided for field: %s", field_name); else if (ctx == NULL) ln_dbgprintf(parent_ctx, "couldn't allocate memory to create descent-field normalizer " "context for field: %s", field_name); else ln_dbgprintf(parent_ctx, "couldn't load samples into descent context for field: %s", field_name); if (ctx != NULL) ln_exitCtx(ctx); ctx =
NULL; } return ctx; } void* descent_parser_data_constructor(ln_fieldList_t *node, ln_ctx ctx) { return _recursive_parser_data_constructor(node, ctx, 2, 1, 1, child_recursive_parse_ctx_constructor); } void recursive_parser_data_destructor(void** dataPtr) { if (*dataPtr != NULL) { struct recursive_parser_data_s *pData = (struct recursive_parser_data_s*) *dataPtr; if (pData->free_ctx && pData->ctx != NULL) { ln_exitCtx(pData->ctx); pData->ctx = NULL; } if (pData->remaining_field != NULL) free(pData->remaining_field); free(pData); *dataPtr = NULL; } }; /** * Parse string tokenized by given char-sequence * The sequence may appear 0 or more times, but zero times means 1 token. * NOTE: its not 0 tokens, but 1 token. * * The token found is parsed according to the field-type provided after * tokenizer char-seq. */ #define DEFAULT_MATCHED_FIELD_NAME "default" struct tokenized_parser_data_s { es_str_t *tok_str; ln_ctx ctx; char *remaining_field; int use_default_field; int free_ctx; }; typedef struct tokenized_parser_data_s tokenized_parser_data_t; PARSER(Tokenized) assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); tokenized_parser_data_t *pData = (tokenized_parser_data_t*) node->parser_data; if (pData != NULL ) { json_object *json_p = NULL; if (pData->use_default_field) CHKN(json_p = json_object_new_object()); json_object *matches = NULL; CHKN(matches = json_object_new_array()); int remaining_len = strLen - *offs; const char *remaining_str = str + *offs; json_object *remaining = NULL; json_object *match = NULL; while (remaining_len > 0) { if (! pData->use_default_field) { json_object_put(json_p); json_p = json_object_new_object(); } /*TODO: handle null condition gracefully*/ ln_normalize(pData->ctx, remaining_str, remaining_len, &json_p); if (remaining) json_object_put(remaining); if (pData->use_default_field && json_object_object_get_ex(json_p, DEFAULT_MATCHED_FIELD_NAME, &match)) { json_object_array_add(matches, json_object_get(match)); } else if (! 
(pData->use_default_field || json_object_object_get_ex(json_p, UNPARSED_DATA_KEY, &match))) { json_object_array_add(matches, json_object_get(json_p)); } else { if (json_object_array_length(matches) > 0) { remaining_len += es_strlen(pData->tok_str); break; } else { json_object_put(json_p); json_object_put(matches); FAIL(LN_WRONGPARSER); } } if (json_object_object_get_ex(json_p, pData->remaining_field, &remaining)) { remaining_len = json_object_get_string_len(remaining); if (remaining_len > 0) { remaining_str = json_object_get_string(json_object_get(remaining)); json_object_object_del(json_p, pData->remaining_field); if (es_strbufcmp(pData->tok_str, (const unsigned char *)remaining_str, es_strlen(pData->tok_str))) { json_object_put(remaining); break; } else { remaining_str += es_strlen(pData->tok_str); remaining_len -= es_strlen(pData->tok_str); } } } else { remaining_len = 0; break; } if (pData->use_default_field) json_object_object_del(json_p, DEFAULT_MATCHED_FIELD_NAME); } json_object_put(json_p); /* success, persist */ *parsed = (strLen - *offs) - remaining_len; *value = matches; } else { FAIL(LN_BADPARSERSTATE); } r = 0; /* success */ done: return r; } void tokenized_parser_data_destructor(void** dataPtr) { tokenized_parser_data_t *data = (tokenized_parser_data_t*) *dataPtr; if (data->tok_str != NULL) es_deleteStr(data->tok_str); if (data->free_ctx && (data->ctx != NULL)) ln_exitCtx(data->ctx); if (data->remaining_field != NULL) free(data->remaining_field); free(data); *dataPtr = NULL; } static void load_generated_parser_samples(ln_ctx ctx, const char* const field_descr, const int field_descr_len, const char* const suffix, const int length) { static const char* const RULE_PREFIX = "rule=:%"DEFAULT_MATCHED_FIELD_NAME":";/*TODO: extract nice constants*/ static const int RULE_PREFIX_LEN = 15; char *sample_str = NULL; es_str_t *field_decl = es_newStrFromCStr(RULE_PREFIX, RULE_PREFIX_LEN); if (! field_decl) goto free; if (es_addBuf(&field_decl, field_descr, field_descr_len) || es_addBuf(&field_decl, "%", 1) || es_addBuf(&field_decl, suffix, length)) { ln_dbgprintf(ctx, "couldn't prepare field for tokenized field-picking: '%s'", field_descr); goto free; } sample_str = es_str2cstr(field_decl, NULL); if (! 
sample_str) { ln_dbgprintf(ctx, "couldn't prepare sample-string for: '%s'", field_descr); goto free; } ln_v1_loadSample(ctx, sample_str); free: if (sample_str) free(sample_str); if (field_decl) es_deleteStr(field_decl); } static ln_ctx generate_context_with_field_as_prefix(ln_ctx parent, const char* field_descr, int field_descr_len) { int r = LN_BADCONFIG; const char* remaining_field = "%"DEFAULT_REMAINING_FIELD_NAME":rest%"; ln_ctx ctx = NULL; CHKN(ctx = ln_v1_inherittedCtx(parent)); load_generated_parser_samples(ctx, field_descr, field_descr_len, remaining_field, strlen(remaining_field)); load_generated_parser_samples(ctx, field_descr, field_descr_len, "", 0); r = 0; done: if (r != 0) { ln_exitCtx(ctx); ctx = NULL; } return ctx; } static ln_fieldList_t* parse_tokenized_content_field(ln_ctx ctx, const char* field_descr, size_t field_descr_len) { es_str_t* tmp = NULL; es_str_t* descr = NULL; ln_fieldList_t *node = NULL; int r = 0; CHKN(tmp = es_newStr(80)); CHKN(descr = es_newStr(80)); const char* field_prefix = "%" DEFAULT_MATCHED_FIELD_NAME ":"; CHKR(es_addBuf(&descr, field_prefix, strlen(field_prefix))); CHKR(es_addBuf(&descr, field_descr, field_descr_len)); CHKR(es_addChar(&descr, '%')); es_size_t offset = 0; CHKN(node = ln_v1_parseFieldDescr(ctx, descr, &offset, &tmp, &r)); if (offset != es_strlen(descr)) FAIL(LN_BADPARSERSTATE); done: if (r != 0) { if (node != NULL) ln_deletePTreeNode(node); node = NULL; } if (descr != NULL) es_deleteStr(descr); if (tmp != NULL) es_deleteStr(tmp); return node; } void* tokenized_parser_data_constructor(ln_fieldList_t *node, ln_ctx ctx) { int r = LN_BADCONFIG; char* name = es_str2cstr(node->name, NULL); pcons_args_t *args = NULL; tokenized_parser_data_t *pData = NULL; const char *field_descr = NULL; ln_fieldList_t* field = NULL; const char *tok = NULL; CHKN(args = pcons_args(node->raw_data, 2)); CHKN(pData = calloc(1, sizeof(tokenized_parser_data_t))); pcons_unescape_arg(args, 0); CHKN(tok = pcons_arg(args, 0, NULL)); CHKN(pData->tok_str = es_newStrFromCStr(tok, strlen(tok))); es_unescapeStr(pData->tok_str); CHKN(field_descr = pcons_arg(args, 1, NULL)); const int field_descr_len = strlen(field_descr); pData->free_ctx = 1; CHKN(field = parse_tokenized_content_field(ctx, field_descr, field_descr_len)); if (field->parser == ln_parseRecursive) { pData->use_default_field = 0; struct recursive_parser_data_s *dat = (struct recursive_parser_data_s*) field->parser_data; if (dat != NULL) { CHKN(pData->remaining_field = strdup(dat->remaining_field)); pData->free_ctx = dat->free_ctx; pData->ctx = dat->ctx; dat->free_ctx = 0; } } else { pData->use_default_field = 1; CHKN(pData->ctx = generate_context_with_field_as_prefix(ctx, field_descr, field_descr_len)); } if (pData->remaining_field == NULL) CHKN(pData->remaining_field = strdup(DEFAULT_REMAINING_FIELD_NAME)); r = 0; done: if (r != 0) { if (name == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for tokenized-field name"); else if (args == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for argument-parsing for field: %s", name); else if (pData == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for parser-data for field: %s", name); else if (tok == NULL) ln_dbgprintf(ctx, "token-separator not provided for field: %s", name); else if (pData->tok_str == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for token-separator " "for field: %s", name); else if (field_descr == NULL) ln_dbgprintf(ctx, "field-type not provided for field: %s", name); else if (field == NULL) ln_dbgprintf(ctx, "couldn't resolve 
single-token field-type for tokenized field: %s", name); else if (pData->ctx == NULL) ln_dbgprintf(ctx, "couldn't initialize normalizer-context for field: %s", name); else if (pData->remaining_field == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for " "remaining-field-name for field: %s", name); if (pData) tokenized_parser_data_destructor((void**) &pData); } if (name != NULL) free(name); if (field != NULL) ln_deletePTreeNode(field); if (args) free_pcons_args(&args); return pData; } #ifdef FEATURE_REGEXP /** * Parse string matched by provided posix extended regex. * * Please note that using regex field in most cases will be * significantly slower than other field-types. */ struct regex_parser_data_s { pcre *re; int consume_group; int return_group; int max_groups; }; PARSER(Regex) assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); unsigned int* ovector = NULL; struct regex_parser_data_s *pData = (struct regex_parser_data_s*) node->parser_data; if (pData != NULL) { ovector = calloc(pData->max_groups, sizeof(unsigned int) * 3); if (ovector == NULL) FAIL(LN_NOMEM); int result = pcre_exec(pData->re, NULL, str, strLen, *offs, 0, (int*) ovector, pData->max_groups * 3); if (result == 0) result = pData->max_groups; if (result > pData->consume_group) { /*please check 'man 3 pcreapi' for cryptic '2 * n' and '2 * n + 1' magic*/ if (ovector[2 * pData->consume_group] == *offs) { *parsed = ovector[2 * pData->consume_group + 1] - ovector[2 * pData->consume_group]; if (pData->consume_group != pData->return_group) { char* val = NULL; if((val = strndup(str + ovector[2 * pData->return_group], ovector[2 * pData->return_group + 1] - ovector[2 * pData->return_group])) == NULL) { free(ovector); FAIL(LN_NOMEM); } *value = json_object_new_string(val); free(val); if (*value == NULL) { free(ovector); FAIL(LN_NOMEM); } } } } free(ovector); } r = 0; /* success */ done: return r; } static const char* regex_parser_configure_consume_and_return_group(pcons_args_t* args, struct regex_parser_data_s *pData) { const char* consume_group_parse_error = "couldn't parse consume-group number"; const char* return_group_parse_error = "couldn't parse return-group number"; char* tmp = NULL; const char* consume_grp_str = NULL; const char* return_grp_str = NULL; if ((consume_grp_str = pcons_arg(args, 1, "0")) == NULL || strlen(consume_grp_str) == 0) return consume_group_parse_error; if ((return_grp_str = pcons_arg(args, 2, consume_grp_str)) == NULL || strlen(return_grp_str) == 0) return return_group_parse_error; errno = 0; pData->consume_group = strtol(consume_grp_str, &tmp, 10); if (errno != 0 || strlen(tmp) != 0) return consume_group_parse_error; pData->return_group = strtol(return_grp_str, &tmp, 10); if (errno != 0 || strlen(tmp) != 0) return return_group_parse_error; return NULL; } void* regex_parser_data_constructor(ln_fieldList_t *node, ln_ctx ctx) { int r = LN_BADCONFIG; char* exp = NULL; const char* grp_parse_err = NULL; pcons_args_t* args = NULL; char* name = NULL; struct regex_parser_data_s *pData = NULL; const char *unescaped_exp = NULL; const char *error = NULL; int erroffset = 0; CHKN(name = es_str2cstr(node->name, NULL)); if (! 
(ctx->opts & LN_CTXOPT_ALLOW_REGEX)) FAIL(LN_BADCONFIG); CHKN(pData = malloc(sizeof(struct regex_parser_data_s))); pData->re = NULL; CHKN(args = pcons_args(node->raw_data, 3)); pData->consume_group = pData->return_group = 0; CHKN(unescaped_exp = pcons_arg(args, 0, NULL)); pcons_unescape_arg(args, 0); CHKN(exp = pcons_arg_copy(args, 0, NULL)); if ((grp_parse_err = regex_parser_configure_consume_and_return_group(args, pData)) != NULL) FAIL(LN_BADCONFIG); CHKN(pData->re = pcre_compile(exp, 0, &error, &erroffset, NULL)); pData->max_groups = ((pData->consume_group > pData->return_group) ? pData->consume_group : pData->return_group) + 1; r = 0; done: if (r != 0) { if (name == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for regex-field name"); else if (!(ctx->opts & LN_CTXOPT_ALLOW_REGEX)) ln_dbgprintf(ctx, "regex support is not enabled for: '%s' " "(please check lognorm context initialization)", name); else if (pData == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for parser-data for field: %s", name); else if (args == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for argument-parsing for field: %s", name); else if (unescaped_exp == NULL) ln_dbgprintf(ctx, "regular-expression missing for field: '%s'", name); else if (exp == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for regex-string for field: '%s'", name); else if (grp_parse_err != NULL) ln_dbgprintf(ctx, "%s for: '%s'", grp_parse_err, name); else if (pData->re == NULL) ln_dbgprintf(ctx, "couldn't compile regex (encountered error '%s' at char '%d' in pattern) " "for regex-matched field: '%s'", error, erroffset, name); regex_parser_data_destructor((void**)&pData); } if (exp != NULL) free(exp); if (args != NULL) free_pcons_args(&args); if (name != NULL) free(name); return pData; }
void regex_parser_data_destructor(void** dataPtr) { if ((*dataPtr) != NULL) { struct regex_parser_data_s *pData = (struct regex_parser_data_s*) *dataPtr; if (pData->re != NULL) pcre_free(pData->re); free(pData); *dataPtr = NULL; } }
#endif
/** * Interpret the value matched by a nested field-type as a specific type: * base-10 integer, base-16 integer, floating point or boolean. */
typedef enum interpret_type { /* If you change this, be sure to update json_type_to_name() too */ it_b10int, it_b16int, it_floating_pt, it_boolean } interpret_type; struct interpret_parser_data_s { ln_ctx ctx; enum interpret_type intrprt; };
static json_object* interpret_as_int(json_object *value, int base) { if (json_object_is_type(value, json_type_string)) { return json_object_new_int64(strtol(json_object_get_string(value), NULL, base)); } else if (json_object_is_type(value, json_type_int)) { return value; } else { return NULL; } }
static json_object* interpret_as_double(json_object *value) { double val = json_object_get_double(value); return json_object_new_double(val); }
static json_object* interpret_as_boolean(json_object *value) { json_bool val; if (json_object_is_type(value, json_type_string)) { const char* str = json_object_get_string(value); val = (strcasecmp(str, "false") == 0 || strcasecmp(str, "no") == 0) ?
0 : 1; } else { val = json_object_get_boolean(value); } return json_object_new_boolean(val); } static int reinterpret_value(json_object **value, enum interpret_type to_type) { switch(to_type) { case it_b10int: *value = interpret_as_int(*value, 10); break; case it_b16int: *value = interpret_as_int(*value, 16); break; case it_floating_pt: *value = interpret_as_double(*value); break; case it_boolean: *value = interpret_as_boolean(*value); break; default: return 0; } return 1; } PARSER(Interpret) assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); json_object *unparsed = NULL; json_object *parsed_raw = NULL; struct interpret_parser_data_s* pData = (struct interpret_parser_data_s*) node->parser_data; if (pData != NULL) { int remaining_len = strLen - *offs; const char *remaining_str = str + *offs; CHKN(parsed_raw = json_object_new_object()); ln_normalize(pData->ctx, remaining_str, remaining_len, &parsed_raw); if (json_object_object_get_ex(parsed_raw, UNPARSED_DATA_KEY, NULL)) { *parsed = 0; } else { json_object_object_get_ex(parsed_raw, DEFAULT_MATCHED_FIELD_NAME, value); json_object_object_get_ex(parsed_raw, DEFAULT_REMAINING_FIELD_NAME, &unparsed); if (reinterpret_value(value, pData->intrprt)) { *parsed = strLen - *offs - json_object_get_string_len(unparsed); } } json_object_put(parsed_raw); } r = 0; /* success */ done: return r; } void* interpret_parser_data_constructor(ln_fieldList_t *node, ln_ctx ctx) { int r = LN_BADCONFIG; char* name = NULL; struct interpret_parser_data_s *pData = NULL; pcons_args_t *args = NULL; int bad_interpret = 0; const char* type_str = NULL; const char *field_type = NULL; CHKN(name = es_str2cstr(node->name, NULL)); CHKN(pData = calloc(1, sizeof(struct interpret_parser_data_s))); CHKN(args = pcons_args(node->raw_data, 2)); CHKN(type_str = pcons_arg(args, 0, NULL)); if (strcmp(type_str, "int") == 0 || strcmp(type_str, "base10int") == 0) { pData->intrprt = it_b10int; } else if (strcmp(type_str, "base16int") == 0) { pData->intrprt = it_b16int; } else if (strcmp(type_str, "float") == 0) { pData->intrprt = it_floating_pt; } else if (strcmp(type_str, "bool") == 0) { pData->intrprt = it_boolean; } else { bad_interpret = 1; FAIL(LN_BADCONFIG); } CHKN(field_type = pcons_arg(args, 1, NULL)); CHKN(pData->ctx = generate_context_with_field_as_prefix(ctx, field_type, strlen(field_type))); r = 0; done: if (r != 0) { if (name == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for interpret-field name"); else if (pData == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for parser-data for field: %s", name); else if (args == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for argument-parsing for field: %s", name); else if (type_str == NULL) ln_dbgprintf(ctx, "no type provided for interpretation of field: %s", name); else if (bad_interpret != 0) ln_dbgprintf(ctx, "interpretation to unknown type '%s' requested for field: %s", type_str, name); else if (field_type == NULL) ln_dbgprintf(ctx, "field-type to actually match the content not provided for " "field: %s", name); else if (pData->ctx == NULL) ln_dbgprintf(ctx, "couldn't instantiate the normalizer context for matching " "field: %s", name); interpret_parser_data_destructor((void**) &pData); } free(name); free_pcons_args(&args); return pData; } void interpret_parser_data_destructor(void** dataPtr) { if (*dataPtr != NULL) { struct interpret_parser_data_s *pData = (struct interpret_parser_data_s*) *dataPtr; if (pData->ctx != NULL) ln_exitCtx(pData->ctx); free(pData); *dataPtr = NULL; } }; /** * Parse suffixed 
char-sequence, where suffix is one of many possible suffixes. */ struct suffixed_parser_data_s { int nsuffix; int *suffix_offsets; int *suffix_lengths; char* suffixes_str; ln_ctx ctx; char* value_field_name; char* suffix_field_name; }; PARSER(Suffixed) { assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); json_object *unparsed = NULL; json_object *parsed_raw = NULL; json_object *parsed_value = NULL; json_object *result = NULL; json_object *suffix = NULL; struct suffixed_parser_data_s *pData = (struct suffixed_parser_data_s*) node->parser_data; if (pData != NULL) { int remaining_len = strLen - *offs; const char *remaining_str = str + *offs; int i; CHKN(parsed_raw = json_object_new_object()); ln_normalize(pData->ctx, remaining_str, remaining_len, &parsed_raw); if (json_object_object_get_ex(parsed_raw, UNPARSED_DATA_KEY, NULL)) { *parsed = 0; } else { json_object_object_get_ex(parsed_raw, DEFAULT_MATCHED_FIELD_NAME, &parsed_value); json_object_object_get_ex(parsed_raw, DEFAULT_REMAINING_FIELD_NAME, &unparsed); const char* unparsed_frag = json_object_get_string(unparsed); for(i = 0; i < pData->nsuffix; i++) { const char* possible_suffix = pData->suffixes_str + pData->suffix_offsets[i]; int len = pData->suffix_lengths[i]; if (strncmp(possible_suffix, unparsed_frag, len) == 0) { CHKN(result = json_object_new_object()); CHKN(suffix = json_object_new_string(possible_suffix)); json_object_get(parsed_value); json_object_object_add(result, pData->value_field_name, parsed_value); json_object_object_add(result, pData->suffix_field_name, suffix); *parsed = strLen - *offs - json_object_get_string_len(unparsed) + len; break; } } if (result != NULL) { *value = result; } } } FAILParser if (r != 0) { if (result != NULL) json_object_put(result); } if (parsed_raw != NULL) json_object_put(parsed_raw); } ENDFailParser static struct suffixed_parser_data_s* _suffixed_parser_data_constructor(ln_fieldList_t *node, ln_ctx ctx, es_str_t* raw_args, const char* value_field, const char* suffix_field) { int r = LN_BADCONFIG; pcons_args_t* args = NULL; char* name = NULL; struct suffixed_parser_data_s *pData = NULL; const char *escaped_tokenizer = NULL; const char *uncopied_suffixes_str = NULL; const char *tokenizer = NULL; char *suffixes_str = NULL; const char *field_type = NULL; char *tok_saveptr = NULL; char *tok_input = NULL; int i = 0; char *tok = NULL; CHKN(name = es_str2cstr(node->name, NULL)); CHKN(pData = calloc(1, sizeof(struct suffixed_parser_data_s))); if (value_field == NULL) value_field = "value"; if (suffix_field == NULL) suffix_field = "suffix"; pData->value_field_name = strdup(value_field); pData->suffix_field_name = strdup(suffix_field); CHKN(args = pcons_args(raw_args, 3)); CHKN(escaped_tokenizer = pcons_arg(args, 0, NULL)); pcons_unescape_arg(args, 0); CHKN(tokenizer = pcons_arg(args, 0, NULL)); CHKN(uncopied_suffixes_str = pcons_arg(args, 1, NULL)); pcons_unescape_arg(args, 1); CHKN(suffixes_str = pcons_arg_copy(args, 1, NULL)); tok_input = suffixes_str; while (strtok_r(tok_input, tokenizer, &tok_saveptr) != NULL) { tok_input = NULL; pData->nsuffix++; } if (pData->nsuffix == 0) { FAIL(LN_INVLDFDESCR); } CHKN(pData->suffix_offsets = calloc(pData->nsuffix, sizeof(int))); CHKN(pData->suffix_lengths = calloc(pData->nsuffix, sizeof(int))); CHKN(pData->suffixes_str = pcons_arg_copy(args, 1, NULL)); tok_input = pData->suffixes_str; while ((tok = strtok_r(tok_input, tokenizer, &tok_saveptr)) != NULL) { tok_input = NULL; pData->suffix_offsets[i] = tok - pData->suffixes_str; 
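/* the suffix length is stored as well, so the match loop can use strncmp() without re-scanning */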
pData->suffix_lengths[i++] = strlen(tok); } CHKN(field_type = pcons_arg(args, 2, NULL)); CHKN(pData->ctx = generate_context_with_field_as_prefix(ctx, field_type, strlen(field_type))); r = 0; done: if (r != 0) { if (name == NULL) ln_dbgprintf(ctx, "couldn't allocate memory suffixed-field name"); else if (pData == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for parser-data for field: %s", name); else if (pData->value_field_name == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for value-field's name for field: %s", name); else if (pData->suffix_field_name == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for suffix-field's name for field: %s", name); else if (args == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for argument-parsing for field: %s", name); else if (escaped_tokenizer == NULL) ln_dbgprintf(ctx, "suffix token-string missing for field: '%s'", name); else if (tokenizer == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for unescaping token-string for field: '%s'", name); else if (uncopied_suffixes_str == NULL) ln_dbgprintf(ctx, "suffix-list missing for field: '%s'", name); else if (suffixes_str == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for suffix-list for field: '%s'", name); else if (pData->nsuffix == 0) ln_dbgprintf(ctx, "could't read suffix-value(s) for field: '%s'", name); else if (pData->suffix_offsets == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for suffix-list element references for field: " "'%s'", name); else if (pData->suffix_lengths == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for suffix-list element lengths for field: '%s'", name); else if (pData->suffixes_str == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for suffix-list for field: '%s'", name); else if (field_type == NULL) ln_dbgprintf(ctx, "field-type declaration missing for field: '%s'", name); else if (pData->ctx == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for normalizer-context for field: '%s'", name); suffixed_parser_data_destructor((void**)&pData); } free_pcons_args(&args); if (suffixes_str != NULL) free(suffixes_str); if (name != NULL) free(name); return pData; } void* suffixed_parser_data_constructor(ln_fieldList_t *node, ln_ctx ctx) { return _suffixed_parser_data_constructor(node, ctx, node->raw_data, NULL, NULL); } void* named_suffixed_parser_data_constructor(ln_fieldList_t *node, ln_ctx ctx) { int r = LN_BADCONFIG; pcons_args_t* args = NULL; char* name = NULL; const char* value_field_name = NULL; const char* suffix_field_name = NULL; const char* remaining_args = NULL; es_str_t* unnamed_suffix_args = NULL; struct suffixed_parser_data_s* pData = NULL; CHKN(name = es_str2cstr(node->name, NULL)); CHKN(args = pcons_args(node->raw_data, 3)); CHKN(value_field_name = pcons_arg(args, 0, NULL)); CHKN(suffix_field_name = pcons_arg(args, 1, NULL)); CHKN(remaining_args = pcons_arg(args, 2, NULL)); CHKN(unnamed_suffix_args = es_newStrFromCStr(remaining_args, strlen(remaining_args))); CHKN(pData = _suffixed_parser_data_constructor(node, ctx, unnamed_suffix_args, value_field_name, suffix_field_name)); r = 0; done: if (r != 0) { if (name == NULL) ln_dbgprintf(ctx, "couldn't allocate memory named_suffixed-field name"); else if (args == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for argument-parsing for field: %s", name); else if (value_field_name == NULL) ln_dbgprintf(ctx, "key-name for value not provided for field: %s", name); else if (suffix_field_name == NULL) ln_dbgprintf(ctx, "key-name for suffix not provided for field: %s", name); else if 
(unnamed_suffix_args == NULL) ln_dbgprintf(ctx, "couldn't allocate memory for unnamed-suffix-field args for field: %s", name); else if (pData == NULL) ln_dbgprintf(ctx, "couldn't create parser-data for field: %s", name); suffixed_parser_data_destructor((void**)&pData); } if (unnamed_suffix_args != NULL) free(unnamed_suffix_args); if (args != NULL) free_pcons_args(&args); if (name != NULL) free(name); return pData; }
void suffixed_parser_data_destructor(void** dataPtr) { if ((*dataPtr) != NULL) { struct suffixed_parser_data_s *pData = (struct suffixed_parser_data_s*) *dataPtr; if (pData->suffixes_str != NULL) free(pData->suffixes_str); if (pData->suffix_offsets != NULL) free(pData->suffix_offsets); if (pData->suffix_lengths != NULL) free(pData->suffix_lengths); if (pData->value_field_name != NULL) free(pData->value_field_name); if (pData->suffix_field_name != NULL) free(pData->suffix_field_name); if (pData->ctx != NULL) ln_exitCtx(pData->ctx); free(pData); *dataPtr = NULL; } }
/** * Just get everything till the end of string. */
PARSER(Rest) assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); /* silence the warning about unused variable */ (void)str; /* success, persist */ *parsed = strLen - *offs; r = 0; return r; }
/** * Parse a possibly quoted string. In this initial implementation, escaping of the quote * char is not supported. A quoted string is one that starts with a double quote, * has some text (not containing double quotes) and ends with the first double * quote character seen. The extracted string does NOT include the quote characters. * swisskid, 2015-01-21 */
PARSER(OpQuotedString) const char *c; size_t i; char *cstr = NULL; assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); c = str; i = *offs; if(c[i] != '"') { while(i < strLen && c[i] != ' ') i++; if(i == *offs) goto done; /* success, persist */ *parsed = i - *offs; /* create JSON value to save quoted string contents */ CHKN(cstr = strndup((char*)c + *offs, *parsed)); } else { ++i; /* search end of string */ while(i < strLen && c[i] != '"') i++; if(i == strLen || c[i] != '"') goto done; /* success, persist */ *parsed = i + 1 - *offs; /* "eat" terminal double quote */ /* create JSON value to save quoted string contents */ CHKN(cstr = strndup((char*)c + *offs + 1, *parsed - 2)); } CHKN(*value = json_object_new_string(cstr)); r = 0; /* success */ done: free(cstr); return r; }
/** * Parse a quoted string. In this initial implementation, escaping of the quote * char is not supported. A quoted string is one that starts with a double quote, * has some text (not containing double quotes) and ends with the first double * quote character seen. The extracted string does NOT include the quote characters. * rgerhards, 2011-01-14 */
PARSER(QuotedString) const char *c; size_t i; char *cstr = NULL; assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); c = str; i = *offs; if(i + 2 > strLen) goto done; /* needs at least 2 characters */ if(c[i] != '"') goto done; ++i; /* search end of string */ while(i < strLen && c[i] != '"') i++; if(i == strLen || c[i] != '"') goto done; /* success, persist */ *parsed = i + 1 - *offs; /* "eat" terminal double quote */ /* create JSON value to save quoted string contents */ CHKN(cstr = strndup((char*)c + *offs + 1, *parsed - 2)); CHKN(*value = json_object_new_string(cstr)); r = 0; /* success */ done: free(cstr); return r; }
/** * Parse an ISO date, that is YYYY-MM-DD (exactly this format). * Note: we do manual loop unrolling -- this is fast AND efficient.
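 * An accepted example is 2011-01-14; the date must be in the exact 10-character YYYY-MM-DD form, so 2011-1-14 is rejected.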
* rgerhards, 2011-01-14 */ PARSER(ISODate) const char *c; size_t i; assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); c = str; i = *offs; if(*offs+10 > strLen) goto done; /* if it is not 10 chars, it can't be an ISO date */ /* year */ if(!isdigit(c[i])) goto done; if(!isdigit(c[i+1])) goto done; if(!isdigit(c[i+2])) goto done; if(!isdigit(c[i+3])) goto done; if(c[i+4] != '-') goto done; /* month */ if(c[i+5] == '0') { if(c[i+6] < '1' || c[i+6] > '9') goto done; } else if(c[i+5] == '1') { if(c[i+6] < '0' || c[i+6] > '2') goto done; } else { goto done; } if(c[i+7] != '-') goto done; /* day */ if(c[i+8] == '0') { if(c[i+9] < '1' || c[i+9] > '9') goto done; } else if(c[i+8] == '1' || c[i+8] == '2') { if(!isdigit(c[i+9])) goto done; } else if(c[i+8] == '3') { if(c[i+9] != '0' && c[i+9] != '1') goto done; } else { goto done; } /* success, persist */ *parsed = 10; r = 0; /* success */ done: return r; } /** * Parse a Cisco interface spec. Sample for such a spec are: * outside:192.168.52.102/50349 * inside:192.168.1.15/56543 (192.168.1.112/54543) * outside:192.168.1.13/50179 (192.168.1.13/50179)(LOCAL\some.user) * outside:192.168.1.25/41850(LOCAL\RG-867G8-DEL88D879BBFFC8) * inside:192.168.1.25/53 (192.168.1.25/53) (some.user) * 192.168.1.15/0(LOCAL\RG-867G8-DEL88D879BBFFC8) * From this, we conclude the format is: * [interface:]ip/port [SP (ip2/port2)] [[SP](username)] * In order to match, this syntax must start on a non-whitespace char * other than colon. */ PARSER(CiscoInterfaceSpec) const char *c; size_t i; assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); c = str; i = *offs; if(c[i] == ':' || isspace(c[i])) goto done; /* first, check if we have an interface. We do this by trying * to detect if we have an IP. If we have, obviously no interface * is present. Otherwise, we check if we have a valid interface. 
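 * An interface name must not contain whitespace and is terminated by the colon that precedes the IP address.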
*/ int bHaveInterface = 0; size_t idxInterface = 0; size_t lenInterface = 0; int bHaveIP = 0; size_t lenIP; size_t idxIP = i; if(ln_parseIPv4(str, strLen, &i, node, &lenIP, NULL) == 0) { bHaveIP = 1; i += lenIP - 1; /* position on delimiter */ } else { idxInterface = i; while(i < strLen) { if(isspace(c[i])) goto done; if(c[i] == ':') break; ++i; } lenInterface = i - idxInterface; bHaveInterface = 1; } if(i == strLen) goto done; ++i; /* skip over colon */ /* we now utilize our other parser helpers */ if(!bHaveIP) { idxIP = i; if(ln_parseIPv4(str, strLen, &i, node, &lenIP, NULL) != 0) goto done; i += lenIP; } if(i == strLen || c[i] != '/') goto done; ++i; /* skip slash */ const size_t idxPort = i; size_t lenPort; if(ln_parseNumber(str, strLen, &i, node, &lenPort, NULL) != 0) goto done; i += lenPort; if(i == strLen) goto success; /* check if optional second ip/port is present * We assume we must at least have 5 chars [" (::1)"] */ int bHaveIP2 = 0; size_t idxIP2 = 0, lenIP2 = 0; size_t idxPort2 = 0, lenPort2 = 0; if(i+5 < strLen && c[i] == ' ' && c[i+1] == '(') { size_t iTmp = i+2; /* skip over " (" */ idxIP2 = iTmp; if(ln_parseIPv4(str, strLen, &iTmp, node, &lenIP2, NULL) == 0) { iTmp += lenIP2; if(i < strLen || c[iTmp] == '/') { ++iTmp; /* skip slash */ idxPort2 = iTmp; if(ln_parseNumber(str, strLen, &iTmp, node, &lenPort2, NULL) == 0) { iTmp += lenPort2; if(iTmp < strLen && c[iTmp] == ')') { i = iTmp + 1; /* match, so use new index */ bHaveIP2 = 1; } } } } } /* check if optional username is present * We assume we must at least have 3 chars ["(n)"] */ int bHaveUser = 0; size_t idxUser = 0; size_t lenUser = 0; if( (i+2 < strLen && c[i] == '(' && !isspace(c[i+1]) ) || (i+3 < strLen && c[i] == ' ' && c[i+1] == '(' && !isspace(c[i+2])) ) { idxUser = i + ((c[i] == ' ') ? 2 : 1); /* skip [SP]'(' */ size_t iTmp = idxUser; while(iTmp < strLen && !isspace(c[iTmp]) && c[iTmp] != ')') ++iTmp; /* just scan */ if(iTmp < strLen && c[iTmp] == ')') { i = iTmp + 1; /* we have a match, so use new index */ bHaveUser = 1; lenUser = iTmp - idxUser; } } /* all done, save data */ if(value == NULL) goto success; CHKN(*value = json_object_new_object()); json_object *json; if(bHaveInterface) { CHKN(json = json_object_new_string_len(c+idxInterface, lenInterface)); json_object_object_add_ex(*value, "interface", json, JSON_C_OBJECT_ADD_KEY_IS_NEW|JSON_C_OBJECT_KEY_IS_CONSTANT); } CHKN(json = json_object_new_string_len(c+idxIP, lenIP)); json_object_object_add_ex(*value, "ip", json, JSON_C_OBJECT_ADD_KEY_IS_NEW|JSON_C_OBJECT_KEY_IS_CONSTANT); CHKN(json = json_object_new_string_len(c+idxPort, lenPort)); json_object_object_add_ex(*value, "port", json, JSON_C_OBJECT_ADD_KEY_IS_NEW|JSON_C_OBJECT_KEY_IS_CONSTANT); if(bHaveIP2) { CHKN(json = json_object_new_string_len(c+idxIP2, lenIP2)); json_object_object_add_ex(*value, "ip2", json, JSON_C_OBJECT_ADD_KEY_IS_NEW|JSON_C_OBJECT_KEY_IS_CONSTANT); CHKN(json = json_object_new_string_len(c+idxPort2, lenPort2)); json_object_object_add_ex(*value, "port2", json, JSON_C_OBJECT_ADD_KEY_IS_NEW|JSON_C_OBJECT_KEY_IS_CONSTANT); } if(bHaveUser) { CHKN(json = json_object_new_string_len(c+idxUser, lenUser)); json_object_object_add_ex(*value, "user", json, JSON_C_OBJECT_ADD_KEY_IS_NEW|JSON_C_OBJECT_KEY_IS_CONSTANT); } success: /* success, persist */ *parsed = i - *offs; r = 0; /* success */ done: if(r != 0 && value != NULL && *value != NULL) { json_object_put(*value); *value = NULL; /* to be on the save side */ } return r; } /** * Parse a duration. 
A duration is similar to a timestamp, except that * it tells about time elapsed. As such, hours can be larger than 23 * and hours may also be specified by a single digit (this, for example, * is commonly done in Cisco software). * Note: we do manual loop unrolling -- this is fast AND efficient. */ PARSER(Duration) const char *c; size_t i; assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); c = str; i = *offs; /* hour is a bit tricky */ if(!isdigit(c[i])) goto done; ++i; if(isdigit(c[i])) ++i; if(c[i] == ':') ++i; else goto done; if(i+5 > strLen) goto done;/* if it is not 5 chars from here, it can't be us */ if(c[i] < '0' || c[i] > '5') goto done; if(!isdigit(c[i+1])) goto done; if(c[i+2] != ':') goto done; if(c[i+3] < '0' || c[i+3] > '5') goto done; if(!isdigit(c[i+4])) goto done; /* success, persist */ *parsed = (i + 5) - *offs; r = 0; /* success */ done: return r; } /** * Parse a timestamp in 24hr format (exactly HH:MM:SS). * Note: we do manual loop unrolling -- this is fast AND efficient. * rgerhards, 2011-01-14 */ PARSER(Time24hr) const char *c; size_t i; assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); c = str; i = *offs; if(*offs+8 > strLen) goto done; /* if it is not 8 chars, it can't be us */ /* hour */ if(c[i] == '0' || c[i] == '1') { if(!isdigit(c[i+1])) goto done; } else if(c[i] == '2') { if(c[i+1] < '0' || c[i+1] > '3') goto done; } else { goto done; } /* TODO: the code below is a duplicate of 24hr parser - create common function */ if(c[i+2] != ':') goto done; if(c[i+3] < '0' || c[i+3] > '5') goto done; if(!isdigit(c[i+4])) goto done; if(c[i+5] != ':') goto done; if(c[i+6] < '0' || c[i+6] > '5') goto done; if(!isdigit(c[i+7])) goto done; /* success, persist */ *parsed = 8; r = 0; /* success */ done: return r; } /** * Parse a timestamp in 12hr format (exactly HH:MM:SS). * Note: we do manual loop unrolling -- this is fast AND efficient. * TODO: the code below is a duplicate of 24hr parser - create common function? * rgerhards, 2011-01-14 */ PARSER(Time12hr) const char *c; size_t i; assert(str != NULL); assert(offs != NULL); assert(parsed != NULL); c = str; i = *offs; if(*offs+8 > strLen) goto done; /* if it is not 8 chars, it can't be us */ /* hour */ if(c[i] == '0') { if(!isdigit(c[i+1])) goto done; } else if(c[i] == '1') { if(c[i+1] < '0' || c[i+1] > '2') goto done; } else { goto done; } if(c[i+2] != ':') goto done; if(c[i+3] < '0' || c[i+3] > '5') goto done; if(!isdigit(c[i+4])) goto done; if(c[i+5] != ':') goto done; if(c[i+6] < '0' || c[i+6] > '5') goto done; if(!isdigit(c[i+7])) goto done; /* success, persist */ *parsed = 8; r = 0; /* success */ done: return r; } /* helper to IPv4 address parser, checks the next set of numbers. * Syntax 1 to 3 digits, value together not larger than 255. * @param[in] str parse buffer * @param[in/out] offs offset into buffer, updated if successful * @return 0 if OK, 1 otherwise */ static int chkIPv4AddrByte(const char *str, size_t strLen, size_t *offs) { int val = 0; int r = 1; /* default: done -- simplifies things */ const char *c; size_t i = *offs; c = str; if(i == strLen || !isdigit(c[i])) goto done; val = c[i++] - '0'; if(i < strLen && isdigit(c[i])) { val = val * 10 + c[i++] - '0'; if(i < strLen && isdigit(c[i])) val = val * 10 + c[i++] - '0'; } if(val > 255) /* cannot be a valid IP address byte! */ goto done; *offs = i; r = 0; done: return r; } /** * Parser for IPv4 addresses. 
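 * Accepts dotted-quad notation; each of the four bytes must be in the range 0..255 (e.g. 192.168.1.1).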
 */
PARSER(IPv4)
	const char *c;
	size_t i;

	assert(str != NULL);
	assert(offs != NULL);
	assert(parsed != NULL);
	i = *offs;
	if(i + 7 > strLen) {
		/* IPv4 addr requires at least 7 characters */
		goto done;
	}
	c = str;

	/* byte 1 */
	if(chkIPv4AddrByte(str, strLen, &i) != 0) goto done;
	if(i == strLen || c[i++] != '.') goto done;
	/* byte 2 */
	if(chkIPv4AddrByte(str, strLen, &i) != 0) goto done;
	if(i == strLen || c[i++] != '.') goto done;
	/* byte 3 */
	if(chkIPv4AddrByte(str, strLen, &i) != 0) goto done;
	if(i == strLen || c[i++] != '.') goto done;
	/* byte 4 - we do NOT need any char behind it! */
	if(chkIPv4AddrByte(str, strLen, &i) != 0) goto done;

	/* if we reach this point, we found a valid IP address */
	*parsed = i - *offs;
	r = 0; /* success */
done:
	return r;
}

/* skip past the IPv6 address block, parse pointer is set to
 * first char after the block. Returns an error if already at end
 * of string.
 * @param[in] str parse buffer
 * @param[in/out] offs offset into buffer, updated if successful
 * @return 0 if OK, 1 otherwise
 */
static int
skipIPv6AddrBlock(const char *const __restrict__ str,
	const size_t strLen,
	size_t *const __restrict__ offs)
{
	int j;
	if(*offs == strLen)
		return 1;

	for(j = 0 ; j < 4 && *offs+j < strLen && isxdigit(str[*offs+j]) ; ++j)
		/* just skip */ ;

	*offs += j;
	return 0;
}

/**
 * Parser for IPv6 addresses.
 * Based on RFC4291 Section 2.2. The address must be followed
 * by whitespace or end-of-string, else it is not considered
 * a valid address. This prevents false positives.
 */
PARSER(IPv6)
	const char *c;
	size_t i;
	size_t beginBlock; /* last block begin in case we need IPv4 parsing */
	int hasIPv4 = 0;
	int nBlocks = 0; /* how many blocks did we already have? */
	int bHad0Abbrev = 0; /* :: already used? */

	assert(str != NULL);
	assert(offs != NULL);
	assert(parsed != NULL);
	i = *offs;
	if(i + 2 > strLen) {
		/* IPv6 addr requires at least 2 characters ("::") */
		goto done;
	}
	c = str;

	/* check that first block is non-empty */
	if(! ( isxdigit(c[i]) || (c[i] == ':' && c[i+1] == ':') ) )
		goto done;

	/* try for all potential blocks plus one more (so we see errors!) */
	for(int j = 0 ; j < 9 ; ++j) {
		beginBlock = i;
		if(skipIPv6AddrBlock(str, strLen, &i) != 0) goto done;
		nBlocks++;
		if(i == strLen) goto chk_ok;
		if(isspace(c[i])) goto chk_ok;
		if(c[i] == '.'){ /* IPv4 processing! */
			hasIPv4 = 1;
			break;
		}
		if(c[i] != ':')
			goto done;
		i++; /* "eat" ':' */
		if(i == strLen) goto chk_ok;
		/* check for :: */
		if(bHad0Abbrev) {
			if(c[i] == ':')
				goto done;
		} else {
			if(c[i] == ':') {
				bHad0Abbrev = 1;
				++i;
				if(i == strLen) goto chk_ok;
			}
		}
	}

	if(hasIPv4) {
		size_t ipv4_parsed;
		--nBlocks;
		/* prevent a pure IPv4 address from being recognized */
		if(beginBlock == *offs) goto done;
		i = beginBlock;
		if(ln_parseIPv4(str, strLen, &i, node, &ipv4_parsed, NULL) != 0)
			goto done;
		i += ipv4_parsed;
	}

chk_ok:	/* we are finished parsing, check if things are ok */
	if(nBlocks > 8) goto done;
	if(bHad0Abbrev && nBlocks >= 8) goto done;
	/* now check if trailing block is missing. Note that i is already
	 * on next character, so we need to go two back. Two are always
	 * present, else we would not reach this code here.
	 */
	if(c[i-1] == ':' && c[i-2] != ':') goto done;

	/* if we reach this point, we found a valid IP address */
	*parsed = i - *offs;
	r = 0; /* success */
done:
	return r;
}

/* check if a char is valid inside a name of the iptables motif.
 * We try to keep the set as slim as possible, because the iptables
 * parser may otherwise create a very broad match (especially the
 * inclusion of simple words like "DF" causes grief here).
 * Note: we have taken the permitted set from iptables log samples.
 * Report bugs if we missed some additional rules.
 */
static inline int isValidIPTablesNameChar(const char c)
{
	/* right now, upper case only is valid */
	return ('A' <= c && c <= 'Z') ? 1 : 0;
}

/* helper to iptables parser, parses out a single name=value pair */
static int
parseIPTablesNameValue(const char *const __restrict__ str,
	const size_t strLen,
	size_t *const __restrict__ offs,
	struct json_object *const __restrict__ valroot)
{
	int r = LN_WRONGPARSER;
	size_t i = *offs;
	char *name = NULL;

	const size_t iName = i;
	while(i < strLen && isValidIPTablesNameChar(str[i]))
		++i;
	if(i == iName || (i < strLen && str[i] != '=' && str[i] != ' '))
		goto done; /* no name at all! */
	const ssize_t lenName = i - iName;

	ssize_t iVal = -1;
	size_t lenVal = 0;
	if(i < strLen && str[i] != ' ') {
		/* we have a real value (not just a flag name like "DF") */
		++i; /* skip '=' */
		iVal = i;
		while(i < strLen && !isspace(str[i]))
			++i;
		lenVal = i - iVal;
	}

	/* parsing OK */
	*offs = i;
	r = 0;
	if(valroot == NULL)
		goto done;

	CHKN(name = malloc(lenName+1));
	memcpy(name, str+iName, lenName);
	name[lenName] = '\0';
	json_object *json;
	if(iVal == -1) {
		json = NULL;
	} else {
		CHKN(json = json_object_new_string_len(str+iVal, lenVal));
	}
	json_object_object_add(valroot, name, json);
done:
	free(name);
	return r;
}

/**
 * Parser for iptables logs (the structured part).
 * This parser is named "v2-iptables" because of a traditional
 * parser named "iptables", which we do not want to replace, at
 * least right now (we may re-think this before the first release).
 * For performance reasons, this works in two stages. In the first
 * stage, we only detect if the motif is correct. The second stage is
 * only called when we know it is. In it, we go over the message
 * again and actually extract the data. This is done because
 * data extraction is relatively expensive and in most cases we will
 * have much more frequent mismatches than matches.
 * Note that this motif must have at least two fields, otherwise it
 * could mistake things that are not iptables for an iptables message.
 * Further limits may be imposed in the future as we see additional need.
 * added 2015-04-30 rgerhards
 */
PARSER(v2IPTables)
	size_t i = *offs;
	int nfields = 0;

	/* stage one */
	while(i < strLen) {
		CHKR(parseIPTablesNameValue(str, strLen, &i, NULL));
		++nfields;
		/* exactly one SP is permitted between fields */
		if(i < strLen && str[i] == ' ')
			++i;
	}

	if(nfields < 2) {
		FAIL(LN_WRONGPARSER);
	}

	/* success, persist */
	*parsed = i - *offs;
	r = 0;

	/* stage two */
	if(value == NULL)
		goto done;

	i = *offs;
	CHKN(*value = json_object_new_object());
	while(i < strLen) {
		CHKR(parseIPTablesNameValue(str, strLen, &i, *value));
		while(i < strLen && isspace(str[i]))
			++i;
	}

done:
	if(r != 0 && value != NULL && *value != NULL) {
		json_object_put(*value);
		*value = NULL;
	}
	return r;
}

/**
 * Parse JSON. This parser tries to find JSON data inside a message.
 * If it finds valid JSON, it will extract it. Extra data after the
 * JSON is permitted.
 * Note: the json-c JSON parser treats whitespace after the actual
 * json as part of the json. So in essence, any such whitespace is
 * processed by this parser. We use the same semantics to keep things
 * neatly in sync. If json-c changes for some reason or we switch to
 * an alternate json lib, we need to be sure to keep that behaviour,
 * probably by emulating it.
 * added 2015-04-28 by rgerhards, v1.1.2
 */
PARSER(JSON)
	const size_t i = *offs;
	struct json_tokener *tokener = NULL;

	if(str[i] != '{' && str[i] != '[') {
		/* this can't be json, see RFC4627, Sect. 2
		 * see this bug in json-c:
		 * https://github.com/json-c/json-c/issues/181
		 * In any case, it's better to do this quick check,
		 * even if json-c did not have the bug because this
		 * check here is much faster than calling the parser.
		 */
		goto done;
	}

	if((tokener = json_tokener_new()) == NULL)
		goto done;

	struct json_object *const json
		= json_tokener_parse_ex(tokener, str+i, (int) (strLen - i));

	if(json == NULL)
		goto done;

	/* success, persist */
	*parsed = (i + tokener->char_offset) - *offs;
	r = 0; /* success */

	if(value == NULL) {
		json_object_put(json);
	} else {
		*value = json;
	}

done:
	if(tokener != NULL)
		json_tokener_free(tokener);
	return r;
}

/* check if a char is valid inside a name of a NameValue list
 * The set of valid characters may be extended if there is good
 * need to do so. We have selected the current set carefully, but
 * may have overlooked some cases.
 */
static inline int isValidNameChar(const char c)
{
	return (isalnum(c) || c == '.' || c == '_' || c == '-') ? 1 : 0;
}

/* helper to NameValue parser, parses out a single name=value pair
 *
 * name must be alphanumeric characters, value must be non-whitespace
 * characters, if quoted, then with symmetric quotes. Supported formats
 * - name=value
 * - name="value"
 * - name='value'
 * Note "name=" is valid and means a field with empty value.
 * TODO: so far, quote characters are not permitted WITHIN quoted values.
 */
static int
parseNameValue(const char *const __restrict__ str,
	const size_t strLen,
	size_t *const __restrict__ offs,
	struct json_object *const __restrict__ valroot)
{
	int r = LN_WRONGPARSER;
	size_t i = *offs;
	char *name = NULL;

	const size_t iName = i;
	while(i < strLen && isValidNameChar(str[i]))
		++i;
	if(i == iName || str[i] != '=')
		goto done; /* no name at all! */

	const size_t lenName = i - iName;
	++i; /* skip '=' */

	const size_t iVal = i;
	while(i < strLen && !isspace(str[i]))
		++i;
	const size_t lenVal = i - iVal;

	/* parsing OK */
	*offs = i;
	r = 0;
	if(valroot == NULL)
		goto done;

	CHKN(name = malloc(lenName+1));
	memcpy(name, str+iName, lenName);
	name[lenName] = '\0';
	json_object *json;
	CHKN(json = json_object_new_string_len(str+iVal, lenVal));
	json_object_object_add(valroot, name, json);
done:
	free(name);
	return r;
}

/**
 * Parse CEE syslog.
 * This essentially is a JSON parser, with additional restrictions:
 * The message must start with "@cee:" and json must immediately follow
 * (whitespace permitted). After the JSON, there must be no other
 * non-whitespace characters. In other words: the message must consist
 * of a single JSON object only.
 * added 2015-04-28 by rgerhards, v1.1.2
 */
PARSER(CEESyslog)
	size_t i = *offs;
	struct json_tokener *tokener = NULL;
	struct json_object *json = NULL;

	if(strLen < i + 7  || /* "@cee:{}" is minimum text */
	   str[i]   != '@' ||
	   str[i+1] != 'c' ||
	   str[i+2] != 'e' ||
	   str[i+3] != 'e' ||
	   str[i+4] != ':')
		goto done;

	/* skip whitespace */
	for(i += 5 ; i < strLen && isspace(str[i]) ; ++i)
		/* just skip */;

	if(i == strLen || str[i] != '{')
		goto done;
		/* note: we do not permit arrays in CEE mode */

	if((tokener = json_tokener_new()) == NULL)
		goto done;

	json = json_tokener_parse_ex(tokener, str+i, (int) (strLen - i));

	if(json == NULL)
		goto done;

	if(i + tokener->char_offset != strLen)
		goto done;

	/* success, persist */
	*parsed = strLen;
	r = 0; /* success */

	if(value != NULL) {
		*value = json;
		json = NULL; /* do NOT free below!
*/ } done: if(tokener != NULL) json_tokener_free(tokener); if(json != NULL) json_object_put(json); return r; } /** * Parser for name/value pairs. * On entry must point to alnum char. All following chars must be * name/value pairs delimited by whitespace up until the end of string. * For performance reasons, this works in two stages. In the first * stage, we only detect if the motif is correct. The second stage is * only called when we know it is. In it, we go once again over the * message again and actually extract the data. This is done because * data extraction is relatively expensive and in most cases we will * have much more frequent mismatches than matches. * added 2015-04-25 rgerhards */ PARSER(NameValue) size_t i = *offs; /* stage one */ while(i < strLen) { CHKR(parseNameValue(str, strLen, &i, NULL)); while(i < strLen && isspace(str[i])) ++i; } /* success, persist */ *parsed = i - *offs; r = 0; /* success */ /* stage two */ if(value == NULL) goto done; i = *offs; CHKN(*value = json_object_new_object()); while(i < strLen) { CHKR(parseNameValue(str, strLen, &i, *value)); while(i < strLen && isspace(str[i])) ++i; } /* TODO: fix mem leak if alloc json fails */ done: return r; } /** * Parse a MAC layer address. * The standard (IEEE 802) format for printing MAC-48 addresses in * human-friendly form is six groups of two hexadecimal digits, * separated by hyphens (-) or colons (:), in transmission order * (e.g. 01-23-45-67-89-ab or 01:23:45:67:89:ab ). * This form is also commonly used for EUI-64. * from: http://en.wikipedia.org/wiki/MAC_address * * This parser must start on a hex digit. * added 2015-05-04 by rgerhards, v1.1.2 */ PARSER(MAC48) size_t i = *offs; char delim; if(strLen < i + 17 || /* this motif has exactly 17 characters */ !isxdigit(str[i]) || !isxdigit(str[i+1]) ) FAIL(LN_WRONGPARSER); if(str[i+2] == ':') delim = ':'; else if(str[i+2] == '-') delim = '-'; else FAIL(LN_WRONGPARSER); /* first byte ok */ if(!isxdigit(str[i+3]) || !isxdigit(str[i+4]) || str[i+5] != delim || /* 2nd byte ok */ !isxdigit(str[i+6]) || !isxdigit(str[i+7]) || str[i+8] != delim || /* 3rd byte ok */ !isxdigit(str[i+9]) || !isxdigit(str[i+10]) || str[i+11] != delim || /* 4th byte ok */ !isxdigit(str[i+12]) || !isxdigit(str[i+13]) || str[i+14] != delim || /* 5th byte ok */ !isxdigit(str[i+15]) || !isxdigit(str[i+16]) /* 6th byte ok */ ) FAIL(LN_WRONGPARSER); /* success, persist */ *parsed = 17; r = 0; /* success */ if(value != NULL) { CHKN(*value = json_object_new_string_len(str+i, 17)); } done: return r; } /* This parses the extension value and updates the index * to point to the end of it. */ static int cefParseExtensionValue(const char *const __restrict__ str, const size_t strLen, size_t *__restrict__ iEndVal) { int r = 0; size_t i = *iEndVal; size_t iLastWordBegin; /* first find next unquoted equal sign and record begin of * last word in front of it - this is the actual end of the * current name/value pair and the begin of the next one. */ int hadSP = 0; int inEscape = 0; for(iLastWordBegin = 0 ; i < strLen ; ++i) { if(inEscape) { if(str[i] != '=' && str[i] != '\\' && str[i] != 'r' && str[i] != 'n') FAIL(LN_WRONGPARSER); inEscape = 0; } else { if(str[i] == '=') { break; } else if(str[i] == '\\') { inEscape = 1; } else if(str[i] == ' ') { hadSP = 1; } else { if(hadSP) { iLastWordBegin = i; hadSP = 0; } } } } /* Note: iLastWordBegin can never be at offset zero, because * the CEF header starts there! */ if(i < strLen) { *iEndVal = (iLastWordBegin == 0) ? 
				      i : iLastWordBegin - 1;
	} else {
		*iEndVal = i;
	}
done:
	return r;
}

/* must be positioned on first char of name, returns index
 * of end of name.
 * Note: ArcSight violates the CEF spec itself: they generate
 * leading underscores in their extension names, which are
 * definitely not alphanumeric. We still accept them...
 * They also seem to use dots.
 */
static int
cefParseName(const char *const __restrict__ str,
	const size_t strLen,
	size_t *const __restrict__ i)
{
	int r = 0;
	while(*i < strLen && str[*i] != '=') {
		if(!(isalnum(str[*i]) || str[*i] == '_' || str[*i] == '.'))
			FAIL(LN_WRONGPARSER);
		++(*i);
	}
done:
	return r;
}

/* parse CEF extensions. They are basically name=value
 * pairs with the ugly exception that values may contain
 * spaces but need NOT be quoted. Thankfully, at least
 * names are specified as being alphanumeric without spaces
 * in them. So we must add a lookahead parser to check if
 * a word is a name (and thus the beginning of a new pair) or
 * not. This is done by subroutines.
 */
static int
cefParseExtensions(const char *const __restrict__ str,
	const size_t strLen,
	size_t *const __restrict__ offs,
	json_object *const __restrict__ jroot)
{
	int r = 0;
	size_t i = *offs;
	size_t iName, lenName;
	size_t iValue, lenValue;
	char *name = NULL;
	char *value = NULL;

	while(i < strLen) {
		while(i < strLen && str[i] == ' ')
			++i;
		iName = i;
		CHKR(cefParseName(str, strLen, &i));
		if(i+1 >= strLen || str[i] != '=')
			FAIL(LN_WRONGPARSER);
		lenName = i - iName;
		++i; /* skip '=' */

		iValue = i;
		CHKR(cefParseExtensionValue(str, strLen, &i));
		lenValue = i - iValue;
		++i; /* skip past value */

		if(jroot != NULL) {
			CHKN(name = malloc(sizeof(char) * (lenName + 1)));
			memcpy(name, str+iName, lenName);
			name[lenName] = '\0';
			CHKN(value = malloc(sizeof(char) * (lenValue + 1)));
			/* copy value, unescaping as we go */
			size_t iDst = 0;
			for(size_t iSrc = 0 ; iSrc < lenValue ; ++iSrc) {
				if(str[iValue+iSrc] == '\\') {
					++iSrc; /* we know the next char must exist! */
					switch(str[iValue+iSrc]) {
					case '=':	value[iDst] = '=';
							break;
					case 'n':	value[iDst] = '\n';
							break;
					case 'r':	value[iDst] = '\r';
							break;
					case '\\':	value[iDst] = '\\';
							break;
					default:	break;
					}
				} else {
					value[iDst] = str[iValue+iSrc];
				}
				++iDst;
			}
			value[iDst] = '\0';

			json_object *json;
			CHKN(json = json_object_new_string(value));
			json_object_object_add(jroot, name, json);
			free(name); name = NULL;
			free(value); value = NULL;
		}
	}

done:
	free(name);
	free(value);
	return r;
}

/* gets a CEF header field. Must be positioned on the
 * first char after the '|' in front of field.
 * Note that '|' may be escaped as "\|", which also means
 * we need to support "\\" (see CEF spec for details).
 * We return the string in *val, if val is non-null. In
 * that case we allocate memory that the caller must free.
 * This is necessary because there are potentially escape
 * sequences inside the string.
 */
static int
cefGetHdrField(const char *const __restrict__ str,
	const size_t strLen,
	size_t *const __restrict__ offs,
	char **val)
{
	int r = 0;
	size_t i = *offs;
	assert(str[i] != '|');
	while(i < strLen && str[i] != '|') {
		if(str[i] == '\\') {
			++i; /* skip esc char */
			if(str[i] != '\\' && str[i] != '|')
				FAIL(LN_WRONGPARSER);
		}
		++i; /* scan to next delimiter */
	}

	if(str[i] != '|')
		FAIL(LN_WRONGPARSER);

	const size_t iBegin = *offs;
	/* success, persist */
	*offs = i + 1;

	if(val == NULL) {
		r = 0;
		goto done;
	}

	const size_t len = i - iBegin;
	CHKN(*val = malloc(len + 1));
	size_t iDst = 0;
	for(size_t iSrc = 0 ; iSrc < len ; ++iSrc) {
		if(str[iBegin+iSrc] == '\\')
			++iSrc; /* we already checked above that this is OK! */
		(*val)[iDst++] = str[iBegin+iSrc];
	}
	(*val)[iDst] = 0;
	r = 0;
done:
	return r;
}
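/* Illustrative CEF record (an assumption for documentation, not taken from
 * the original sample set):
 *   CEF:0|Vendor|Product|1.0|42|Something happened|5|src=192.0.2.1 msg=a\=b
 * The six header fields become DeviceVendor, DeviceProduct, DeviceVersion,
 * SignatureID, Name and Severity; the name=value extensions are collected
 * under "Extensions", with escape sequences such as "\=" unescaped.
 */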
/**
 * Parser for ArcSight Common Event Format (CEF) version 0.
 * added 2015-05-05 by rgerhards, v1.1.2
 */
PARSER(CEF)
	size_t i = *offs;
	char *vendor = NULL;
	char *product = NULL;
	char *version = NULL;
	char *sigID = NULL;
	char *name = NULL;
	char *severity = NULL;

	/* minimum header: "CEF:0|x|x|x|x|x|x|" --> 17 chars */
	if(strLen < i + 17 ||
	   str[i]   != 'C' ||
	   str[i+1] != 'E' ||
	   str[i+2] != 'F' ||
	   str[i+3] != ':' ||
	   str[i+4] != '0' ||
	   str[i+5] != '|')
		FAIL(LN_WRONGPARSER);

	i += 6; /* position on '|' */

	CHKR(cefGetHdrField(str, strLen, &i, (value == NULL) ? NULL : &vendor));
	CHKR(cefGetHdrField(str, strLen, &i, (value == NULL) ? NULL : &product));
	CHKR(cefGetHdrField(str, strLen, &i, (value == NULL) ? NULL : &version));
	CHKR(cefGetHdrField(str, strLen, &i, (value == NULL) ? NULL : &sigID));
	CHKR(cefGetHdrField(str, strLen, &i, (value == NULL) ? NULL : &name));
	CHKR(cefGetHdrField(str, strLen, &i, (value == NULL) ? NULL : &severity));
	++i; /* skip over terminal '|' */

	/* OK, we now know we have a good header. Now, we need
	 * to process extensions.
	 * This time, we do NOT pre-process the extensions, but rather
	 * persist them directly to JSON. This is contrary to other
	 * parsers, but as the CEF header is pretty unique, this time
	 * it is extremely unlikely that we will get a no-match during
	 * extension processing. Even if so, nothing bad happens, as
	 * the extracted data is discarded. But the regular case saves
	 * us processing time and complexity. The only time when we
	 * cannot directly process it is when the caller asks us not
	 * to persist the data. So this must be handled differently.
	 */
	size_t iBeginExtensions = i;
	CHKR(cefParseExtensions(str, strLen, &i, NULL));

	/* success, persist */
	*parsed = i - *offs;
	r = 0; /* success */

	if(value != NULL) {
		CHKN(*value = json_object_new_object());
		json_object *json;
		CHKN(json = json_object_new_string(vendor));
		json_object_object_add(*value, "DeviceVendor", json);
		CHKN(json = json_object_new_string(product));
		json_object_object_add(*value, "DeviceProduct", json);
		CHKN(json = json_object_new_string(version));
		json_object_object_add(*value, "DeviceVersion", json);
		CHKN(json = json_object_new_string(sigID));
		json_object_object_add(*value, "SignatureID", json);
		CHKN(json = json_object_new_string(name));
		json_object_object_add(*value, "Name", json);
		CHKN(json = json_object_new_string(severity));
		json_object_object_add(*value, "Severity", json);

		json_object *jext;
		CHKN(jext = json_object_new_object());
		json_object_object_add(*value, "Extensions", jext);

		i = iBeginExtensions;
		cefParseExtensions(str, strLen, &i, jext);
	}

done:
	if(r != 0 && value != NULL && *value != NULL) {
		json_object_put(*value);
		*value = NULL;
	}
	free(vendor);
	free(product);
	free(version);
	free(sigID);
	free(name);
	free(severity);
	return r;
}
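/* Illustrative Checkpoint LEA fragment (an assumption for documentation,
 * not taken from the original sample set):
 *   loc: 1023; time: 18Jun2015 10:12:41; action: accept; orig: 192.0.2.1;
 * Each "name: value;" pair becomes one JSON field, e.g. "action":"accept".
 */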
/**
 * Parser for Checkpoint LEA on-disk format.
 * added 2015-06-18 by rgerhards, v1.1.2
 */
PARSER(CheckpointLEA)
	size_t i = *offs;
	size_t iName, lenName;
	size_t iValue, lenValue;
	int foundFields = 0;
	char *name = NULL;
	char *val = NULL;

	while(i < strLen) {
		while(i < strLen && str[i] == ' ') /* skip leading SP */
			++i;
		if(i == strLen) { /* OK if just trailing space */
			if(foundFields == 0)
				FAIL(LN_WRONGPARSER);
			break; /* we are done with the loop, all processed */
		} else {
			++foundFields;
		}
		iName = i;
		/* TODO: do a stricter check? ... but we don't have a spec */
		while(i < strLen && str[i] != ':') {
			++i;
		}
		if(i+1 >= strLen || str[i] != ':')
			FAIL(LN_WRONGPARSER);
		lenName = i - iName;
		++i; /* skip ':' */

		while(i < strLen && str[i] == ' ') /* skip leading SP */
			++i;
		iValue = i;
		while(i < strLen && str[i] != ';') {
			++i;
		}
		if(i+1 > strLen || str[i] != ';')
			FAIL(LN_WRONGPARSER);
		lenValue = i - iValue;
		++i; /* skip ';' */

		if(value != NULL) {
			CHKN(name = malloc(sizeof(char) * (lenName + 1)));
			memcpy(name, str+iName, lenName);
			name[lenName] = '\0';
			CHKN(val = malloc(sizeof(char) * (lenValue + 1)));
			memcpy(val, str+iValue, lenValue);
			val[lenValue] = '\0';
			if(*value == NULL)
				CHKN(*value = json_object_new_object());
			json_object *json;
			CHKN(json = json_object_new_string(val));
			json_object_object_add(*value, name, json);
			free(name); name = NULL;
			free(val); val = NULL;
		}
	}

	/* success, persist */
	*parsed = i - *offs;
	r = 0; /* success */

done:
	free(name);
	free(val);
	if(r != 0 && value != NULL && *value != NULL) {
		json_object_put(*value);
		*value = NULL;
	}
	return r;
}
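/* Illustrative rulebase usage of the parsers above (an assumption based on
 * the general liblognorm v1 field syntax, not a statement about any shipped
 * rule file):
 *   rule=:%srcip:ipv4% %hwaddr:mac48% %payload:json%
 * Each %name:type% field in a sample dispatches to the matching ln_parse*()
 * function in this file.
 */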