Blame dwarfdump/sanitized.c

Packit cdaae3
/*
Packit cdaae3
  Copyright 2016-2017 David Anderson. All rights reserved.
Packit cdaae3
Packit cdaae3
  This program is free software; you can redistribute it and/or modify it
Packit cdaae3
  under the terms of version 2 of the GNU General Public License as
Packit cdaae3
  published by the Free Software Foundation.
Packit cdaae3
Packit cdaae3
  This program is distributed in the hope that it would be useful, but
Packit cdaae3
  WITHOUT ANY WARRANTY; without even the implied warranty of
Packit cdaae3
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
Packit cdaae3
Packit cdaae3
  Further, this software is distributed without any warranty that it is
Packit cdaae3
  free of the rightful claim of any third person regarding infringement
Packit cdaae3
  or the like.  Any license provided herein, whether implied or
Packit cdaae3
  otherwise, applies only to this software file.  Patent licenses, if
Packit cdaae3
  any, provided herein do not apply to combinations of this program with
Packit cdaae3
  other software, or any other product whatsoever.
Packit cdaae3
Packit cdaae3
  You should have received a copy of the GNU General Public License along
Packit cdaae3
  with this program; if not, write the Free Software Foundation, Inc., 51
Packit cdaae3
  Franklin Street - Fifth Floor, Boston MA 02110-1301, USA.
Packit cdaae3
Packit cdaae3
*/
Packit cdaae3
Packit cdaae3
#include "globals.h"
Packit cdaae3
#include "naming.h"
Packit cdaae3
#include "dwconf.h"
Packit cdaae3
#include "esb.h"
Packit cdaae3
Packit cdaae3
/*  This does a uri-style conversion of control characters.
Packit cdaae3
    So  SOH prints as %01 for example.
Packit cdaae3
    Which stops corrupted or crafted strings from
Packit cdaae3
    doing things to the terminal the string is routed to.
Packit cdaae3
Packit cdaae3
    We do not translate an input % to %% (as in real uri)
Packit cdaae3
    as that would be a bit confusing for most readers.
Packit cdaae3
Packit cdaae3
    The conversion makes it possible to print UTF-8 strings
Packit cdaae3
    reproducibly, sort of (not showing the
Packit cdaae3
    real glyph!).
Packit cdaae3
Packit cdaae3
    Only call this in a printf or sprintf, and
Packit cdaae3
    only call it once in any single printf/sprintf.
Packit cdaae3
    Otherwise you will get bogus results and confusion. */
Packit cdaae3
Packit cdaae3
/* ASCII control codes:
Packit cdaae3
We leave newline as is, NUL is end of string,
Packit cdaae3
the others are translated.
Packit cdaae3
NUL Null             0  00              Ctrl-@ ^@
Packit cdaae3
SOH Start of heading 1  01      Alt-1   Ctrl-A ^A
Packit cdaae3
STX Start of text    2  02      Alt-2   Ctrl-B ^B
Packit cdaae3
ETX End of text	     3  03      Alt-3   Ctrl-C ^C
Packit cdaae3
EOT End of transmission	4 04    Alt-4   Ctrl-D ^D
Packit cdaae3
ENQ Enquiry          5    05    Alt-5   Ctrl-E ^E
Packit cdaae3
ACK Acknowledge	     6    06    Alt-6   Ctrl-F ^F
Packit cdaae3
BEL Bell             7    07    Alt-7   Ctrl-G ^G
Packit cdaae3
BS  Backspace        8    08    Alt-8   Ctrl-H ^H
Packit cdaae3
HT  Horizontal tab   9    09    Alt-9   Ctrl-I ^I
Packit cdaae3
LF  Line feed       10    0A    Alt-10  Ctrl-J ^J
Packit cdaae3
VT  Vertical tab    11    0B    Alt-11  Ctrl-K ^K
Packit cdaae3
FF  Form feed       12    0C    Alt-12  Ctrl-L ^L
Packit cdaae3
CR  Carriage return 13    0D    Alt-13  Ctrl-M ^M
Packit cdaae3
SO  Shift out       14    0E    Alt-14  Ctrl-N ^N
Packit cdaae3
SI  Shift in        15    0F    Alt-15	Ctrl-O ^O
Packit cdaae3
DLE Data line escape 16   10    Alt-16  Ctrl-P ^P
Packit cdaae3
DC1 Device control 1 17   11    Alt-17  Ctrl-Q ^Q
Packit cdaae3
DC2 Device control 2 18   12    Alt-18  Ctrl-R ^R
Packit cdaae3
DC3 Device control 3 19   13    Alt-19  Ctrl-S ^S
Packit cdaae3
DC4 Device control 4 20   14    Alt-20  Ctrl-T ^T
Packit cdaae3
NAK Negative acknowledge 21 15  Alt-21  Ctrl-U ^U
Packit cdaae3
SYN Synchronous idle 22   16    Alt-22  Ctrl-V ^V
Packit cdaae3
ETB End transmission block 23 17 Alt-23 Ctrl-W ^W
Packit cdaae3
CAN Cancel              24 18   Alt-24  Ctrl-X ^X
Packit cdaae3
EM  End of medium       25 19   Alt-25  Ctrl-Y ^Y
Packit cdaae3
SU  Substitute          26 1A   Alt-26  Ctrl-Z ^Z
Packit cdaae3
ES  Escape              27 1B   Alt-27  Ctrl-[ ^[
Packit cdaae3
FS  File separator      28 1C   Alt-28  Ctrl-\ ^\
Packit cdaae3
GS  Group separator     29 1D   Alt-29  Ctrl-] ^]
Packit cdaae3
RS  Record separator    30 1E   Alt-30  Ctrl-^ ^^
Packit cdaae3
US  Unit separator      31 1F   Alt-31  Ctrl-_ ^_
Packit cdaae3
Packit cdaae3
In addition,  characters decimal 141, 157, 127,128, 129
Packit cdaae3
143,144,157
Packit cdaae3
appear to be questionable too.
Packit cdaae3
Not in iso-8859-1 nor in html character entities list.
Packit cdaae3
Packit cdaae3
We translate all strings with a % to do sanitizing and
Packit cdaae3
we change a literal ASCII '%' char (0x25) to %25 so readers
Packit cdaae3
know any % is a sanitized char. We could double up
Packit cdaae3
a % into %% on output, but switching to %25 is simpler
Packit cdaae3
for readers and prevents ambiguity.
Packit cdaae3
Packit cdaae3
Since we do not handle utf-8 properly nor detect it
Packit cdaae3
we turn all non-ASCII to %xx below.
Packit cdaae3
*/
Packit cdaae3
Packit cdaae3
/*  Output buffer shared by every sanitized() call in this
    file: sanitized() empties it, fills it, and returns a
    pointer into it, and sanitized_string_destructor()
    hands it to esb_destructor(). */
static struct esb_s localesb = {0,0,0};
Packit cdaae3
Packit cdaae3
/*  Truth values returned by no_questionable_chars() and
    used to initialize the flag below. */
#define FALSE 0
Packit cdaae3
#define TRUE 1
Packit cdaae3
/*  When this is non-zero sanitized() returns its argument
    untouched, i.e. no sanitizing is done at all. */
boolean no_sanitize_string_garbage = FALSE;
Packit cdaae3
Packit cdaae3
/*  Formats the low eight bits of c as a three character
    "%xx" escape (two lower-case hex digits) and returns
    a pointer to it.

    The result lives in a static buffer that the next call
    overwrites.  That is acceptable because the function is
    file-local and its callers copy the text out at once. */
static const char *
as_number(int c)
{
    static const char hexdigits[] = "0123456789abcdef";
    static char encoded[4];
    unsigned byte = (unsigned)c & 0xffu;

    encoded[0] = '%';
    encoded[1] = hexdigits[byte >> 4];
    encoded[2] = hexdigits[byte & 0x0fu];
    encoded[3] = '\0';
    return encoded;
}
Packit cdaae3
Packit cdaae3
/*  do_sanity_insert() and no_questionable_chars()
    absolutely must have the same idea of
    questionable characters.  Be Careful.

    Appends a sanitized copy of the NUL-terminated string s
    to mesb, one input byte at a time:
    -   '%' is written as its %xx escape so that every '%'
        in the output introduces an escape.
    -   Other printable ASCII (0x20 through 0x7e) is copied.
    -   Newline and tab are copied, since
        no_questionable_chars() accepts them as harmless.
    -   With _WIN32 defined, carriage return is copied.
    -   Every other byte (remaining control characters,
        DEL, and anything with the high bit set) is written
        as %xx.  */
static void
do_sanity_insert( const char *s,struct esb_s *mesb)
{
    const char *cp = s;

    for( ; *cp; cp++) {
        /*  Mask so a signed char never yields a value
            outside 0..255. */
        unsigned c = *cp & 0xff ;

        if (c == '%') {
            /*  %xx for this too. Simple and unambiguous.
                This test has to precede the printable
                range test below: '%' is 0x25, inside that
                range, so testing it later is dead code. */
            esb_append(mesb,as_number(c));
            continue;
        }
        if (c >= 0x20 && c <=0x7e) {
            /* Usual case, ASCII printable characters. */
            esb_appendn(mesb,cp,1);
            continue;
        }
        if (c == 0x0A || c == 0x09) {
            /*  Newline and tab stay as they are, matching
                what no_questionable_chars() lets through. */
            esb_appendn(mesb,cp,1);
            continue;
        }
#ifdef _WIN32
        if (c == 0x0D) {
            esb_appendn(mesb,cp,1);
            continue;
        }
#endif /* _WIN32 */
        /*  All that is left is c < 0x20 (control
            characters) or c >= 0x7f (DEL, ISO-8859 or
            UTF-8 bytes, not handled well yet). */
        esb_append(mesb,as_number(c));
    }
}
Packit cdaae3
Packit cdaae3
/*  This routine improves overall dwarfdump
Packit cdaae3
    run times a lot by separating strings
Packit cdaae3
    that might print badly from strings that
Packit cdaae3
    will print fine.
Packit cdaae3
    In one large test case it reduces run time
Packit cdaae3
    from 140 seconds to 13 seconds. */
Packit cdaae3
static int
Packit cdaae3
no_questionable_chars(const char *s) {
Packit cdaae3
    const char *cp = s;
Packit cdaae3
Packit cdaae3
    for( ; *cp; cp++) {
Packit cdaae3
        unsigned c = *cp & 0xff ;
Packit cdaae3
        if (c >= 0x20 && c <=0x7e) {
Packit cdaae3
            /* Usual case, ASCII printable characters */
Packit cdaae3
            continue;
Packit cdaae3
        }
Packit cdaae3
#ifdef _WIN32
Packit cdaae3
        if (c == 0x0D) {
Packit cdaae3
            continue;
Packit cdaae3
        }
Packit cdaae3
#endif /* _WIN32 */
Packit cdaae3
        if (c == 0x0A || c == 0x09 ) {
Packit cdaae3
            continue;
Packit cdaae3
        }
Packit cdaae3
        if (c == '%') {
Packit cdaae3
            /* Always sanitize a % ASCII char. */
Packit cdaae3
            return FALSE;
Packit cdaae3
        }
Packit cdaae3
        if (c < 0x20) {
Packit cdaae3
            return FALSE;
Packit cdaae3
        }
Packit cdaae3
        if (c >= 0x7f) {
Packit cdaae3
            /*  This notices iso-8859 and UTF-8
Packit cdaae3
                data as we don't deal with them
Packit cdaae3
                properly in dwarfdump. */
Packit cdaae3
            return FALSE;
Packit cdaae3
        }
Packit cdaae3
    }
Packit cdaae3
    return TRUE;
Packit cdaae3
}
Packit cdaae3
Packit cdaae3
void
Packit cdaae3
sanitized_string_destructor(void)
Packit cdaae3
{
Packit cdaae3
    esb_destructor(&localesb);
Packit cdaae3
}
Packit cdaae3
Packit cdaae3
const char *
Packit cdaae3
sanitized(const char *s)
Packit cdaae3
{
Packit cdaae3
    const char *sout = 0;
Packit cdaae3
Packit cdaae3
    if (no_sanitize_string_garbage) {
Packit cdaae3
        return s;
Packit cdaae3
    }
Packit cdaae3
    if (no_questionable_chars(s)) {
Packit cdaae3
        /*  The original string is safe as is. */
Packit cdaae3
        return s;
Packit cdaae3
    }
Packit cdaae3
    /*  Using esb_destructor is quite expensive in cpu time
Packit cdaae3
        when we build the next sanitized string
Packit cdaae3
        so we just empty the localesb.
Packit cdaae3
        One reason it's expensive is that we do the appends
Packit cdaae3
        in such small batches in do_sanity-insert().
Packit cdaae3
        */
Packit cdaae3
    esb_empty_string(&localesb);
Packit cdaae3
    do_sanity_insert(s,&localesb);
Packit cdaae3
    sout = esb_get_string(&localesb);
Packit cdaae3
    return sout;
Packit cdaae3
}