cachepc-qemu

Fork of AMDESE/qemu with changes for cachepc side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-qemu
Log | Files | Refs | Submodules | LICENSE | sfeed.txt

json-lexer.c (10585B)


      1/*
      2 * JSON lexer
      3 *
      4 * Copyright IBM, Corp. 2009
      5 *
      6 * Authors:
      7 *  Anthony Liguori   <aliguori@us.ibm.com>
      8 *
      9 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
     10 * See the COPYING.LIB file in the top-level directory.
     11 *
     12 */
     13
     14#include "qemu/osdep.h"
     15#include "json-parser-int.h"
     16
     17#define MAX_TOKEN_SIZE (64ULL << 20)
     18
     19/*
     20 * From RFC 8259 "The JavaScript Object Notation (JSON) Data
     21 * Interchange Format", with [comments in brackets]:
     22 *
     23 * The set of tokens includes six structural characters, strings,
     24 * numbers, and three literal names.
     25 *
     26 * These are the six structural characters:
     27 *
     28 *    begin-array     = ws %x5B ws  ; [ left square bracket
     29 *    begin-object    = ws %x7B ws  ; { left curly bracket
     30 *    end-array       = ws %x5D ws  ; ] right square bracket
     31 *    end-object      = ws %x7D ws  ; } right curly bracket
     32 *    name-separator  = ws %x3A ws  ; : colon
     33 *    value-separator = ws %x2C ws  ; , comma
     34 *
     35 * Insignificant whitespace is allowed before or after any of the six
     36 * structural characters.
     37 * [This lexer accepts it before or after any token, which is actually
     38 * the same, as the grammar always has structural characters between
     39 * other tokens.]
     40 *
     41 *    ws = *(
     42 *           %x20 /              ; Space
     43 *           %x09 /              ; Horizontal tab
     44 *           %x0A /              ; Line feed or New line
     45 *           %x0D )              ; Carriage return
     46 *
     47 * [...] three literal names:
     48 *    false null true
     49 *  [This lexer accepts [a-z]+, and leaves rejecting unknown literal
     50 *  names to the parser.]
     51 *
     52 * [Numbers:]
     53 *
     54 *    number = [ minus ] int [ frac ] [ exp ]
     55 *    decimal-point = %x2E       ; .
     56 *    digit1-9 = %x31-39         ; 1-9
     57 *    e = %x65 / %x45            ; e E
     58 *    exp = e [ minus / plus ] 1*DIGIT
     59 *    frac = decimal-point 1*DIGIT
     60 *    int = zero / ( digit1-9 *DIGIT )
     61 *    minus = %x2D               ; -
     62 *    plus = %x2B                ; +
     63 *    zero = %x30                ; 0
     64 *
     65 * [Strings:]
     66 *    string = quotation-mark *char quotation-mark
     67 *
     68 *    char = unescaped /
     69 *        escape (
     70 *            %x22 /          ; "    quotation mark  U+0022
     71 *            %x5C /          ; \    reverse solidus U+005C
     72 *            %x2F /          ; /    solidus         U+002F
     73 *            %x62 /          ; b    backspace       U+0008
     74 *            %x66 /          ; f    form feed       U+000C
     75 *            %x6E /          ; n    line feed       U+000A
     76 *            %x72 /          ; r    carriage return U+000D
     77 *            %x74 /          ; t    tab             U+0009
     78 *            %x75 4HEXDIG )  ; uXXXX                U+XXXX
     79 *    escape = %x5C              ; \
     80 *    quotation-mark = %x22      ; "
     81 *    unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
     82 *    [This lexer accepts any non-control character after escape, and
     83 *    leaves rejecting invalid ones to the parser.]
     84 *
     85 *
     86 * Extensions over RFC 8259:
     87 * - Extra escape sequence in strings:
     88 *   0x27 (apostrophe) is recognized after escape, too
     89 * - Single-quoted strings:
     90 *   Like double-quoted strings, except they're delimited by %x27
     91 *   (apostrophe) instead of %x22 (quotation mark), and can't contain
     92 *   unescaped apostrophe, but can contain unescaped quotation mark.
     93 * - Interpolation, if enabled:
     94 *   The lexer accepts %[A-Za-z0-9]*, and leaves rejecting invalid
     95 *   ones to the parser.
     96 *
     97 * Note:
     98 * - Input must be encoded in modified UTF-8.
     99 * - Decoding and validating is left to the parser.
    100 */
    101
    /*
     * States of the table-driven lexer.  Each value selects one row of
     * json_lexer[].  Numbering starts at 1 because 0 is JSON_ERROR, and
     * the two start states must stay adjacent (both facts are enforced by
     * build-time assertions).
     */
    enum json_lexer_state {
        /* skipping input after an error, looking for a resync point */
        IN_RECOVERY = 1,

        /* double-quoted string: right after a backslash / in the body */
        IN_DQ_STRING_ESCAPE,
        IN_DQ_STRING,

        /* single-quoted string: right after a backslash / in the body */
        IN_SQ_STRING_ESCAPE,
        IN_SQ_STRING,

        /* number: a leading '0' has been read */
        IN_ZERO,

        /* number: exponent digits, after the exponent sign, after 'e'/'E' */
        IN_EXP_DIGITS,
        IN_EXP_SIGN,
        IN_EXP_E,

        /* number: right after '.', then within the fraction digits */
        IN_MANTISSA,
        IN_MANTISSA_DIGITS,

        /* number: integer digits not starting with '0', and after '-' */
        IN_DIGITS,
        IN_SIGN,

        /* lower-case literal name */
        IN_KEYWORD,

        /* interpolation directive introduced by '%' */
        IN_INTERP,

        /* between tokens, interpolation disabled */
        IN_START,

        /* between tokens, interpolation enabled */
        IN_START_INTERP,            /* must be IN_START + 1 */
    };
    121
    122QEMU_BUILD_BUG_ON(JSON_ERROR != 0);
    123QEMU_BUILD_BUG_ON(IN_RECOVERY != JSON_ERROR + 1);
    124QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START_INTERP);
    125QEMU_BUILD_BUG_ON(JSON_MAX >= 0x80);
    126QEMU_BUILD_BUG_ON(IN_START_INTERP != IN_START + 1);
    127
    /*
     * Flag OR-ed into a table cell: take the transition without consuming
     * the current character, so that it is examined again in the new state.
     */
    #define LOOKAHEAD 0x80

    /*
     * Row default for states in which any character ends the token: every
     * byte value maps to @state with LOOKAHEAD set.  Designators that follow
     * it in the same row override individual characters.
     */
    #define TERMINAL(state) [0x00 ... 0xFF] = (LOOKAHEAD | (state))
    130
    131static const uint8_t json_lexer[][256] =  {
    132    /* Relies on default initialization to IN_ERROR! */
    133
    134    /* error recovery */
    135    [IN_RECOVERY] = {
    136        /*
    137         * Skip characters until a structural character, an ASCII
    138         * control character other than '\t', or impossible UTF-8
    139         * bytes '\xFE', '\xFF'.  Structural characters and line
    140         * endings are promising resynchronization points.  Clients
    141         * may use the others to force the JSON parser into known-good
    142         * state; see docs/interop/qmp-spec.txt.
    143         */
    144        [0 ... 0x1F] = IN_START | LOOKAHEAD,
    145        [0x20 ... 0xFD] = IN_RECOVERY,
    146        [0xFE ... 0xFF] = IN_START | LOOKAHEAD,
    147        ['\t'] = IN_RECOVERY,
    148        ['['] = IN_START | LOOKAHEAD,
    149        [']'] = IN_START | LOOKAHEAD,
    150        ['{'] = IN_START | LOOKAHEAD,
    151        ['}'] = IN_START | LOOKAHEAD,
    152        [':'] = IN_START | LOOKAHEAD,
    153        [','] = IN_START | LOOKAHEAD,
    154    },
    155
    156    /* double quote string */
    157    [IN_DQ_STRING_ESCAPE] = {
    158        [0x20 ... 0xFD] = IN_DQ_STRING,
    159    },
    160    [IN_DQ_STRING] = {
    161        [0x20 ... 0xFD] = IN_DQ_STRING,
    162        ['\\'] = IN_DQ_STRING_ESCAPE,
    163        ['"'] = JSON_STRING,
    164    },
    165
    166    /* single quote string */
    167    [IN_SQ_STRING_ESCAPE] = {
    168        [0x20 ... 0xFD] = IN_SQ_STRING,
    169    },
    170    [IN_SQ_STRING] = {
    171        [0x20 ... 0xFD] = IN_SQ_STRING,
    172        ['\\'] = IN_SQ_STRING_ESCAPE,
    173        ['\''] = JSON_STRING,
    174    },
    175
    176    /* Zero */
    177    [IN_ZERO] = {
    178        TERMINAL(JSON_INTEGER),
    179        ['0' ... '9'] = JSON_ERROR,
    180        ['.'] = IN_MANTISSA,
    181    },
    182
    183    /* Float */
    184    [IN_EXP_DIGITS] = {
    185        TERMINAL(JSON_FLOAT),
    186        ['0' ... '9'] = IN_EXP_DIGITS,
    187    },
    188
    189    [IN_EXP_SIGN] = {
    190        ['0' ... '9'] = IN_EXP_DIGITS,
    191    },
    192
    193    [IN_EXP_E] = {
    194        ['-'] = IN_EXP_SIGN,
    195        ['+'] = IN_EXP_SIGN,
    196        ['0' ... '9'] = IN_EXP_DIGITS,
    197    },
    198
    199    [IN_MANTISSA_DIGITS] = {
    200        TERMINAL(JSON_FLOAT),
    201        ['0' ... '9'] = IN_MANTISSA_DIGITS,
    202        ['e'] = IN_EXP_E,
    203        ['E'] = IN_EXP_E,
    204    },
    205
    206    [IN_MANTISSA] = {
    207        ['0' ... '9'] = IN_MANTISSA_DIGITS,
    208    },
    209
    210    /* Number */
    211    [IN_DIGITS] = {
    212        TERMINAL(JSON_INTEGER),
    213        ['0' ... '9'] = IN_DIGITS,
    214        ['e'] = IN_EXP_E,
    215        ['E'] = IN_EXP_E,
    216        ['.'] = IN_MANTISSA,
    217    },
    218
    219    [IN_SIGN] = {
    220        ['0'] = IN_ZERO,
    221        ['1' ... '9'] = IN_DIGITS,
    222    },
    223
    224    /* keywords */
    225    [IN_KEYWORD] = {
    226        TERMINAL(JSON_KEYWORD),
    227        ['a' ... 'z'] = IN_KEYWORD,
    228    },
    229
    230    /* interpolation */
    231    [IN_INTERP] = {
    232        TERMINAL(JSON_INTERP),
    233        ['A' ... 'Z'] = IN_INTERP,
    234        ['a' ... 'z'] = IN_INTERP,
    235        ['0' ... '9'] = IN_INTERP,
    236    },
    237
    238    /*
    239     * Two start states:
    240     * - IN_START recognizes JSON tokens with our string extensions
    241     * - IN_START_INTERP additionally recognizes interpolation.
    242     */
    243    [IN_START ... IN_START_INTERP] = {
    244        ['"'] = IN_DQ_STRING,
    245        ['\''] = IN_SQ_STRING,
    246        ['0'] = IN_ZERO,
    247        ['1' ... '9'] = IN_DIGITS,
    248        ['-'] = IN_SIGN,
    249        ['{'] = JSON_LCURLY,
    250        ['}'] = JSON_RCURLY,
    251        ['['] = JSON_LSQUARE,
    252        [']'] = JSON_RSQUARE,
    253        [','] = JSON_COMMA,
    254        [':'] = JSON_COLON,
    255        ['a' ... 'z'] = IN_KEYWORD,
    256        [' '] = IN_START,
    257        ['\t'] = IN_START,
    258        ['\r'] = IN_START,
    259        ['\n'] = IN_START,
    260    },
    261    [IN_START_INTERP]['%'] = IN_INTERP,
    262};
    263
    264static inline uint8_t next_state(JSONLexer *lexer, char ch, bool flush,
    265                                 bool *char_consumed)
    266{
    267    uint8_t next;
    268
    269    assert(lexer->state < ARRAY_SIZE(json_lexer));
    270    next = json_lexer[lexer->state][(uint8_t)ch];
    271    *char_consumed = !flush && !(next & LOOKAHEAD);
    272    return next & ~LOOKAHEAD;
    273}
    274
    275void json_lexer_init(JSONLexer *lexer, bool enable_interpolation)
    276{
    277    lexer->start_state = lexer->state = enable_interpolation
    278        ? IN_START_INTERP : IN_START;
    279    lexer->token = g_string_sized_new(3);
    280    lexer->x = lexer->y = 0;
    281}
    282
    283static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
    284{
    285    int new_state;
    286    bool char_consumed = false;
    287
    288    lexer->x++;
    289    if (ch == '\n') {
    290        lexer->x = 0;
    291        lexer->y++;
    292    }
    293
    294    while (flush ? lexer->state != lexer->start_state : !char_consumed) {
    295        new_state = next_state(lexer, ch, flush, &char_consumed);
    296        if (char_consumed) {
    297            assert(!flush);
    298            g_string_append_c(lexer->token, ch);
    299        }
    300
    301        switch (new_state) {
    302        case JSON_LCURLY:
    303        case JSON_RCURLY:
    304        case JSON_LSQUARE:
    305        case JSON_RSQUARE:
    306        case JSON_COLON:
    307        case JSON_COMMA:
    308        case JSON_INTERP:
    309        case JSON_INTEGER:
    310        case JSON_FLOAT:
    311        case JSON_KEYWORD:
    312        case JSON_STRING:
    313            json_message_process_token(lexer, lexer->token, new_state,
    314                                       lexer->x, lexer->y);
    315            /* fall through */
    316        case IN_START:
    317            g_string_truncate(lexer->token, 0);
    318            new_state = lexer->start_state;
    319            break;
    320        case JSON_ERROR:
    321            json_message_process_token(lexer, lexer->token, JSON_ERROR,
    322                                       lexer->x, lexer->y);
    323            new_state = IN_RECOVERY;
    324            /* fall through */
    325        case IN_RECOVERY:
    326            g_string_truncate(lexer->token, 0);
    327            break;
    328        default:
    329            break;
    330        }
    331        lexer->state = new_state;
    332    }
    333
    334    /* Do not let a single token grow to an arbitrarily large size,
    335     * this is a security consideration.
    336     */
    337    if (lexer->token->len > MAX_TOKEN_SIZE) {
    338        json_message_process_token(lexer, lexer->token, lexer->state,
    339                                   lexer->x, lexer->y);
    340        g_string_truncate(lexer->token, 0);
    341        lexer->state = lexer->start_state;
    342    }
    343}
    344
    345void json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
    346{
    347    size_t i;
    348
    349    for (i = 0; i < size; i++) {
    350        json_lexer_feed_char(lexer, buffer[i], false);
    351    }
    352}
    353
    354void json_lexer_flush(JSONLexer *lexer)
    355{
    356    json_lexer_feed_char(lexer, 0, true);
    357    assert(lexer->state == lexer->start_state);
    358    json_message_process_token(lexer, lexer->token, JSON_END_OF_INPUT,
    359                               lexer->x, lexer->y);
    360}
    361
    362void json_lexer_destroy(JSONLexer *lexer)
    363{
    364    g_string_free(lexer->token, true);
    365}