json-lexer.c (10585B)
1/* 2 * JSON lexer 3 * 4 * Copyright IBM, Corp. 2009 5 * 6 * Authors: 7 * Anthony Liguori <aliguori@us.ibm.com> 8 * 9 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later. 10 * See the COPYING.LIB file in the top-level directory. 11 * 12 */ 13 14#include "qemu/osdep.h" 15#include "json-parser-int.h" 16 17#define MAX_TOKEN_SIZE (64ULL << 20) 18 19/* 20 * From RFC 8259 "The JavaScript Object Notation (JSON) Data 21 * Interchange Format", with [comments in brackets]: 22 * 23 * The set of tokens includes six structural characters, strings, 24 * numbers, and three literal names. 25 * 26 * These are the six structural characters: 27 * 28 * begin-array = ws %x5B ws ; [ left square bracket 29 * begin-object = ws %x7B ws ; { left curly bracket 30 * end-array = ws %x5D ws ; ] right square bracket 31 * end-object = ws %x7D ws ; } right curly bracket 32 * name-separator = ws %x3A ws ; : colon 33 * value-separator = ws %x2C ws ; , comma 34 * 35 * Insignificant whitespace is allowed before or after any of the six 36 * structural characters. 37 * [This lexer accepts it before or after any token, which is actually 38 * the same, as the grammar always has structural characters between 39 * other tokens.] 40 * 41 * ws = *( 42 * %x20 / ; Space 43 * %x09 / ; Horizontal tab 44 * %x0A / ; Line feed or New line 45 * %x0D ) ; Carriage return 46 * 47 * [...] three literal names: 48 * false null true 49 * [This lexer accepts [a-z]+, and leaves rejecting unknown literal 50 * names to the parser.] 51 * 52 * [Numbers:] 53 * 54 * number = [ minus ] int [ frac ] [ exp ] 55 * decimal-point = %x2E ; . 56 * digit1-9 = %x31-39 ; 1-9 57 * e = %x65 / %x45 ; e E 58 * exp = e [ minus / plus ] 1*DIGIT 59 * frac = decimal-point 1*DIGIT 60 * int = zero / ( digit1-9 *DIGIT ) 61 * minus = %x2D ; - 62 * plus = %x2B ; + 63 * zero = %x30 ; 0 64 * 65 * [Strings:] 66 * string = quotation-mark *char quotation-mark 67 * 68 * char = unescaped / 69 * escape ( 70 * %x22 / ; " quotation mark U+0022 71 * %x5C / ; \ reverse solidus U+005C 72 * %x2F / ; / solidus U+002F 73 * %x62 / ; b backspace U+0008 74 * %x66 / ; f form feed U+000C 75 * %x6E / ; n line feed U+000A 76 * %x72 / ; r carriage return U+000D 77 * %x74 / ; t tab U+0009 78 * %x75 4HEXDIG ) ; uXXXX U+XXXX 79 * escape = %x5C ; \ 80 * quotation-mark = %x22 ; " 81 * unescaped = %x20-21 / %x23-5B / %x5D-10FFFF 82 * [This lexer accepts any non-control character after escape, and 83 * leaves rejecting invalid ones to the parser.] 84 * 85 * 86 * Extensions over RFC 8259: 87 * - Extra escape sequence in strings: 88 * 0x27 (apostrophe) is recognized after escape, too 89 * - Single-quoted strings: 90 * Like double-quoted strings, except they're delimited by %x27 91 * (apostrophe) instead of %x22 (quotation mark), and can't contain 92 * unescaped apostrophe, but can contain unescaped quotation mark. 93 * - Interpolation, if enabled: 94 * The lexer accepts %[A-Za-z0-9]*, and leaves rejecting invalid 95 * ones to the parser. 96 * 97 * Note: 98 * - Input must be encoded in modified UTF-8. 99 * - Decoding and validating is left to the parser. 100 */ 101 102enum json_lexer_state { 103 IN_RECOVERY = 1, 104 IN_DQ_STRING_ESCAPE, 105 IN_DQ_STRING, 106 IN_SQ_STRING_ESCAPE, 107 IN_SQ_STRING, 108 IN_ZERO, 109 IN_EXP_DIGITS, 110 IN_EXP_SIGN, 111 IN_EXP_E, 112 IN_MANTISSA, 113 IN_MANTISSA_DIGITS, 114 IN_DIGITS, 115 IN_SIGN, 116 IN_KEYWORD, 117 IN_INTERP, 118 IN_START, 119 IN_START_INTERP, /* must be IN_START + 1 */ 120}; 121 122QEMU_BUILD_BUG_ON(JSON_ERROR != 0); 123QEMU_BUILD_BUG_ON(IN_RECOVERY != JSON_ERROR + 1); 124QEMU_BUILD_BUG_ON((int)JSON_MIN <= (int)IN_START_INTERP); 125QEMU_BUILD_BUG_ON(JSON_MAX >= 0x80); 126QEMU_BUILD_BUG_ON(IN_START_INTERP != IN_START + 1); 127 128#define LOOKAHEAD 0x80 129#define TERMINAL(state) [0 ... 0xFF] = ((state) | LOOKAHEAD) 130 131static const uint8_t json_lexer[][256] = { 132 /* Relies on default initialization to IN_ERROR! */ 133 134 /* error recovery */ 135 [IN_RECOVERY] = { 136 /* 137 * Skip characters until a structural character, an ASCII 138 * control character other than '\t', or impossible UTF-8 139 * bytes '\xFE', '\xFF'. Structural characters and line 140 * endings are promising resynchronization points. Clients 141 * may use the others to force the JSON parser into known-good 142 * state; see docs/interop/qmp-spec.txt. 143 */ 144 [0 ... 0x1F] = IN_START | LOOKAHEAD, 145 [0x20 ... 0xFD] = IN_RECOVERY, 146 [0xFE ... 0xFF] = IN_START | LOOKAHEAD, 147 ['\t'] = IN_RECOVERY, 148 ['['] = IN_START | LOOKAHEAD, 149 [']'] = IN_START | LOOKAHEAD, 150 ['{'] = IN_START | LOOKAHEAD, 151 ['}'] = IN_START | LOOKAHEAD, 152 [':'] = IN_START | LOOKAHEAD, 153 [','] = IN_START | LOOKAHEAD, 154 }, 155 156 /* double quote string */ 157 [IN_DQ_STRING_ESCAPE] = { 158 [0x20 ... 0xFD] = IN_DQ_STRING, 159 }, 160 [IN_DQ_STRING] = { 161 [0x20 ... 0xFD] = IN_DQ_STRING, 162 ['\\'] = IN_DQ_STRING_ESCAPE, 163 ['"'] = JSON_STRING, 164 }, 165 166 /* single quote string */ 167 [IN_SQ_STRING_ESCAPE] = { 168 [0x20 ... 0xFD] = IN_SQ_STRING, 169 }, 170 [IN_SQ_STRING] = { 171 [0x20 ... 0xFD] = IN_SQ_STRING, 172 ['\\'] = IN_SQ_STRING_ESCAPE, 173 ['\''] = JSON_STRING, 174 }, 175 176 /* Zero */ 177 [IN_ZERO] = { 178 TERMINAL(JSON_INTEGER), 179 ['0' ... '9'] = JSON_ERROR, 180 ['.'] = IN_MANTISSA, 181 }, 182 183 /* Float */ 184 [IN_EXP_DIGITS] = { 185 TERMINAL(JSON_FLOAT), 186 ['0' ... '9'] = IN_EXP_DIGITS, 187 }, 188 189 [IN_EXP_SIGN] = { 190 ['0' ... '9'] = IN_EXP_DIGITS, 191 }, 192 193 [IN_EXP_E] = { 194 ['-'] = IN_EXP_SIGN, 195 ['+'] = IN_EXP_SIGN, 196 ['0' ... '9'] = IN_EXP_DIGITS, 197 }, 198 199 [IN_MANTISSA_DIGITS] = { 200 TERMINAL(JSON_FLOAT), 201 ['0' ... '9'] = IN_MANTISSA_DIGITS, 202 ['e'] = IN_EXP_E, 203 ['E'] = IN_EXP_E, 204 }, 205 206 [IN_MANTISSA] = { 207 ['0' ... '9'] = IN_MANTISSA_DIGITS, 208 }, 209 210 /* Number */ 211 [IN_DIGITS] = { 212 TERMINAL(JSON_INTEGER), 213 ['0' ... '9'] = IN_DIGITS, 214 ['e'] = IN_EXP_E, 215 ['E'] = IN_EXP_E, 216 ['.'] = IN_MANTISSA, 217 }, 218 219 [IN_SIGN] = { 220 ['0'] = IN_ZERO, 221 ['1' ... '9'] = IN_DIGITS, 222 }, 223 224 /* keywords */ 225 [IN_KEYWORD] = { 226 TERMINAL(JSON_KEYWORD), 227 ['a' ... 'z'] = IN_KEYWORD, 228 }, 229 230 /* interpolation */ 231 [IN_INTERP] = { 232 TERMINAL(JSON_INTERP), 233 ['A' ... 'Z'] = IN_INTERP, 234 ['a' ... 'z'] = IN_INTERP, 235 ['0' ... '9'] = IN_INTERP, 236 }, 237 238 /* 239 * Two start states: 240 * - IN_START recognizes JSON tokens with our string extensions 241 * - IN_START_INTERP additionally recognizes interpolation. 242 */ 243 [IN_START ... IN_START_INTERP] = { 244 ['"'] = IN_DQ_STRING, 245 ['\''] = IN_SQ_STRING, 246 ['0'] = IN_ZERO, 247 ['1' ... '9'] = IN_DIGITS, 248 ['-'] = IN_SIGN, 249 ['{'] = JSON_LCURLY, 250 ['}'] = JSON_RCURLY, 251 ['['] = JSON_LSQUARE, 252 [']'] = JSON_RSQUARE, 253 [','] = JSON_COMMA, 254 [':'] = JSON_COLON, 255 ['a' ... 'z'] = IN_KEYWORD, 256 [' '] = IN_START, 257 ['\t'] = IN_START, 258 ['\r'] = IN_START, 259 ['\n'] = IN_START, 260 }, 261 [IN_START_INTERP]['%'] = IN_INTERP, 262}; 263 264static inline uint8_t next_state(JSONLexer *lexer, char ch, bool flush, 265 bool *char_consumed) 266{ 267 uint8_t next; 268 269 assert(lexer->state < ARRAY_SIZE(json_lexer)); 270 next = json_lexer[lexer->state][(uint8_t)ch]; 271 *char_consumed = !flush && !(next & LOOKAHEAD); 272 return next & ~LOOKAHEAD; 273} 274 275void json_lexer_init(JSONLexer *lexer, bool enable_interpolation) 276{ 277 lexer->start_state = lexer->state = enable_interpolation 278 ? IN_START_INTERP : IN_START; 279 lexer->token = g_string_sized_new(3); 280 lexer->x = lexer->y = 0; 281} 282 283static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush) 284{ 285 int new_state; 286 bool char_consumed = false; 287 288 lexer->x++; 289 if (ch == '\n') { 290 lexer->x = 0; 291 lexer->y++; 292 } 293 294 while (flush ? lexer->state != lexer->start_state : !char_consumed) { 295 new_state = next_state(lexer, ch, flush, &char_consumed); 296 if (char_consumed) { 297 assert(!flush); 298 g_string_append_c(lexer->token, ch); 299 } 300 301 switch (new_state) { 302 case JSON_LCURLY: 303 case JSON_RCURLY: 304 case JSON_LSQUARE: 305 case JSON_RSQUARE: 306 case JSON_COLON: 307 case JSON_COMMA: 308 case JSON_INTERP: 309 case JSON_INTEGER: 310 case JSON_FLOAT: 311 case JSON_KEYWORD: 312 case JSON_STRING: 313 json_message_process_token(lexer, lexer->token, new_state, 314 lexer->x, lexer->y); 315 /* fall through */ 316 case IN_START: 317 g_string_truncate(lexer->token, 0); 318 new_state = lexer->start_state; 319 break; 320 case JSON_ERROR: 321 json_message_process_token(lexer, lexer->token, JSON_ERROR, 322 lexer->x, lexer->y); 323 new_state = IN_RECOVERY; 324 /* fall through */ 325 case IN_RECOVERY: 326 g_string_truncate(lexer->token, 0); 327 break; 328 default: 329 break; 330 } 331 lexer->state = new_state; 332 } 333 334 /* Do not let a single token grow to an arbitrarily large size, 335 * this is a security consideration. 336 */ 337 if (lexer->token->len > MAX_TOKEN_SIZE) { 338 json_message_process_token(lexer, lexer->token, lexer->state, 339 lexer->x, lexer->y); 340 g_string_truncate(lexer->token, 0); 341 lexer->state = lexer->start_state; 342 } 343} 344 345void json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size) 346{ 347 size_t i; 348 349 for (i = 0; i < size; i++) { 350 json_lexer_feed_char(lexer, buffer[i], false); 351 } 352} 353 354void json_lexer_flush(JSONLexer *lexer) 355{ 356 json_lexer_feed_char(lexer, 0, true); 357 assert(lexer->state == lexer->start_state); 358 json_message_process_token(lexer, lexer->token, JSON_END_OF_INPUT, 359 lexer->x, lexer->y); 360} 361 362void json_lexer_destroy(JSONLexer *lexer) 363{ 364 g_string_free(lexer->token, true); 365}