/* lws-tokenize.h (10780B) */
/*
 * libwebsockets - small server side websockets and web server implementation
 *
 * Copyright (C) 2010 - 2019 Andy Green <andy@warmcat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/*
 * Option flags for lws_tokenize_t.flags.  Each flag occupies its own bit, so
 * they may be ORed together.
 */

/* Do not treat - as a terminal character, so "my-token" is one token */
#define LWS_TOKENIZE_F_MINUS_NONTERM	(1 << 0)
/* Separately report aggregate colon-delimited tokens */
#define LWS_TOKENIZE_F_AGG_COLON	(1 << 1)
/* Enforce sequencing for a simple token , token , token ... list */
#define LWS_TOKENIZE_F_COMMA_SEP_LIST	(1 << 2)
/* Allow more characters in the tokens and fewer delimiters... default is
 * only alphanumeric + underscore in tokens */
#define LWS_TOKENIZE_F_RFC7230_DELIMS	(1 << 3)
/* Do not treat . as a terminal character, so "warmcat.com" is one token */
#define LWS_TOKENIZE_F_DOT_NONTERM	(1 << 4)
/* If something starts looking like a float, like 1.2, force to be string token.
 * This lets you receive dotted-quads like 192.168.0.1 as string tokens, and
 * avoids illegal float format detection like 1.myserver.com */
#define LWS_TOKENIZE_F_NO_FLOATS	(1 << 5)
/* Instead of LWS_TOKZE_INTEGER, report integers as any other string token */
#define LWS_TOKENIZE_F_NO_INTEGERS	(1 << 6)
/* # makes the rest of the line a comment */
#define LWS_TOKENIZE_F_HASH_COMMENT	(1 << 7)
/* Do not treat / as a terminal character, so "multipart/related" is one token */
#define LWS_TOKENIZE_F_SLASH_NONTERM	(1 << 8)
/* Do not treat * as a terminal character, so "myfile*" is one token */
#define LWS_TOKENIZE_F_ASTERISK_NONTERM	(1 << 9)
/* Do not treat = as a terminal character, so "x=y" is one token */
#define LWS_TOKENIZE_F_EQUALS_NONTERM	(1 << 10)

/*
 * Result codes returned by lws_tokenize(): errors are negative, end of input
 * is 0 and identified elements are positive.
 */

typedef enum {

	LWS_TOKZE_ERRS			=  5, /* the number of errors defined */

	LWS_TOKZE_ERR_BROKEN_UTF8	= -5,	/* malformed or partial utf8 */
	LWS_TOKZE_ERR_UNTERM_STRING	= -4,	/* ended while we were in "" */
	LWS_TOKZE_ERR_MALFORMED_FLOAT	= -3,	/* like 0..1 or 0.1.1 */
	LWS_TOKZE_ERR_NUM_ON_LHS	= -2,	/* like 123= or 0.1= */
	LWS_TOKZE_ERR_COMMA_LIST	= -1,	/* like ",tok", or, "tok,," */

	LWS_TOKZE_ENDED = 0,		/* no more content */

	/* Note: results have ordinal 1+, EOT is 0 and errors are < 0 */

	LWS_TOKZE_DELIMITER,		/* a delimiter appeared */
	LWS_TOKZE_TOKEN,		/* a token appeared */
	LWS_TOKZE_INTEGER,		/* an integer appeared */
	LWS_TOKZE_FLOAT,		/* a float appeared */
	LWS_TOKZE_TOKEN_NAME_EQUALS,	/* token [whitespace] = */
	LWS_TOKZE_TOKEN_NAME_COLON,	/* token [whitespace] : (only with
					   LWS_TOKENIZE_F_AGG_COLON flag) */
	LWS_TOKZE_QUOTED_STRING,	/* "*", where * may have any char */

} lws_tokenize_elem;

/*
 * helper enums to allow caller to enforce legal delimiter sequencing, eg
 * disallow "token,,token", "token,", and ",token"
 */

enum lws_tokenize_delimiter_tracking {
	LWSTZ_DT_NEED_FIRST_CONTENT,
	LWSTZ_DT_NEED_DELIM,
	LWSTZ_DT_NEED_NEXT_CONTENT,
};

typedef struct lws_tokenize {
	const char *start; /**< set to the start of the string to tokenize */
	const char *token; /**< the start of an identified token or delimiter */
	size_t len;	/**< set to the length of the string to tokenize */
	size_t token_len;	/**< the length of the identified token or delimiter */

	uint16_t flags;	/**< optional LWS_TOKENIZE_F_ flags, or 0 */
	uint8_t delim;	/**< delimiter sequencing state; lws_tokenize_init()
			 *   sets it to LWSTZ_DT_NEED_FIRST_CONTENT */

	int8_t e; /**< convenient for storing lws_tokenize return */
} lws_tokenize_t;

/**
 * lws_tokenize_init() - prepare a tokenize struct for a string
 *
 * \param ts: the lws_tokenize struct to init
 * \param start: the string to tokenize
 * \param flags: LWS_TOKENIZE_F_ option flags
 *
 * This initializes the tokenize struct to point to the given string, and
 * sets the length to 2GiB - 1 (so there must be a terminating NUL)... you can
 * override this requirement by setting ts.len yourself before using it.
 *
 * .delim is also initialized to LWSTZ_DT_NEED_FIRST_CONTENT.
 */

LWS_VISIBLE LWS_EXTERN void
lws_tokenize_init(struct lws_tokenize *ts, const char *start, int flags);

/**
 * lws_tokenize() - breaks down a string into tokens and delimiters in-place
 *
 * \param ts: the lws_tokenize struct with information and state on what to do
 *
 * The \p ts struct should have its start, len and flags members initialized to
 * reflect the string to be tokenized and any options.
 *
 * Then `lws_tokenize()` may be called repeatedly on the struct, returning one
 * of `lws_tokenize_elem` each time, and with the struct's `token` and
 * `token_len` members set to describe the content of the delimiter or token
 * payload each time.
 *
 * There are no allocations during the process.
 *
 * returns lws_tokenize_elem that was identified (LWS_TOKZE_ENDED means reached
 * the end of the string).
 */

LWS_VISIBLE LWS_EXTERN lws_tokenize_elem
lws_tokenize(struct lws_tokenize *ts);

/**
 * lws_tokenize_cstr() - copy token string to NUL-terminated buffer
 *
 * \param ts: pointer to lws_tokenize struct to operate on
 * \param str: destination buffer
 * \param max: bytes in destination buffer
 *
 * returns 0 if OK or nonzero if the string + NUL won't fit.
 */

LWS_VISIBLE LWS_EXTERN int
lws_tokenize_cstr(struct lws_tokenize *ts, char *str, size_t max);


/*
 * lws_strexp: flexible string expansion helper api
 *
 * This stateful helper can handle multiple separate input chunks and multiple
 * output buffer loads with arbitrary boundaries between literals and expanded
 * symbols.  This allows it to handle fragmented input as well as arbitrarily
 * long symbol expansions that are bigger than the output buffer itself.
 *
 * A user callback is used to convert symbol names to the symbol value.
 *
 * A single byte buffer for input and another for output can process any
 * length substitution then.  The state object is around 64 bytes on a 64-bit
 * system and it only uses 8 bytes stack.
 */


typedef int (*lws_strexp_expand_cb)(void *priv, const char *name, char *out,
				    size_t *pos, size_t olen, size_t *exp_ofs);

typedef struct lws_strexp {
	char			name[32];
	lws_strexp_expand_cb	cb;
	void			*priv;
	char			*out;
	size_t			olen;
	size_t			pos;

	size_t			exp_ofs;

	uint8_t			name_pos;
	char			state;
} lws_strexp_t;

enum {
	LSTRX_DONE,			/* it completed OK */
	LSTRX_FILLED_OUT,		/* out buf filled and needs resetting */
	LSTRX_FATAL_NAME_TOO_LONG = -1,	/* fatal */
	LSTRX_FATAL_NAME_UNKNOWN  = -2,
};


/**
 * lws_strexp_init() - initialize an lws_strexp_t for use
 *
 * \param exp: the exp object to init
 * \param priv: the user's object pointer to pass to callback
 * \param cb: the callback to expand named objects
 * \param out: the start of the output buffer, or NULL just to get the length
 * \param olen: the length of the output buffer in bytes
 *
 * Prepares an lws_strexp_t for use and sets the initial output buffer
 *
 * If \p out is NULL, substitution proceeds normally, but no output is produced,
 * only the length is returned.  \p olen should be set to the largest feasible
 * overall length.  To use this mode, the substitution callback must also check
 * for NULL \p out and avoid producing the output.
 */
LWS_VISIBLE LWS_EXTERN void
lws_strexp_init(lws_strexp_t *exp, void *priv, lws_strexp_expand_cb cb,
		char *out, size_t olen);

/**
 * lws_strexp_reset_out() - reset the output buffer on an existing strexp
 *
 * \param exp: the exp object whose output buffer is being replaced
 * \param out: the start of the output buffer, or NULL to just get length
 * \param olen: the length of the output buffer in bytes
 *
 * Provides a new output buffer for lws_strexp_expand() to continue to write
 * into.  It can be the same as the old one if it has been copied out or used.
 * The position of the next write will be reset to the start of the given buf.
 *
 * If \p out is NULL, substitution proceeds normally, but no output is produced,
 * only the length is returned.  \p olen should be set to the largest feasible
 * overall length.  To use this mode, the substitution callback must also check
 * for NULL \p out and avoid producing the output.
 */
LWS_VISIBLE LWS_EXTERN void
lws_strexp_reset_out(lws_strexp_t *exp, char *out, size_t olen);

/**
 * lws_strexp_expand() - copy / expand a string into the output buffer
 *
 * \param exp: the exp object for the copy / expansion
 * \param in: the start of the next input data
 * \param len: the length of the input data
 * \param pused_in: pointer to write the amount of input used
 * \param pused_out: pointer to write the amount of output used
 *
 * Copies in to the output buffer set in exp, expanding any ${name} tokens using
 * the callback.  \p *pused_in is set to the number of input chars used and
 * \p *pused_out the number of output characters used
 *
 * May return LSTRX_FILLED_OUT early with \p *pused_in < \p len if the output
 * buffer is filled.  Handle the output buffer and reset it with
 * lws_strexp_reset_out() before calling again with adjusted in / len to
 * continue.
 *
 * In the case of large expansions, the expansion itself may fill the output
 * buffer, in which case the expansion callback returns the LSTRX_FILLED_OUT
 * and will be called again to continue with its *exp_ofs parameter set
 * appropriately.
 */
LWS_VISIBLE LWS_EXTERN int
lws_strexp_expand(lws_strexp_t *exp, const char *in, size_t len,
		  size_t *pused_in, size_t *pused_out);

/**
 * lws_strcmp_wildcard() - strcmp but the first arg can have wildcards
 *
 * \param wildcard: a string that may contain zero to three *, and may lack a NUL
 * \param wlen: length of the wildcard string
 * \param check: string to test to see if it matches wildcard
 * \param clen: length of check string
 *
 * Like strcmp, but supports patterns like "a*", "a*b", "a*b*" etc
 * where a and b are arbitrary substrings.  Both the wc and check strings need
 * not be NUL terminated, but are specified by lengths.
 *
 * NOTE(review): the return convention is not spelled out here; presumably 0
 * means a match, as with strcmp - confirm against the implementation.
 */
LWS_VISIBLE LWS_EXTERN int
lws_strcmp_wildcard(const char *wildcard, size_t wlen, const char *check,
		    size_t clen);