sfeed

Simple RSS and Atom feed parser
git clone https://git.sinitax.com/codemadness/sfeed
Log | Files | Refs | README | LICENSE | Upstream | sfeed.txt

xml.c (10012B)


      1#include <errno.h>
      2#include <stdio.h>
      3#include <stdlib.h>
      4#include <string.h>
      5
      6#include "xml.h"
      7
      /* ASCII-only character class tests that do not depend on the locale.
       * ISALPHA: OR-ing with 32 maps 'A'-'Z' onto 'a'-'z'; the unsigned
       *          subtraction wraps around for anything below 'a', so a single
       *          comparison covers both bounds.
       * ISSPACE: ' ' or one of the five consecutive characters '\t'..'\r'.
       * The argument is fully parenthesized so compound expressions are cast as
       * a whole. ISSPACE evaluates its argument twice: pass expressions without
       * side effects only. */
      #define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
      #define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
     10
     /* Parse the attributes of a start tag: read characters with GETNEXT()
      * until a '>' is consumed or EOF is reached.
      *
      * Attribute names are collected in x->name and values in x->data. For each
      * attribute the callbacks that are set are invoked in this order:
      *   xmlattrstart, then xmlattr (once per chunk of value text) and
      *   xmlattrentity (once per "&...;" sequence in the value), then xmlattrend.
      * An attribute without a value is reported with xmlattr(..., "", 0).
      * A value may be quoted with ' or "; an unquoted value ends at whitespace
      * or '>'. A '/' outside a value sets x->isshorttag. */
     11static void
     12xml_parseattrs(XMLParser *x)
     13{
       	/* namelen:    number of bytes collected in x->name so far.
       	 * endname:    the name was followed by whitespace or '='.
       	 * valuestart: a '=' was seen, the next non-space character starts the value. */
     14	size_t namelen = 0, valuelen;
     15	int c, endsep, endname = 0, valuestart = 0;
     16
     17	while ((c = GETNEXT()) != EOF) {
     18		if (ISSPACE(c)) {
       			/* whitespace after at least one name character completes the name */
     19			if (namelen)
     20				endname = 1;
     21			continue;
     22		} else if (c == '?')
     23			; /* ignore */
     24		else if (c == '=') {
     25			x->name[namelen] = '\0';
     26			valuestart = 1;
     27			endname = 1;
     28		} else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
     29			/* attribute without value */
     30			x->name[namelen] = '\0';
     31			if (x->xmlattrstart)
     32				x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
     33			if (x->xmlattr)
     34				x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
     35			if (x->xmlattrend)
     36				x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
     37			endname = 0;
       			/* c becomes the first character of the next name; when it is
       			 * '>' or '/' the checks at the end of the loop deal with it */
     38			x->name[0] = c;
     39			namelen = 1;
     40		} else if (namelen && valuestart) {
     41			/* attribute with value */
     42			if (x->xmlattrstart)
     43				x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
     44
     45			valuelen = 0;
     46			if (c == '\'' || c == '"') {
     47				endsep = c;
     48			} else {
       				/* unquoted value: c is already its first character, so
       				 * enter the loop below without reading another one */
     49				endsep = ' '; /* ISSPACE() */
     50				goto startvalue;
     51			}
     52
     53			while ((c = GETNEXT()) != EOF) {
     54startvalue:
     55				if (c == '&') { /* entities */
     56					x->data[valuelen] = '\0';
     57					/* call data function with data before entity if there is data */
     58					if (valuelen && x->xmlattr)
     59						x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
     60					x->data[0] = c;
     61					valuelen = 1;
       					/* collect the entity, including '&' and ';', in x->data */
     62					while ((c = GETNEXT()) != EOF) {
       						/* the value ended before a ';' was found: the text
       						 * collected so far is reported as value data below */
     63						if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
     64							break;
     65						if (valuelen < sizeof(x->data) - 1)
     66							x->data[valuelen++] = c;
     67						else {
     68							/* entity too long for buffer, handle as normal data */
     69							x->data[valuelen] = '\0';
     70							if (x->xmlattr)
     71								x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
     72							x->data[0] = c;
     73							valuelen = 1;
     74							break;
     75						}
     76						if (c == ';') {
     77							x->data[valuelen] = '\0';
     78							if (x->xmlattrentity)
     79								x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
     80							valuelen = 0;
     81							break;
     82						}
     83					}
     84				} else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
       					/* ordinary value character: buffer it, flushing the
       					 * buffer through xmlattr when it is full */
     85					if (valuelen < sizeof(x->data) - 1) {
     86						x->data[valuelen++] = c;
     87					} else {
     88						x->data[valuelen] = '\0';
     89						if (x->xmlattr)
     90							x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
     91						x->data[0] = c;
     92						valuelen = 1;
     93					}
     94				}
       				/* end of the value: report the remaining data (possibly
       				 * empty) and finish the attribute */
     95				if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
     96					x->data[valuelen] = '\0';
     97					if (x->xmlattr)
     98						x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
     99					if (x->xmlattrend)
    100						x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
    101					break;
    102				}
    103			}
       			/* reset the state for the next attribute */
    104			namelen = endname = valuestart = 0;
    105		} else if (namelen < sizeof(x->name) - 1) {
       			/* name character; characters that do not fit in x->name are dropped */
    106			x->name[namelen++] = c;
    107		}
       		/* '>' ends the tag; '/' marks a short tag and discards the name buffer */
    108		if (c == '>') {
    109			break;
    110		} else if (c == '/') {
    111			x->isshorttag = 1;
    112			x->name[0] = '\0';
    113			namelen = 0;
    114		}
    115	}
    116}
    117
    118static void
    119xml_parsecomment(XMLParser *x)
    120{
    121	int c, i = 0;
    122
    123	while ((c = GETNEXT()) != EOF) {
    124		if (c == '-') {
    125			if (++i > 2)
    126				i = 2;
    127			continue;
    128		} else if (c == '>' && i == 2) {
    129			return;
    130		} else if (i) {
    131			i = 0;
    132		}
    133	}
    134}
    135
    136static void
    137xml_parsecdata(XMLParser *x)
    138{
    139	size_t datalen = 0, i = 0;
    140	int c;
    141
    142	while ((c = GETNEXT()) != EOF) {
    143		if (c == ']' || c == '>') {
    144			if (x->xmlcdata && datalen) {
    145				x->data[datalen] = '\0';
    146				x->xmlcdata(x, x->data, datalen);
    147				datalen = 0;
    148			}
    149		}
    150
    151		if (c == ']') {
    152			if (++i > 2) {
    153				if (x->xmlcdata)
    154					for (; i > 2; i--)
    155						x->xmlcdata(x, "]", 1);
    156				i = 2;
    157			}
    158			continue;
    159		} else if (c == '>' && i == 2) {
    160			return;
    161		} else if (i) {
    162			if (x->xmlcdata)
    163				for (; i > 0; i--)
    164					x->xmlcdata(x, "]", 1);
    165			i = 0;
    166		}
    167
    168		if (datalen < sizeof(x->data) - 1) {
    169			x->data[datalen++] = c;
    170		} else {
    171			x->data[datalen] = '\0';
    172			if (x->xmlcdata)
    173				x->xmlcdata(x, x->data, datalen);
    174			x->data[0] = c;
    175			datalen = 1;
    176		}
    177	}
    178}
    179
    /* Write code point r to s as a UTF-8 byte sequence, without a terminating
     * NUL byte, and return the number of bytes written (1 to 4).
     * Returns 0 and writes nothing when r is 0.
     * s must have room for 4 bytes. The range of r is not checked here. */
    static int
    codepointtoutf8(long r, char *s)
    {
    	if (r == 0)
    		return 0; /* NUL byte */

    	if (r <= 0x7F) {
    		/* 0aaaaaaa */
    		s[0] = r;
    		return 1;
    	}
    	if (r <= 0x07FF) {
    		/* 110aaaaa 10bbbbbb */
    		s[0] = 0xC0 | ((r >> 6) & 0x1F);
    		s[1] = 0x80 | (r & 0x3F);
    		return 2;
    	}
    	if (r <= 0xFFFF) {
    		/* 1110aaaa 10bbbbbb 10cccccc */
    		s[0] = 0xE0 | ((r >> 12) & 0x0F);
    		s[1] = 0x80 | ((r >> 6) & 0x3F);
    		s[2] = 0x80 | (r & 0x3F);
    		return 3;
    	}

    	/* 11110aaa 10bbbbbb 10cccccc 10dddddd */
    	s[0] = 0xF0 | ((r >> 18) & 0x07);
    	s[1] = 0x80 | ((r >> 12) & 0x3F);
    	s[2] = 0x80 | ((r >> 6) & 0x3F);
    	s[3] = 0x80 | (r & 0x3F);
    	return 4;
    }
    209
    /* Translate a named entity to its character.
     * e is the entity name without the leading '&' but including the trailing
     * ';' (for example "amp;"). On a match the character and a terminating NUL
     * byte are stored in buf and 1 is returned. Returns -1 when the name is
     * not in the table or when bufsiz is smaller than 2. */
    static int
    namedentitytostr(const char *e, char *buf, size_t bufsiz)
    {
    	static const struct {
    		const char *entity;
    		int c;
    	} entities[] = {
    		{ "amp;",  '&'  },
    		{ "lt;",   '<'  },
    		{ "gt;",   '>'  },
    		{ "apos;", '\'' },
    		{ "quot;", '"'  },
    	};
    	const size_t count = sizeof(entities) / sizeof(entities[0]);
    	size_t idx;

    	/* need room for the character and the NUL byte */
    	if (bufsiz < 2)
    		return -1;

    	/* linear search: stop at the first matching name */
    	for (idx = 0; idx < count && strcmp(e, entities[idx].entity) != 0; idx++)
    		;
    	if (idx == count)
    		return -1;

    	buf[0] = entities[idx].c;
    	buf[1] = '\0';
    	return 1;
    }
    238
    /* Convert a numeric character reference to a NUL-terminated UTF-8 string.
     * e is the reference without the leading "&#": decimal digits, or 'x'
     * followed by hexadecimal digits, terminated by ';' (for example "65;" or
     * "x41;"). Returns the byte length of the string stored in buf (0 for code
     * point 0), or -1 when bufsiz is smaller than 5, the reference is not
     * well-formed, or the value is not a valid code point (above 0x10ffff or a
     * surrogate). */
    static int
    numericentitytostr(const char *e, char *buf, size_t bufsiz)
    {
    	const char *digits;
    	size_t ndigits;
    	long l;
    	int base, len;
    	char *end;

    	/* buffer is too small: up to 4 bytes of UTF-8 plus the NUL byte */
    	if (bufsiz < 5)
    		return -1;

    	/* hex (16) or decimal (10) */
    	if (*e == 'x') {
    		e++;
    		base = 16;
    		digits = "0123456789abcdefABCDEF";
    	} else {
    		base = 10;
    		digits = "0123456789";
    	}

    	/* strtol() on its own also accepts leading whitespace, a sign and, for
    	 * base 16, a "0x" prefix. None of these are allowed in a character
    	 * reference, so require one or more digits directly followed by ';'. */
    	ndigits = strspn(e, digits);
    	if (!ndigits || e[ndigits] != ';')
    		return -1;

    	errno = 0;
    	l = strtol(e, &end, base);
    	/* out of range for long, or not a valid code point */
    	if (errno || end != e + ndigits || l < 0 || l > 0x10ffff ||
    	    (l >= 0xd800 && l <= 0xdfff))
    		return -1;
    	len = codepointtoutf8(l, buf);
    	buf[len] = '\0';

    	return len;
    }
    265
    /* Convert an entity string to the text it stands for.
     * e must start with '&'. When '#' follows, the rest is handled as a numeric
     * entity, otherwise as a named entity. The result is stored in buf.
     * Returns the byte length of the string in buf, or -1 on failure. */
    int
    xml_entitytostr(const char *e, char *buf, size_t bufsiz)
    {
    	/* not an entity at all */
    	if (e[0] != '&')
    		return -1;

    	return (e[1] == '#')
    	    ? numericentitytostr(e + 2, buf, bufsiz) /* skip "&#" */
    	    : namedentitytostr(e + 1, buf, bufsiz);  /* skip "&" */
    }
    280
    /* Parse the input read with GETNEXT() until EOF and report what is found
     * through the callbacks of x that are set (non-NULL):
     *   xmltagstart, xmltagstartparsed and xmltagend for tags,
     *   xmldata and xmldataentity for the text between tags.
     * Attributes, comments and CDATA sections are handled by xml_parseattrs(),
     * xml_parsecomment() and xml_parsecdata().
     * Everything before the first '<' is skipped. The tag name is kept in
     * x->tag / x->taglen and text is buffered in x->data, which is passed on
     * in chunks when it fills up. */
    281void
    282xml_parse(XMLParser *x)
    283{
    284	size_t datalen, tagdatalen;
    285	int c, isend;
    286
    287	while ((c = GETNEXT()) != EOF && c != '<')
    288		; /* skip until < */
    289
       	/* each iteration handles either one tag (c is '<') or the data that
       	 * follows a tag (c is any other character) */
    290	while (c != EOF) {
    291		if (c == '<') { /* parse tag */
    292			if ((c = GETNEXT()) == EOF)
    293				return;
    294
    295			if (c == '!') { /* CDATA and comments */
       				/* keep the first characters after "<!" in x->data to
       				 * recognize "--" and "[CDATA["; anything else is
       				 * skipped up to the next '>' */
    296				for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
    297					/* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
    298					if (tagdatalen <= sizeof("[CDATA[") - 1)
    299						x->data[tagdatalen++] = c;
    300					if (c == '>')
    301						break;
    302					else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
    303							(x->data[0] == '-')) {
    304						xml_parsecomment(x);
    305						break;
    306					} else if (c == '[') {
    307						if (tagdatalen == sizeof("[CDATA[") - 1 &&
    308						    !strncmp(x->data, "[CDATA[", tagdatalen)) {
    309							xml_parsecdata(x);
    310							break;
    311						}
    312					}
    313				}
    314			} else {
    315				/* normal tag (open, short open, close), processing instruction. */
    316				x->tag[0] = c;
    317				x->taglen = 1;
    318				x->isshorttag = isend = 0;
    319
    320				/* treat processing instruction as short tag, don't strip "?" prefix. */
    321				if (c == '?') {
    322					x->isshorttag = 1;
    323				} else if (c == '/') {
       					/* end tag: the name starts after the '/' */
    324					if ((c = GETNEXT()) == EOF)
    325						return;
    326					x->tag[0] = c;
    327					isend = 1;
    328				}
    329
       				/* read the rest of the tag name up to '>' or whitespace;
       				 * a '/' is not stored in the name but marks a short tag */
    330				while ((c = GETNEXT()) != EOF) {
    331					if (c == '/')
    332						x->isshorttag = 1; /* short tag */
    333					else if (c == '>' || ISSPACE(c)) {
    334						x->tag[x->taglen] = '\0';
    335						if (isend) { /* end tag, starts with </ */
    336							if (x->xmltagend)
    337								x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
    338							x->tag[0] = '\0';
    339							x->taglen = 0;
    340						} else {
    341							/* start tag */
    342							if (x->xmltagstart)
    343								x->xmltagstart(x, x->tag, x->taglen);
       							/* attributes can only follow whitespace */
    344							if (ISSPACE(c))
    345								xml_parseattrs(x);
    346							if (x->xmltagstartparsed)
    347								x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
    348						}
    349						/* call tagend for short tag or processing instruction */
    350						if (x->isshorttag) {
    351							if (x->xmltagend)
    352								x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
    353							x->tag[0] = '\0';
    354							x->taglen = 0;
    355						}
    356						break;
    357					} else if (x->taglen < sizeof(x->tag) - 1)
    358						x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
    359				}
    360			}
    361		} else {
    362			/* parse tag data */
    363			datalen = 0;
    364			while ((c = GETNEXT()) != EOF) {
    365				if (c == '&') {
       					/* pass on the data buffered before the entity */
    366					if (datalen) {
    367						x->data[datalen] = '\0';
    368						if (x->xmldata)
    369							x->xmldata(x, x->data, datalen);
    370					}
    371					x->data[0] = c;
    372					datalen = 1;
       					/* collect the entity, including '&' and ';', in x->data */
    373					while ((c = GETNEXT()) != EOF) {
       						/* a tag starts before a ';' was found: the text
       						 * collected so far is passed on as data below */
    374						if (c == '<')
    375							break;
    376						if (datalen < sizeof(x->data) - 1)
    377							x->data[datalen++] = c;
    378						else {
    379							/* entity too long for buffer, handle as normal data */
    380							x->data[datalen] = '\0';
    381							if (x->xmldata)
    382								x->xmldata(x, x->data, datalen);
    383							x->data[0] = c;
    384							datalen = 1;
    385							break;
    386						}
    387						if (c == ';') {
    388							x->data[datalen] = '\0';
    389							if (x->xmldataentity)
    390								x->xmldataentity(x, x->data, datalen);
    391							datalen = 0;
    392							break;
    393						}
    394					}
    395				} else if (c != '<') {
       					/* ordinary character: buffer it, passing the buffer
       					 * on through xmldata when it is full */
    396					if (datalen < sizeof(x->data) - 1) {
    397						x->data[datalen++] = c;
    398					} else {
    399						x->data[datalen] = '\0';
    400						if (x->xmldata)
    401							x->xmldata(x, x->data, datalen);
    402						x->data[0] = c;
    403						datalen = 1;
    404					}
    405				}
       				/* start of the next tag: pass on the remaining data and
       				 * return to the outer loop with c == '<' */
    406				if (c == '<') {
    407					x->data[datalen] = '\0';
    408					if (x->xmldata && datalen)
    409						x->xmldata(x, x->data, datalen);
    410					break;
    411				}
    412			}
    413		}
    414	}
    415}