sfeed

Simple RSS and Atom feed parser
git clone https://git.sinitax.com/codemadness/sfeed
Log | Files | Refs | README | LICENSE | Upstream | sfeed.txt

commit 582131202a479c1e678cffa11318022258be445c
parent 5c724b8b1aba860eb8a48dc230fa417014a11ba7
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Fri, 14 Aug 2015 13:47:19 +0200

xml: separate reader context from parser

also:
- rename xmlparser_ prefix to xml_.
- make xml_parse public, this allows a custom reader like a direct mmap,
  see: XMLParser.getnext and (optionally) XMLParser.getnext_data.
- improve the README text.

Diffstat:
M README.xml | 29 +++++++++++++++++++++++------
M sfeed.c | 2 +-
M sfeed_opml_import.c | 2 +-
M sfeed_web.c | 2 +-
M sfeed_xmlenc.c | 2 +-
M xml.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
M xml.h | 21 ++++-----------------
7 files changed, 100 insertions(+), 68 deletions(-)

diff --git a/README.xml b/README.xml @@ -5,7 +5,7 @@ XML parser Dependencies ------------ -- C compiler (C99) +- C compiler (C99). Features @@ -25,19 +25,21 @@ Supports - Short attributes without an explicity set value (<input type="checkbox" checked />). - Comments - CDATA sections. +- Helper function (xml_entitytostr) to convert XML 1.0 / HTML 2.0 named entities + and numeric entities to UTF-8. +- Reading XML from a fd, string buffer or implement a custom reader: + see: XMLParser.getnext and XMLParser.getnext_data. Caveats ------- -- Internally static buffers are used, callbacks like XMLParser.xmldata are +- Internally fixed-size buffers are used, callbacks like XMLParser.xmldata are called multiple times for the same tag if the data size is bigger than the internal buffer size (sizeof(XMLParser.data)). To differentiate between new calls for data you can use the xml*start and xml*end handlers. -- There is no table of (HTML / XML) named entities you should handle this with - the XMLParser.xmldataentity handler yourself. -- The XML is not checked for errors so it will continue parsing invalid XML - data, this is by design. +- The XML is not checked for errors so it will continue parsing XML data, this + is by design. Files used @@ -51,6 +53,20 @@ Interface / API Should be trivial, see xml.c and xml.h and the examples below. +The most minimal implementation to read and parse from fd 0 (stdin) is: + + #include "xml.h" + + static XMLParser x; + + int + main(void) + { + xml_parse_fd(&x, 0); /* xml_parse_string(&x, "<sup />"); */ + + return 0; + } + Examples -------- @@ -60,5 +76,6 @@ sfeed_opml_import.c or sfeed_web.c or sfeed_xmlenc.c License ------- + See LICENSE file. 
diff --git a/sfeed.c b/sfeed.c @@ -737,7 +737,7 @@ main(int argc, char *argv[]) parser.xmltagstart = xml_handler_start_el; parser.xmltagstartparsed = xml_handler_start_el_parsed; - xmlparser_parse_fd(&parser, 0); + xml_parse_fd(&parser, 0); return 0; } diff --git a/sfeed_opml_import.c b/sfeed_opml_import.c @@ -87,7 +87,7 @@ main(void) "# list of feeds to fetch:\n" "feeds() {\n" " # feed <name> <feedurl> [basesiteurl] [encoding]\n", stdout); - xmlparser_parse_fd(&parser, 0); + xml_parse_fd(&parser, 0); fputs("}\n", stdout); return 0; diff --git a/sfeed_web.c b/sfeed_web.c @@ -94,7 +94,7 @@ main(int argc, char *argv[]) parser.xmltagstart = xmltagstart; parser.xmltagstartparsed = xmltagstartparsed; - xmlparser_parse_fd(&parser, 0); + xml_parse_fd(&parser, 0); return found > 0 ? 0: 1; } diff --git a/sfeed_xmlenc.c b/sfeed_xmlenc.c @@ -60,7 +60,7 @@ main(void) parser.xmltagstart = xmltagstart; parser.xmltagend = xmltagend; - xmlparser_parse_fd(&parser, 0); + xml_parse_fd(&parser, 0); return 1; } diff --git a/xml.c b/xml.c @@ -8,54 +8,75 @@ #include "xml.h" +struct xml_context_fd { + char buf[BUFSIZ]; + int readerrno; + int fd; + size_t nread; + size_t offset; +}; + +struct xml_context_string { + const char *str; +}; + +static int +xml_getnext_stdin(XMLParser *x) +{ + return getchar(); +} + static int -xmlparser_string_getnext(XMLParser *x) +xml_getnext_string(XMLParser *x) { - if (!*(x->str)) + struct xml_context_string *d = (struct xml_context_string *)x->getnext_data; + + if (!*(d->str)) return EOF; - return (int)*(x->str++); + return (int)*(d->str++); } static int /* like getc(), but do some smart buffering */ -xmlparser_fd_getnext(XMLParser *x) +xml_getnext_fd(XMLParser *x) { + struct xml_context_fd *d = (struct xml_context_fd *)x->getnext_data; ssize_t r; /* previous read error was set */ - if (x->readerrno) + if (d->readerrno) return EOF; - if (x->readoffset >= x->readlastbytes) { - x->readoffset = 0; + if (d->offset >= d->nread) { + d->offset = 0; again: - r = 
read(x->fd, x->readbuf, sizeof(x->readbuf)); + r = read(d->fd, d->buf, sizeof(d->buf)); if (r == -1) { if (errno == EINTR) goto again; - x->readerrno = errno; - x->readlastbytes = 0; + d->readerrno = errno; + d->nread = 0; return EOF; } else if (!r) { return EOF; } - x->readlastbytes = r; + d->nread = r; } - return (int)x->readbuf[x->readoffset++]; + return (int)d->buf[d->offset++]; } static int -xmlparser_getnext(XMLParser *x) +xml_getnext(XMLParser *x) { return x->getnext(x); } static void -xmlparser_parseattrs(XMLParser *x) +xml_parseattrs(XMLParser *x) { size_t namelen = 0, valuelen; int c, endsep, endname = 0; - while ((c = xmlparser_getnext(x)) != EOF) { + while ((c = xml_getnext(x)) != EOF) { if (isspace(c)) { /* TODO: simplify endname ? */ if (namelen) endname = 1; @@ -82,7 +103,7 @@ xmlparser_parseattrs(XMLParser *x) endsep = c; /* c is end separator */ if (x->xmlattrstart) x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen); - for (valuelen = 0; (c = xmlparser_getnext(x)) != EOF;) { + for (valuelen = 0; (c = xml_getnext(x)) != EOF;) { if (c == '&') { /* entities */ x->data[valuelen] = '\0'; /* call data function with data before entity if there is data */ @@ -90,7 +111,7 @@ xmlparser_parseattrs(XMLParser *x) x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); x->data[0] = c; valuelen = 1; - while ((c = xmlparser_getnext(x)) != EOF) { + while ((c = xml_getnext(x)) != EOF) { if (c == endsep) break; if (valuelen < sizeof(x->data) - 1) @@ -147,7 +168,7 @@ xmlparser_parseattrs(XMLParser *x) } static void -xmlparser_parsecomment(XMLParser *x) +xml_parsecomment(XMLParser *x) { static const char *end = "-->"; size_t datalen = 0, i = 0; @@ -156,7 +177,7 @@ xmlparser_parsecomment(XMLParser *x) if (x->xmlcommentstart) x->xmlcommentstart(x); - while ((c = xmlparser_getnext(x)) != EOF) { + while ((c = xml_getnext(x)) != EOF) { if (c == end[i]) { if (end[++i] == '\0') { /* end */ x->data[datalen] = '\0'; @@ -191,7 +212,7 @@ 
xmlparser_parsecomment(XMLParser *x) } static void -xmlparser_parsecdata(XMLParser *x) +xml_parsecdata(XMLParser *x) { static const char *end = "]]>"; size_t datalen = 0, i = 0; @@ -200,7 +221,7 @@ xmlparser_parsecdata(XMLParser *x) if (x->xmlcdatastart) x->xmlcdatastart(x); - while ((c = xmlparser_getnext(x)) != EOF) { + while ((c = xml_getnext(x)) != EOF) { if (c == end[i]) { if (end[++i] == '\0') { /* end */ x->data[datalen] = '\0'; @@ -351,44 +372,44 @@ xml_entitytostr(const char *e, char *buf, size_t bufsiz) return xml_numericentitytostr(e, buf, bufsiz); } -static void -xmlparser_parse(XMLParser *x) +void +xml_parse(XMLParser *x) { int c, ispi; size_t datalen, tagdatalen, taglen; - while ((c = xmlparser_getnext(x)) != EOF && c != '<') + while ((c = xml_getnext(x)) != EOF && c != '<') ; /* skip until < */ while (c != EOF) { if (c == '<') { /* parse tag */ - if ((c = xmlparser_getnext(x)) == EOF) + if ((c = xml_getnext(x)) == EOF) return; x->tag[0] = '\0'; x->taglen = 0; if (c == '!') { /* cdata and comments */ - for (tagdatalen = 0; (c = xmlparser_getnext(x)) != EOF;) { + for (tagdatalen = 0; (c = xml_getnext(x)) != EOF;) { if (tagdatalen <= sizeof("[CDATA[") - 1) /* if (d < sizeof(x->data)) */ x->data[tagdatalen++] = c; /* TODO: prevent overflow */ if (c == '>') break; else if (c == '-' && tagdatalen == sizeof("--") - 1 && (x->data[0] == '-')) { /* comment */ - xmlparser_parsecomment(x); + xml_parsecomment(x); break; } else if (c == '[') { if (tagdatalen == sizeof("[CDATA[") - 1 && x->data[1] == 'C' && x->data[2] == 'D' && x->data[3] == 'A' && x->data[4] == 'T' && x->data[5] == 'A' && x->data[6] == '[') { /* CDATA */ - xmlparser_parsecdata(x); + xml_parsecdata(x); break; } } } } else { /* normal tag (open, short open, close), processing instruction. 
*/ if (isspace(c)) - while ((c = xmlparser_getnext(x)) != EOF && isspace(c)) + while ((c = xml_getnext(x)) != EOF && isspace(c)) ; if (c == EOF) return; @@ -396,7 +417,7 @@ xmlparser_parse(XMLParser *x) ispi = (c == '?') ? 1 : 0; x->isshorttag = ispi; taglen = 1; - while ((c = xmlparser_getnext(x)) != EOF) { + while ((c = xml_getnext(x)) != EOF) { if (c == '/') /* TODO: simplify short tag? */ x->isshorttag = 1; /* short tag */ else if (c == '>' || isspace(c)) { @@ -411,7 +432,7 @@ xmlparser_parse(XMLParser *x) if (x->xmltagstart) x->xmltagstart(x, x->tag, x->taglen); if (isspace(c)) - xmlparser_parseattrs(x); + xml_parseattrs(x); if (x->xmltagstartparsed) x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag); } @@ -428,7 +449,7 @@ xmlparser_parse(XMLParser *x) datalen = 0; if (x->xmldatastart) x->xmldatastart(x); - while ((c = xmlparser_getnext(x)) != EOF) { + while ((c = xml_getnext(x)) != EOF) { if (c == '&') { if (datalen) { x->data[datalen] = '\0'; @@ -437,7 +458,7 @@ xmlparser_parse(XMLParser *x) } x->data[0] = c; datalen = 1; - while ((c = xmlparser_getnext(x)) != EOF) { + while ((c = xml_getnext(x)) != EOF) { if (c == '<') break; if (datalen < sizeof(x->data) - 1) @@ -477,17 +498,24 @@ xmlparser_parse(XMLParser *x) } void -xmlparser_parse_string(XMLParser *x, const char *s) +xml_parse_string(XMLParser *x, const char *s) { - x->str = s; - x->getnext = xmlparser_string_getnext; - xmlparser_parse(x); + struct xml_context_string ctx = { .str = s }; + + x->getnext = xml_getnext_string; + x->getnext_data = (void *)&ctx; + xml_parse(x); } void -xmlparser_parse_fd(XMLParser *x, int fd) +xml_parse_fd(XMLParser *x, int fd) { - x->fd = fd; - x->getnext = xmlparser_fd_getnext; - xmlparser_parse(x); + struct xml_context_fd ctx; + + memset(&ctx, 0, sizeof(ctx)); + ctx.fd = fd; + + x->getnext = xml_getnext_fd; + x->getnext_data = (void *)&ctx; + xml_parse(x); } diff --git a/xml.h b/xml.h @@ -24,16 +24,7 @@ typedef struct xmlparser { size_t, int); int (*getnext)(struct 
xmlparser *); - - /* for use with xmlparser_parse_fd */ - /* errno set from read(). */ - int readerrno; - int fd; - - /* for use with "read" from string: xmlparser_parse_string */ - const char *str; - - /* private; internal state */ + void *getnext_data; /* custom data for getnext */ /* current tag */ char tag[1024]; @@ -44,11 +35,6 @@ typedef struct xmlparser { char name[256]; /* data buffer used for tag data, cdata and attribute data */ char data[BUFSIZ]; - - size_t readoffset; - size_t readlastbytes; - /* read buffer used by xmlparser_parse_fd */ - unsigned char readbuf[BUFSIZ]; } XMLParser; int xml_codepointtoutf8(uint32_t, uint32_t *); @@ -56,5 +42,6 @@ ssize_t xml_entitytostr(const char *, char *, size_t); ssize_t xml_namedentitytostr(const char *, char *, size_t); ssize_t xml_numericetitytostr(const char *, char *, size_t); -void xmlparser_parse_fd(XMLParser *, int); -void xmlparser_parse_string(XMLParser *, const char *); +void xml_parse(XMLParser *); +void xml_parse_fd(XMLParser *, int); +void xml_parse_string(XMLParser *, const char *);