sfeed

Simple RSS and Atom feed parser
git clone https://git.sinitax.com/codemadness/sfeed
Log | Files | Refs | README | LICENSE | Upstream | sfeed.txt

commit 37f5dce067ff8a8445a81b3db53ed3173f39f226
parent 45fd1505051a4c9c8e20c477d4ea62cfd7535276
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Sun, 23 Aug 2015 17:30:50 +0200

simplify field map code (add list tagid -> field.

also:
- parse tag media:description for RSS.
- be more strict about using the order of fields, this is more consistent now.
- remove buffer_init: don't allocate buffers on start.
- realloc, be slightly more aggresive with memory allocating: initial buffer size 16 to 64 bytes.

Diffstat:
Msfeed.c | 283+++++++++++++++++++++++++++++++++++++------------------------------------------
Mutil.h | 8+++++---
2 files changed, 136 insertions(+), 155 deletions(-)

diff --git a/sfeed.c b/sfeed.c @@ -34,18 +34,6 @@ static const char *contenttypes[] = { "", "plain", "html" }; static const int FieldSeparator = '\t'; /* output field seperator character */ static const char *baseurl = ""; -enum TagId { - TagUnknown = 0, - /* RSS */ - RSSTagDcdate, RSSTagPubdate, RSSTagTitle, - RSSTagDescription, RSSTagContentencoded, - RSSTagGuid, RSSTagLink, RSSTagDccreator, RSSTagAuthor, - /* Atom */ - AtomTagPublished, AtomTagUpdated, AtomTagTitle, - AtomTagMediaDescription, AtomTagSummary, AtomTagContent, - AtomTagId, AtomTagLink, AtomTagAuthor -}; - /* String data / memory pool */ typedef struct string { char *data; /* data */ @@ -53,17 +41,26 @@ typedef struct string { size_t bufsiz; /* allocated size */ } String; -/* Feed item */ -typedef struct feeditem { - String timestamp; - String title; - String link; - String content; - enum ContentType contenttype; - String id; - String author; - enum FeedType feedtype; -} FeedItem; +/* NOTE: the order of these fields (content, date, author) indicate the + * priority to used them, from low to high. */ +enum TagId { + TagUnknown = 0, + /* RSS */ + RSSTagDcdate, RSSTagPubdate, + RSSTagTitle, + RSSTagMediaDescription, RSSTagDescription, RSSTagContentEncoded, + RSSTagGuid, + RSSTagLink, + RSSTagAuthor, RSSTagDccreator, + /* Atom */ + AtomTagUpdated, AtomTagPublished, + AtomTagTitle, + AtomTagMediaDescription, AtomTagSummary, AtomTagContent, + AtomTagId, + AtomTagLink, + AtomTagAuthor, + TagLast, +}; typedef struct feedtag { char *name; /* name of tag to match */ @@ -71,13 +68,20 @@ typedef struct feedtag { enum TagId id; /* unique ID */ } FeedTag; +typedef struct field { + String str; + enum TagId tagid; /* tagid set previously, used for tag priority */ +} ItemField; + typedef struct feedcontext { - String *field; /* pointer to current FeedItem field String */ - FeedItem item; /* data for current feed item */ - enum TagId tagid; /* unique number for parsed tag */ - int iscontent; /* in content data */ - int iscontenttag; /* in content tag */ - int attrcount; + String *field; /* current FeedItem field String */ + ItemField fields[FieldLast]; /* data for current feed item */ + enum TagId tagid; /* unique number for parsed tag */ + int iscontent; /* in content data */ + int iscontenttag; /* in content tag */ + int attrcount; + enum ContentType contenttype; /* content-type for current item. */ + enum FeedType feedtype; } FeedContext; static enum TagId gettag(enum FeedType, const char *, size_t); @@ -87,7 +91,6 @@ static int istag(const char *, size_t, const char *, size_t); static int parsetime(const char *, char *, size_t, time_t *); static void printfields(void); static void string_append(String *, const char *, size_t); -static void string_buffer_init(String *, size_t); static void string_buffer_realloc(String *, size_t); static void string_clear(String *); static void string_print_encoded(String *); @@ -109,21 +112,47 @@ static void xml_handler_start_el_parsed(XMLParser *, const char *, static FeedContext ctx; static XMLParser parser; /* XML parser state */ +/* map tag type to field */ +static int fieldmap[TagLast] = { + /* RSS */ + [RSSTagDcdate] = FieldTimeFormatted, + [RSSTagPubdate] = FieldTimeFormatted, + [RSSTagTitle] = FieldTitle, + [RSSTagMediaDescription] = FieldContent, + [RSSTagDescription] = FieldContent, + [RSSTagContentEncoded] = FieldContent, + [RSSTagGuid] = FieldId, + [RSSTagLink] = FieldLink, + [RSSTagAuthor] = FieldAuthor, + [RSSTagDccreator] = FieldAuthor, + /* Atom */ + [AtomTagUpdated] = FieldTimeFormatted, + [AtomTagPublished] = FieldTimeFormatted, + [AtomTagTitle] = FieldTitle, + [AtomTagMediaDescription] = FieldContent, + [AtomTagSummary] = FieldContent, + [AtomTagContent] = FieldContent, + [AtomTagId] = FieldId, + [AtomTagLink] = FieldLink, + [AtomTagAuthor] = FieldAuthor +}; + /* Unique id for parsed tag. */ static enum TagId gettag(enum FeedType feedtype, const char *name, size_t namelen) { /* RSS, alphabetical order */ static FeedTag rsstags[] = { - { STRP("author"), RSSTagAuthor }, - { STRP("content:encoded"), RSSTagContentencoded }, - { STRP("dc:creator"), RSSTagDccreator }, - { STRP("dc:date"), RSSTagDcdate }, - { STRP("description"), RSSTagDescription }, - { STRP("guid"), RSSTagGuid }, - { STRP("link"), RSSTagLink }, - { STRP("pubdate"), RSSTagPubdate }, - { STRP("title"), RSSTagTitle }, + { STRP("author"), RSSTagAuthor }, + { STRP("content:encoded"), RSSTagContentEncoded }, + { STRP("dc:creator"), RSSTagDccreator }, + { STRP("dc:date"), RSSTagDcdate }, + { STRP("description"), RSSTagDescription }, + { STRP("guid"), RSSTagGuid }, + { STRP("media:description"), RSSTagMediaDescription }, + { STRP("link"), RSSTagLink }, + { STRP("pubdate"), RSSTagPubdate }, + { STRP("title"), RSSTagTitle }, { NULL, 0, -1 } }; /* Atom, alphabetical order */ @@ -172,21 +201,12 @@ string_clear(String *s) } static void -string_buffer_init(String *s, size_t len) -{ - if (!(s->data = malloc(len))) - err(1, "malloc"); - s->bufsiz = len; - string_clear(s); -} - -static void string_buffer_realloc(String *s, size_t newlen) { char *p; size_t alloclen; - for (alloclen = 16; alloclen <= newlen; alloclen *= 2) + for (alloclen = 64; alloclen <= newlen; alloclen *= 2) ; if (!(p = realloc(s->data, alloclen))) err(1, "realloc"); @@ -200,7 +220,7 @@ string_append(String *s, const char *data, size_t len) if (!len || *data == '\0') return; /* check if allocation is necesary, don't shrink buffer, - should be more than bufsiz ofcourse */ + * should be more than bufsiz ofcourse. */ if (s->len + len >= s->bufsiz) string_buffer_realloc(s, s->len + len + 1); memcpy(s->data + s->len, data, len); @@ -350,6 +370,9 @@ string_print_encoded(String *s) { const char *p, *e; + if (!s->data || !s->len) + return; + /* skip leading whitespace */ for (p = s->data; *p && isspace((int)*p); p++) ; @@ -378,6 +401,9 @@ string_print_trimmed(String *s) { const char *p, *e; + if (!s->data || !s->len) + return; + /* skip leading whitespace */ for (p = s->data; *p && isspace((int)*p); p++) ; @@ -399,33 +425,35 @@ printfields(void) { char link[4096], timebuf[64]; time_t t; - int r; + int r = -1; /* parse time, timestamp and formatted timestamp field is empty * if the parsed time is invalid */ - r = parsetime((&ctx.item.timestamp)->data, timebuf, - sizeof(timebuf), &t); + if (ctx.fields[FieldTimeFormatted].str.data) + r = parsetime(ctx.fields[FieldTimeFormatted].str.data, timebuf, + sizeof(timebuf), &t); if (r != -1) printf("%ld", (long)t); putchar(FieldSeparator); if (r != -1) fputs(timebuf, stdout); putchar(FieldSeparator); - string_print_trimmed(&ctx.item.title); + string_print_trimmed(&ctx.fields[FieldTitle].str); putchar(FieldSeparator); /* always print absolute urls */ - if (absuri(ctx.item.link.data, baseurl, link, sizeof(link)) != -1) + if (ctx.fields[FieldLink].str.data && + absuri(ctx.fields[FieldLink].str.data, baseurl, link, sizeof(link)) != -1) fputs(link, stdout); putchar(FieldSeparator); - string_print_encoded(&ctx.item.content); + string_print_encoded(&ctx.fields[FieldContent].str); putchar(FieldSeparator); - fputs(contenttypes[ctx.item.contenttype], stdout); + fputs(contenttypes[ctx.contenttype], stdout); putchar(FieldSeparator); - string_print_trimmed(&ctx.item.id); + string_print_trimmed(&ctx.fields[FieldId].str); putchar(FieldSeparator); - string_print_trimmed(&ctx.item.author); + string_print_trimmed(&ctx.fields[FieldAuthor].str); putchar(FieldSeparator); - fputs(feedtypes[ctx.item.feedtype], stdout); + fputs(feedtypes[ctx.feedtype], stdout); putchar('\n'); } @@ -451,12 +479,12 @@ xml_handler_attr(XMLParser *p, const char *tag, size_t taglen, /* handles transforming inline XML to data */ if (ISINCONTENT(ctx)) { - if (ctx.item.contenttype == ContentTypeHTML) + if (ctx.contenttype == ContentTypeHTML) xml_handler_data(p, value, valuelen); return; } - if (ctx.item.feedtype == FeedTypeAtom) { + if (ctx.feedtype == FeedTypeAtom) { if (ISCONTENTTAG(ctx)) { if (isattr(name, namelen, STRP("type")) && (isattr(value, valuelen, STRP("xhtml")) || @@ -464,13 +492,13 @@ xml_handler_attr(XMLParser *p, const char *tag, size_t taglen, isattr(value, valuelen, STRP("html")) || isattr(value, valuelen, STRP("text/html")))) { - ctx.item.contenttype = ContentTypeHTML; + ctx.contenttype = ContentTypeHTML; } } else if (ctx.tagid == AtomTagLink && isattr(name, namelen, STRP("href"))) { /* link href attribute */ - string_append(&ctx.item.link, value, valuelen); + string_append(&ctx.fields[FieldLink].str, value, valuelen); } } } @@ -484,7 +512,7 @@ xml_handler_attr_end(XMLParser *p, const char *tag, size_t taglen, (void)name; (void)namelen; - if (!ISINCONTENT(ctx) || ctx.item.contenttype != ContentTypeHTML) + if (!ISINCONTENT(ctx) || ctx.contenttype != ContentTypeHTML) return; /* handles transforming inline XML to data */ @@ -499,7 +527,7 @@ xml_handler_attr_start(XMLParser *p, const char *tag, size_t taglen, (void)tag; (void)taglen; - if (!ISINCONTENT(ctx) || ctx.item.contenttype != ContentTypeHTML) + if (!ISINCONTENT(ctx) || ctx.contenttype != ContentTypeHTML) return; /* handles transforming inline XML to data */ @@ -522,7 +550,7 @@ xml_handler_cdata(XMLParser *p, const char *s, size_t len) } /* NOTE: this handler can be called multiple times if the data in this - * block is bigger than the buffer. */ + * block is bigger than the buffer. */ static void xml_handler_data(XMLParser *p, const char *s, size_t len) { @@ -564,7 +592,7 @@ xml_handler_start_el(XMLParser *p, const char *name, size_t namelen) if (ISINCONTENT(ctx)) { ctx.attrcount = 0; - if (ctx.item.contenttype == ContentTypeHTML) { + if (ctx.contenttype == ContentTypeHTML) { xml_handler_data(p, "<", 1); xml_handler_data(p, name, namelen); } @@ -572,17 +600,17 @@ xml_handler_start_el(XMLParser *p, const char *name, size_t namelen) } /* start of RSS or Atom item / entry */ - if (ctx.item.feedtype == FeedTypeNone) { + if (ctx.feedtype == FeedTypeNone) { if (istag(name, namelen, STRP("entry"))) { /* Atom */ - ctx.item.feedtype = FeedTypeAtom; + ctx.feedtype = FeedTypeAtom; /* default content type for Atom */ - ctx.item.contenttype = ContentTypePlain; + ctx.contenttype = ContentTypePlain; } else if (istag(name, namelen, STRP("item"))) { /* RSS */ - ctx.item.feedtype = FeedTypeRSS; + ctx.feedtype = FeedTypeRSS; /* default content type for RSS */ - ctx.item.contenttype = ContentTypeHTML; + ctx.contenttype = ContentTypeHTML; } return; } @@ -592,64 +620,23 @@ xml_handler_start_el(XMLParser *p, const char *name, size_t namelen) return; /* in item */ - tagid = gettag(ctx.item.feedtype, name, namelen); - if (tagid != TagUnknown) - ctx.tagid = tagid; - - switch (ctx.tagid) { - case RSSTagPubdate: - case RSSTagDcdate: - ctx.field = &ctx.item.timestamp; - break; - case AtomTagPublished: - case AtomTagUpdated: - /* prefer published over updated if set */ - if (ctx.tagid != AtomTagUpdated || !ctx.item.timestamp.len) - ctx.field = &ctx.item.timestamp; - break; - case RSSTagTitle: - ctx.field = &ctx.item.title; - break; - case AtomTagTitle: - ctx.field = &ctx.item.title; - break; - case RSSTagLink: - case AtomTagLink: - ctx.field = &ctx.item.link; - break; - case RSSTagDescription: - case RSSTagContentencoded: - /* prefer content:encoded over description if set */ - if (ctx.tagid != RSSTagDescription || !ctx.item.content.len) { - ctx.iscontenttag = 1; - ctx.field = &ctx.item.content; - } - break; - case AtomTagMediaDescription: - case AtomTagSummary: - case AtomTagContent: - /* prefer content over summary and media:description if set */ - if ((ctx.tagid != AtomTagMediaDescription && - ctx.tagid != AtomTagSummary) || !ctx.item.content.len) { - ctx.iscontenttag = 1; - ctx.field = &ctx.item.content; - } - break; - case RSSTagGuid: - case AtomTagId: - ctx.field = &ctx.item.id; - break; - case RSSTagAuthor: - case RSSTagDccreator: - case AtomTagAuthor: - ctx.field = &ctx.item.author; - break; - default: + tagid = gettag(ctx.feedtype, name, namelen); + ctx.tagid = tagid; + if (tagid == TagUnknown) { + ctx.field = NULL; + return; + } + + /* map tag type to field */ + if (tagid <= ctx.fields[fieldmap[ctx.tagid]].tagid) { ctx.field = NULL; + return; /* priority */ } + ctx.iscontenttag = (fieldmap[ctx.tagid] == FieldContent); + ctx.field = &(ctx.fields[fieldmap[ctx.tagid]].str); + ctx.fields[fieldmap[ctx.tagid]].tagid = tagid; /* clear field */ - if (ctx.field) - string_clear(ctx.field); + string_clear(ctx.field); } static void @@ -665,7 +652,7 @@ xml_handler_start_el_parsed(XMLParser *p, const char *tag, size_t taglen, return; } - if (!ISINCONTENT(ctx) || ctx.item.contenttype != ContentTypeHTML) + if (!ISINCONTENT(ctx) || ctx.contenttype != ContentTypeHTML) return; if (isshort) @@ -677,39 +664,39 @@ xml_handler_start_el_parsed(XMLParser *p, const char *tag, size_t taglen, static void xml_handler_end_el(XMLParser *p, const char *name, size_t namelen, int isshort) { - if (ctx.item.feedtype == FeedTypeNone) + size_t i; + + if (ctx.feedtype == FeedTypeNone) return; if (ISINCONTENT(ctx)) { /* not close content field */ - if (gettag(ctx.item.feedtype, name, namelen) != ctx.tagid) { - if (!isshort && ctx.item.contenttype == ContentTypeHTML) { + if (gettag(ctx.feedtype, name, namelen) != ctx.tagid) { + if (!isshort && ctx.contenttype == ContentTypeHTML) { xml_handler_data(p, "</", 2); xml_handler_data(p, name, namelen); xml_handler_data(p, ">", 1); } return; } - } else if (!ctx.tagid && ((ctx.item.feedtype == FeedTypeAtom && + } else if (!ctx.tagid && ((ctx.feedtype == FeedTypeAtom && istag(name, namelen, STRP("entry"))) || /* Atom */ - (ctx.item.feedtype == FeedTypeRSS && + (ctx.feedtype == FeedTypeRSS && istag(name, namelen, STRP("item"))))) /* RSS */ { /* end of RSS or Atom entry / item */ printfields(); /* clear strings */ - string_clear(&ctx.item.timestamp); - string_clear(&ctx.item.title); - string_clear(&ctx.item.link); - string_clear(&ctx.item.content); - string_clear(&ctx.item.id); - string_clear(&ctx.item.author); - - ctx.item.feedtype = FeedTypeNone; - ctx.item.contenttype = ContentTypeNone; + for (i = 0; i < FieldLast; i++) { + string_clear(&ctx.fields[i].str); + ctx.fields[i].tagid = TagUnknown; + } + ctx.contenttype = ContentTypeNone; + /* allow parsing of Atom and RSS in one XML stream. */ + ctx.feedtype = FeedTypeNone; } else if (!ctx.tagid || - gettag(ctx.item.feedtype, name, namelen) != ctx.tagid) { + gettag(ctx.feedtype, name, namelen) != ctx.tagid) { /* not end of field */ return; } @@ -725,14 +712,6 @@ main(int argc, char *argv[]) if (argc > 1) baseurl = argv[1]; - /* init strings and initial memory pool size */ - string_buffer_init(&ctx.item.timestamp, 64); - string_buffer_init(&ctx.item.title, 256); - string_buffer_init(&ctx.item.link, 1024); - string_buffer_init(&ctx.item.content, 4096); - string_buffer_init(&ctx.item.id, 1024); - string_buffer_init(&ctx.item.author, 256); - parser.xmlattr = xml_handler_attr; parser.xmlattrend = xml_handler_attr_end; parser.xmlattrstart = xml_handler_attr_start; diff --git a/util.h b/util.h @@ -20,9 +20,11 @@ struct uri { char port[6]; /* numeric port */ }; -enum { FieldUnixTimestamp = 0, FieldTimeFormatted, FieldTitle, FieldLink, - FieldContent, FieldContentType, FieldId, FieldAuthor, FieldFeedType, - FieldLast }; +enum { + FieldUnixTimestamp = 0, FieldTimeFormatted, FieldTitle, + FieldLink, FieldContent, FieldContentType, FieldId, FieldAuthor, + FieldFeedType, FieldLast +}; int absuri(const char *, const char *, char *, size_t); int encodeuri(const char *, char *, size_t);