sfeed

Simple RSS and Atom feed parser
git clone https://git.sinitax.com/codemadness/sfeed
Log | Files | Refs | README | LICENSE | Upstream | sfeed.txt

commit b3c9ad3cc6a8ad77b9c643aafe3a290b9f67e39d
parent 19cd36545777e20ca03c066d4a29d9c626b86b57
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Sun, 24 Feb 2019 15:25:31 +0100

stricter Atom link parsing

The Atom link parsing is stricter now and checks the rel attribute. When the
rel attribute is absent or empty, it is handled as a normal link ("alternate").

This makes sure that when a link with another type (such as "enclosure",
"related", "self" or "via") is specified before a normal link, it is not used.

sfeed does not handle enclosures, but the code is reworked so that it is very
simple to add this. Enclosures are often used, for example, to attach an image
to a news post or an audio file to a podcast.

Diffstat:
M sfeed.c | 43 +++++++++++++++++++++++++++++++++++--------
1 file changed, 35 insertions(+), 8 deletions(-)

diff --git a/sfeed.c b/sfeed.c @@ -54,6 +54,7 @@ enum TagId { AtomTagMediaDescription, AtomTagSummary, AtomTagContent, AtomTagId, AtomTagLink, + AtomTagLinkAlternate, AtomTagAuthor, TagLast }; @@ -141,8 +142,10 @@ static FeedTag atomtags[] = { { STRP("updated"), AtomTagUpdated } }; -/* map tagid type to RSS/Atom field */ +/* map tagid type to RSS/Atom field + NOTE: all tags must be defined */ static int fieldmap[TagLast] = { + [TagUnknown] = -1, /* RSS */ [RSSTagDcdate] = FeedFieldTime, [RSSTagPubdate] = FeedFieldTime, @@ -162,7 +165,8 @@ static int fieldmap[TagLast] = { [AtomTagSummary] = FeedFieldContent, [AtomTagContent] = FeedFieldContent, [AtomTagId] = FeedFieldId, - [AtomTagLink] = FeedFieldLink, + [AtomTagLink] = -1, + [AtomTagLinkAlternate] = FeedFieldLink, [AtomTagAuthor] = FeedFieldAuthor }; @@ -172,6 +176,9 @@ static const char *baseurl = ""; static FeedContext ctx; static XMLParser parser; /* XML parser state */ +static String atomlink; +static int atomlinktype; + /* Unique tagid for parsed tag name. 
*/ static enum TagId gettag(enum FeedType feedtype, const char *name, size_t namelen) @@ -619,11 +626,16 @@ xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl, ctx.contenttype = ContentTypeHTML; } } else if (ctx.tagid == AtomTagLink && - isattr(n, nl, STRP("href")) && - ctx.field) - { - /* link href attribute */ - string_append(ctx.field, v, vl); + isattr(n, nl, STRP("rel"))) { + /* empty or "alternate": other types could be + "enclosure", "related", "self" or "via" */ + if (!vl || isattr(v, vl, STRP("alternate"))) + atomlinktype = AtomTagLinkAlternate; + else + atomlinktype = 0; + } else if (ctx.tagid == AtomTagLink && + isattr(n, nl, STRP("href"))) { + string_append(&atomlink, v, vl); } } } @@ -731,12 +743,19 @@ xmltagstart(XMLParser *p, const char *t, size_t tl) tagid = gettag(ctx.feedtype, t, tl); ctx.tagid = tagid; + /* without a rel attribute the default link type is "alternate" */ + if (tagid == AtomTagLink) { + atomlinktype = AtomTagLinkAlternate; + string_clear(&atomlink); /* reuse and clear temporary link */ + } + /* map tag type to field: unknown or lesser priority is ignored, when tags of the same type are repeated only the first is used. */ - if (tagid == TagUnknown || tagid <= ctx.fields[fieldmap[tagid]].tagid) { + if (fieldmap[tagid] == -1 || tagid <= ctx.fields[fieldmap[tagid]].tagid) { ctx.field = NULL; return; } + ctx.iscontenttag = (fieldmap[ctx.tagid] == FeedFieldContent); ctx.field = &(ctx.fields[fieldmap[ctx.tagid]].str); ctx.fields[fieldmap[ctx.tagid]].tagid = tagid; @@ -784,6 +803,14 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort) } return; } + } else if (ctx.tagid == AtomTagLink) { + /* map tag type to field: unknown or lesser priority is ignored, + when tags of the same type are repeated only the first is used. 
*/ + if (atomlinktype && atomlinktype > ctx.fields[fieldmap[atomlinktype]].tagid) { + string_append(&ctx.fields[fieldmap[atomlinktype]].str, + atomlink.data, atomlink.len); + ctx.fields[fieldmap[atomlinktype]].tagid = atomlinktype; + } } else if (!ctx.tagid && ((ctx.feedtype == FeedTypeAtom && istag(t, tl, STRP("entry"))) || /* Atom */ (ctx.feedtype == FeedTypeRSS &&