sfeed

Simple RSS and Atom feed parser
git clone https://git.sinitax.com/codemadness/sfeed
Log | Files | Refs | README | LICENSE | Upstream | sfeed.txt

commit e771e43d51830ec7d2a19d9d4e67cded83c1b302
parent f054e581dac4921b302e0459a40d1b4f1fbd28ae
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Wed, 21 Oct 2020 22:06:58 +0200

sfeed_web: attribute parsing improvements, improve man page

Fix attribute parsing and decode entities in attribute values. The following
now works (from helsinkitimes.fi):

	<base href="https://www.helsinkitimes.fi/" />
	<link href="/?format=feed&amp;type=rss" rel="alternate" type="application/rss+xml" title="RSS 2.0" />
	<link href="/?format=feed&amp;type=atom" rel="alternate" type="application/atom+xml" title="Atom 1.0" />

Properly associate attributes with the actual tag; the following now parses
properly (from ascii.jp):

	<link rel="apple-touch-icon-precomposed" href="/img/apple-touch-icon.png" />
	<link rel="alternate" type="application/rss+xml"  />

Diffstat:
M sfeed_web.1 | 13 ++++++-------
M sfeed_web.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------
2 files changed, 72 insertions(+), 35 deletions(-)

diff --git a/sfeed_web.1 b/sfeed_web.1
@@ -1,4 +1,4 @@
-.Dd March 15, 2020
+.Dd October 22, 2020
 .Dt SFEED_WEB 1
 .Os
 .Sh NAME
@@ -21,13 +21,12 @@ url<TAB>content\-type<newline>
 .Bl -tag -width Ds
 .It url
 Found relative or absolute url.
-If the url is relative and the
+.Pp
+For relative urls if a <base href="..." /> tag is found it will be used,
+otherwise if the
 .Ar baseurl
-option is
-specified then the url is made absolute.
-If the url is relative and no
-.Ar baseurl
-option is specified then it is printed as is.
+option is specified then that is used, if neither are set then the relative url
+is printed.
 .It content\-type
 Usually application/atom+xml or application/rss+xml.
 .El
diff --git a/sfeed_web.c b/sfeed_web.c
@@ -10,65 +10,101 @@
 #define STRP(s) s,sizeof(s)-1
 
 static XMLParser parser;
-static int isbase, islink, isfeedlink;
-static char abslink[4096], feedlink[4096], basehref[4096], feedtype[256];
+static int isbasetag, islinktag, ishrefattr, istypeattr;
+static char linkhref[4096], linktype[256], basehref[4096];
+static char abslink[4096];
 
 static void
-printfeedtype(const char *s, FILE *fp)
+printvalue(const char *s)
 {
 	for (; *s; s++)
-		if (!isspace((unsigned char)*s))
-			fputc(*s, fp);
+		if (!iscntrl((unsigned char)*s))
+			putchar(*s);
 }
 
 static void
 xmltagstart(XMLParser *p, const char *t, size_t tl)
 {
-	isbase = islink = isfeedlink = 0;
-	feedlink[0] = '\0';
+	isbasetag = islinktag = 0;
 
 	if (!strcasecmp(t, "base"))
-		isbase = 1;
+		isbasetag = 1;
 	else if (!strcasecmp(t, "link"))
-		islink = 1;
+		islinktag = 1;
 }
 
 static void
 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
 {
-	if (!isfeedlink)
+	if (!islinktag)
 		return;
 
-	if (absuri(abslink, sizeof(abslink), feedlink, basehref) != -1)
-		fputs(abslink, stdout);
+	if (strncasecmp(linktype, STRP("application/atom")) &&
+	    strncasecmp(linktype, STRP("application/xml")) &&
+	    strncasecmp(linktype, STRP("application/rss")))
+		return;
+
+	if (absuri(abslink, sizeof(abslink), linkhref, basehref) != -1)
+		printvalue(abslink);
 	else
-		fputs(feedlink, stdout);
+		printvalue(linkhref);
 	putchar('\t');
-	printfeedtype(feedtype, stdout);
+	printvalue(linktype);
 	putchar('\n');
 }
 
 static void
+xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *a, size_t al)
+{
+	ishrefattr = istypeattr = 0;
+
+	if (!isbasetag && !islinktag)
+		return;
+
+	if (!strcasecmp(a, "href")) {
+		ishrefattr = 1;
+		if (isbasetag)
+			basehref[0] = '\0';
+		else if (islinktag)
+			linkhref[0] = '\0';
+	} else if (!strcasecmp(a, "type") && islinktag) {
+		istypeattr = 1;
+		linktype[0] = '\0';
+	}
+}
+
+static void
 xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
         const char *v, size_t vl)
 {
-	if (isbase) {
-		if (!strcasecmp(n, "href"))
-			strlcpy(basehref, v, sizeof(basehref));
-	} else if (islink) {
-		if (!strcasecmp(n, "type")) {
-			if (!strncasecmp(v, STRP("application/atom")) ||
-			    !strncasecmp(v, STRP("application/xml")) ||
-			    !strncasecmp(v, STRP("application/rss"))) {
-				isfeedlink = 1;
-				strlcpy(feedtype, v, sizeof(feedtype));
-			}
-		} else if (!strcasecmp(n, "href")) {
-			strlcpy(feedlink, v, sizeof(feedlink));
-		}
+	if (isbasetag && ishrefattr) {
+		strlcat(basehref, v, sizeof(basehref));
+	} else if (islinktag) {
+		if (ishrefattr)
+			strlcat(linkhref, v, sizeof(linkhref));
+		else if (istypeattr)
+			strlcat(linktype, v, sizeof(linktype));
 	}
 }
 
+static void
+xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *a, size_t al,
+              const char *v, size_t vl)
+{
+	char buf[16];
+	int len;
+
+	if (!ishrefattr && !istypeattr)
+		return;
+
+	/* try to translate entity, else just pass as data to
+	 * xmlattr handler. */
+	if ((len = xml_entitytostr(v, buf, sizeof(buf))) > 0)
+		xmlattr(p, t, tl, a, al, buf, (size_t)len);
+	else
+		xmlattr(p, t, tl, a, al, v, vl);
+}
+
 int
 main(int argc, char *argv[])
 {
@@ -79,6 +115,8 @@ main(int argc, char *argv[])
 		strlcpy(basehref, argv[1], sizeof(basehref));
 
 	parser.xmlattr = xmlattr;
+	parser.xmlattrentity = xmlattrentity;
+	parser.xmlattrstart = xmlattrstart;
 	parser.xmltagstart = xmltagstart;
 	parser.xmltagstartparsed = xmltagstartparsed;
 