sfeed

Simple RSS and Atom feed parser
git clone https://git.sinitax.com/codemadness/sfeed
Log | Files | Refs | README | LICENSE | Upstream | sfeed.txt

commit 356e7d79925f91b9b703ee63e3680694c53a59a4
parent eb586eda26967183de91c314a57d323b124110bb
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Fri, 31 Jul 2015 21:06:52 +0200

Various improvements

- Only escape characters in the "content" field; it can contain newlines.
- Trim newlines and tabs, etc from the title, id and author fields.
- Make decodefield, xmlencode functions easier to "chain" without allocating
  new buffers.
- Move printutf8pad from util (only used by sfeed_plain) to sfeed_plain.
- Update README, still need to update the man-page and improve the documentation
  in general.
- Code cleanup.

Diffstat:
MREADME | 23++++++++++++++---------
Msfeed.c | 172++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
Msfeed_frames.c | 21++++++++++++---------
Msfeed_html.c | 14+++++++-------
Msfeed_mbox.c | 15++++++++++-----
Msfeed_plain.c | 32+++++++++++++++++++++++++++++---
Mutil.c | 105++++++++++++++++++++++++++-----------------------------------------------------
Mutil.h | 8+++-----
8 files changed, 214 insertions(+), 176 deletions(-)

diff --git a/README b/README @@ -78,25 +78,30 @@ feeds.new - Temporary file used by sfeed_update to merge items. TAB-separated format -------------------- -The items are saved in a TSV-like format except newlines, tabs and -backslash are escaped with \ (\n, \t and \\). Other whitespace except -spaces are removed. +The items are saved in a TSV-like format. + +The fields: title, id, author are not allowed to have newlines, tabs, all +whitespace is replaced by a single space character. Control characters are +removed. + +The content field can contain newlines and is escaped. TABs, newline and '\' +are escaped with '\', so: '\n', '\t', and '\\'. Other whitespace characters +except space are removed. Control characters are also removed. The timestamp field is converted to a UNIX timestamp. The timestamp is also -stored as formatted as a separate field. The other fields are left untouched -(including HTML). +stored as formatted as a separate field. The order and format of the fields are: -item UNIX timestamp - string UNIX timestamp (UTC+0) +item UNIX timestamp - string UNIX timestamp (UTC+0). item formatted timestamp - string timestamp, YYYY-mm-dd HH:MM:SS (UTC[+-]HH:MM)|tz item title - string -item link - string, absolute url, unsafe characters are encoded +item link - string, absolute url, unsafe characters are encoded. item content - string -item contenttype - string, "html" or "plain" +item contenttype - string, "html" or "plain". item id - string item author - string -feed type - string, "rss" or "atom" +feed type - string, "rss" or "atom". CAVEAT: if a timezone is not supported (non-RFC-822) the UNIX timestamp is interpreted as UTC+0. 
diff --git a/sfeed.c b/sfeed.c @@ -18,16 +18,16 @@ /* length of string */ #define STRSIZ(s) (sizeof(s)-1) -enum { FeedTypeNone = 0, FeedTypeRSS = 1, FeedTypeAtom = 2 }; +enum FeedType { FeedTypeNone = 0, FeedTypeRSS = 1, FeedTypeAtom = 2 }; static const char *feedtypes[] = { "", "rss", "atom" }; -enum { ContentTypeNone = 0, ContentTypePlain = 1, ContentTypeHTML = 2 }; +enum ContentType { ContentTypeNone = 0, ContentTypePlain = 1, ContentTypeHTML = 2 }; static const char *contenttypes[] = { "", "plain", "html" }; static const int FieldSeparator = '\t'; /* output field seperator character */ static const char *baseurl = ""; -enum { +enum TagId { TagUnknown = 0, /* RSS */ RSSTagDcdate, RSSTagPubdate, RSSTagTitle, @@ -59,9 +59,9 @@ typedef struct feeditem { } FeedItem; typedef struct feedtag { - char *name; - size_t namelen; - int id; + char *name; + size_t namelen; + enum TagId id; } FeedTag; typedef struct feedcontext { @@ -75,7 +75,7 @@ typedef struct feedcontext { int attrcount; } FeedContext; -static int gettag(int, const char *, size_t); +static enum TagId gettag(enum FeedType, const char *, size_t); static int gettimetz(const char *, char *, size_t, int *); static int isattr(const char *, size_t, const char *, size_t); static int istag(const char *, size_t, const char *, size_t); @@ -85,7 +85,8 @@ static void string_append(String *, const char *, size_t); static void string_buffer_init(String *, size_t); static void string_buffer_realloc(String *, size_t); static void string_clear(String *); -static void string_print(String *); +static void string_print_encoded(String *); +static void string_print_trimmed(String *); static void xml_handler_attr(XMLParser *, const char *, size_t, const char *, size_t, const char *, size_t); static void xml_handler_attr_start(XMLParser *, const char *, size_t, @@ -104,8 +105,8 @@ static FeedContext ctx; static XMLParser parser; /* XML parser state */ /* unique number for parsed tag (faster comparison) */ -static int -gettag(int 
feedtype, const char *name, size_t namelen) +static enum TagId +gettag(enum FeedType feedtype, const char *name, size_t namelen) { /* RSS, alphabetical order */ static FeedTag rsstag[] = { @@ -138,24 +139,29 @@ gettag(int feedtype, const char *name, size_t namelen) if (namelen < 2 || namelen > 15) return TagUnknown; - if (feedtype == FeedTypeRSS) { + switch (feedtype) { + case FeedTypeRSS: for (i = 0; rsstag[i].name; i++) { if (!(n = strncasecmp(rsstag[i].name, name, rsstag[i].namelen))) - return rsstag[i].id; + return rsstag[i].id; /* found */ /* optimization: it's sorted so nothing after it matches. */ if (n > 0) return TagUnknown; } - } else if (feedtype == FeedTypeAtom) { + break; + case FeedTypeAtom: for (i = 0; atomtag[i].name; i++) { if (!(n = strncasecmp(atomtag[i].name, name, atomtag[i].namelen))) - return atomtag[i].id; + return atomtag[i].id; /* found */ /* optimization: it's sorted so nothing after it matches. */ if (n > 0) return TagUnknown; } + break; + default: + return TagUnknown; } - return TagUnknown; + return TagUnknown; /* NOTREACHED */ } /* clear string only; don't free, prevents unnecessary reallocation */ @@ -334,23 +340,26 @@ parsetime(const char *s, char *buf, size_t bufsiz, time_t *tp) return -1; } -/* print text, escape tabs, newline and carriage return etc */ +/* Print text, encode TABs, newlines and '\', remove other whitespace. + * Remove leading and trailing whitespace. */ static void -string_print(String *s) +string_print_encoded(String *s) { const char *p, *e; /* skip leading whitespace */ - p = trimstart(s->data); - e = trimend(p); + for (p = s->data; *p && isspace((int)*p); p++) + ; + /* seek offset of trailing whitespace */ + for (e = p + strlen(p); e > p && isspace((int)*(e - 1)); e--) + ; for (; *p && p != e; p++) { - /* isspace(c) && c != ' '. 
*/ - if (((unsigned)*p - '\t') < 5) { - switch(*p) { - case '\n': fputs("\\n", stdout); break; + if (isspace((int)*p) && *p != ' ') { + switch (*p) { + case '\n': fputs("\\n", stdout); break; case '\\': fputs("\\\\", stdout); break; - case '\t': fputs("\\t", stdout); break; + case '\t': fputs("\\t", stdout); break; default: break; /* ignore other whitespace chars */ } } else if (!iscntrl((int)*p)) { /* ignore control chars */ @@ -359,6 +368,29 @@ string_print(String *s) } } +/* Print text, replace TABs, carriage return and other whitespace with ' '. + * Other control chars are removed. Remove leading and trailing whitespace. */ +static void +string_print_trimmed(String *s) +{ + const char *p, *e; + + /* skip leading whitespace */ + for (p = s->data; *p && isspace((int)*p); p++) + ; + /* seek offset of trailing whitespace */ + for (e = p + strlen(p); e > p && isspace((int)*(e - 1)); e--) + ; + + for (; *p && p != e; p++) { + if (isspace((int)*p)) + putchar(' '); + else if (!iscntrl((int)*p)) + /* ignore other control chars */ + putchar((int)*p); + } +} + static void printfields(void) { @@ -376,19 +408,19 @@ printfields(void) if (r != -1) fputs(timebuf, stdout); putchar(FieldSeparator); - string_print(&ctx.item.title); + string_print_trimmed(&ctx.item.title); putchar(FieldSeparator); /* always print absolute urls */ if (absuri(ctx.item.link.data, baseurl, link, sizeof(link)) != -1) fputs(link, stdout); putchar(FieldSeparator); - string_print(&ctx.item.content); + string_print_encoded(&ctx.item.content); putchar(FieldSeparator); fputs(contenttypes[ctx.item.contenttype], stdout); putchar(FieldSeparator); - string_print(&ctx.item.id); + string_print_trimmed(&ctx.item.id); putchar(FieldSeparator); - string_print(&ctx.item.author); + string_print_trimmed(&ctx.item.author); putchar(FieldSeparator); fputs(feedtypes[ctx.item.feedtype], stdout); putchar('\n'); @@ -555,53 +587,59 @@ xml_handler_start_element(XMLParser *p, const char *name, size_t namelen) /* tag already set: 
return */ if (ctx.tag[0] != '\0') return; + /* in item */ strlcpy(ctx.tag, name, sizeof(ctx.tag)); /* NOTE: truncation ignored */ ctx.taglen = namelen; ctx.tagid = gettag(ctx.item.feedtype, ctx.tag, ctx.taglen); - if (ctx.tagid == TagUnknown) - ctx.field = NULL; - if (ctx.item.feedtype == FeedTypeRSS) { - if (ctx.tagid == RSSTagPubdate || ctx.tagid == RSSTagDcdate) + switch (ctx.tagid) { + case RSSTagPubdate: + case RSSTagDcdate: + ctx.field = &ctx.item.timestamp; + break; + case AtomTagPublished: + case AtomTagUpdated: + /* prefer published over updated if set */ + if (ctx.tagid != AtomTagUpdated || !ctx.item.timestamp.len) { ctx.field = &ctx.item.timestamp; - else if (ctx.tagid == RSSTagTitle) - ctx.field = &ctx.item.title; - else if (ctx.tagid == RSSTagLink) - ctx.field = &ctx.item.link; - else if (ctx.tagid == RSSTagDescription || - ctx.tagid == RSSTagContentencoded) { - /* ignore, prefer content:encoded over description */ - if (ctx.tagid != RSSTagDescription || !ctx.item.content.len) { - ctx.iscontenttag = 1; - ctx.field = &ctx.item.content; - } - } else if (ctx.tagid == RSSTagGuid) { - ctx.field = &ctx.item.id; - } else if (ctx.tagid == RSSTagAuthor || ctx.tagid == RSSTagDccreator) { - ctx.field = &ctx.item.author; } - } else if (ctx.item.feedtype == FeedTypeAtom) { - if (ctx.tagid == AtomTagPublished || ctx.tagid == AtomTagUpdated) { - /* ignore, prefer published over updated */ - if (ctx.tagid != AtomTagUpdated || !ctx.item.timestamp.len) { - ctx.field = &ctx.item.timestamp; - } - } else if (ctx.tagid == AtomTagTitle) { - ctx.field = &ctx.item.title; - } else if (ctx.tagid == AtomTagSummary || ctx.tagid == AtomTagContent) { - /* ignore, prefer content:encoded over description */ - if (ctx.tagid != AtomTagSummary || !ctx.item.content.len) { - ctx.iscontenttag = 1; - ctx.field = &ctx.item.content; - } - } else if (ctx.tagid == AtomTagId) { - ctx.field = &ctx.item.id; - } else if (ctx.tagid == AtomTagLink) { - ctx.field = &ctx.item.link; - } else if 
(ctx.tagid == AtomTagAuthor) { - ctx.field = &ctx.item.author; + break; + case RSSTagTitle: + case AtomTagTitle: + ctx.field = &ctx.item.title; + break; + case RSSTagLink: + case AtomTagLink: + ctx.field = &ctx.item.link; + break; + case RSSTagDescription: + case RSSTagContentencoded: + /* prefer content:encoded over description if set */ + if (ctx.tagid != RSSTagDescription || !ctx.item.content.len) { + ctx.iscontenttag = 1; + ctx.field = &ctx.item.content; } + break; + case AtomTagSummary: + case AtomTagContent: + /* prefer content over summary if set */ + if (ctx.tagid != AtomTagSummary || !ctx.item.content.len) { + ctx.iscontenttag = 1; + ctx.field = &ctx.item.content; + } + break; + case RSSTagGuid: + case AtomTagId: + ctx.field = &ctx.item.id; + break; + case RSSTagAuthor: + case RSSTagDccreator: + case AtomTagAuthor: + ctx.field = &ctx.item.author; + break; + default: + ctx.field = NULL; } /* clear field */ if (ctx.field) diff --git a/sfeed_frames.c b/sfeed_frames.c @@ -83,11 +83,11 @@ printfeed(FILE *fpitems, FILE *fpin, struct feed *f) /* menu if not unnamed */ if (f->name[0]) { fputs("<h2 id=\"", fpitems); - printxmlencoded(f->name, fpitems); + print(f->name, fpitems, xmlencode); fputs("\"><a href=\"#", fpitems); - printxmlencoded(f->name, fpitems); + print(f->name, fpitems, xmlencode); fputs("\">", fpitems); - printxmlencoded(f->name, fpitems); + print(f->name, fpitems, xmlencode); fputs("</a></h2>\n", fpitems); } @@ -108,11 +108,14 @@ printfeed(FILE *fpitems, FILE *fpin, struct feed *f) "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" /></head>\n" "<body class=\"frame\"><div class=\"content\">" "<h2><a href=\"", fpcontent); - printxmlencoded(fields[FieldLink], fpcontent); + print(fields[FieldLink], fpcontent, xmlencode); fputs("\">", fpcontent); - printxmlencoded(fields[FieldTitle], fpcontent); + print(fields[FieldTitle], fpcontent, xmlencode); fputs("</a></h2>", fpcontent); - printcontent(fields[FieldContent], fpcontent); + /* 
NOTE: this prints the raw HTML of the feed, this is + * potentially dangerous, it is up to the user / browser + * to trust a feed it's HTML content. */ + decodefield(fields[FieldContent], fpcontent, fputc); fputs("</div></body></html>", fpcontent); fclose(fpcontent); } @@ -141,7 +144,7 @@ printfeed(FILE *fpitems, FILE *fpin, struct feed *f) fputs("<a href=\"", fpitems); fputs(filepath, fpitems); fputs("\" target=\"content\">", fpitems); - printxmlencoded(fields[FieldTitle], fpitems); + print(fields[FieldTitle], fpitems, xmlencode); fputs("</a>", fpitems); if (isnew) fputs("</u></b>", fpitems); @@ -210,11 +213,11 @@ main(int argc, char *argv[]) fputs("<a class=\"n\" href=\"items.html#", fpmenu); else fputs("<a href=\"items.html#", fpmenu); - printxmlencoded(f->name, fpmenu); + print(f->name, fpmenu, xmlencode); fputs("\" target=\"items\">", fpmenu); if (f->totalnew > 0) fputs("<b><u>", fpmenu); - printxmlencoded(f->name, fpmenu); + print(f->name, fpmenu, xmlencode); fprintf(fpmenu, " (%lu)", f->totalnew); if (f->totalnew > 0) fputs("</u></b>", fpmenu); diff --git a/sfeed_html.c b/sfeed_html.c @@ -24,11 +24,11 @@ printfeed(FILE *fp, struct feed *f) if (f->name[0] != '\0') { fputs("<h2 id=\"", stdout); - printxmlencoded(f->name, stdout); + print(f->name, stdout, xmlencode); fputs("\"><a href=\"#", stdout); - printxmlencoded(f->name, stdout); + print(f->name, stdout, xmlencode); fputs("\">", stdout); - printxmlencoded(f->name, stdout); + print(f->name, stdout, xmlencode); fputs("</a></h2>\n", stdout); } fputs("<table cellpadding=\"0\" cellspacing=\"0\">\n", stdout); @@ -53,10 +53,10 @@ printfeed(FILE *fp, struct feed *f) fputs("<b><u>", stdout); if (islink) { fputs("<a href=\"", stdout); - printxmlencoded(fields[FieldLink], stdout); + print(fields[FieldLink], stdout, xmlencode); fputs("\">", stdout); } - printxmlencoded(fields[FieldTitle], stdout); + print(fields[FieldTitle], stdout, xmlencode); if (islink) fputs("</a>", stdout); if (isnew) @@ -126,11 +126,11 @@ 
main(int argc, char *argv[]) fputs("<li class=\"n\"><a href=\"#", stdout); else fputs("<li><a href=\"#", stdout); - printxmlencoded(f->name, stdout); + print(f->name, stdout, xmlencode); fputs("\">", stdout); if (f->totalnew > 0) fputs("<b><u>", stdout); - printxmlencoded(f->name, stdout); + print(f->name, stdout, xmlencode); fprintf(stdout, " (%lu)", f->totalnew); if (f->totalnew > 0) fputs("</u></b>", stdout); diff --git a/sfeed_mbox.c b/sfeed_mbox.c @@ -60,12 +60,17 @@ printfeed(FILE *fp, const char *feedname) fields[FieldContentType], feedname); if (!strcmp(fields[FieldContentType], "html")) { - printf("<p>Link: <a href=\"%s\">%s</a></p>\n\n", - fields[FieldLink], fields[FieldLink]); - printcontent(fields[FieldContent], stdout); + fputs("<p>Link: <a href=\"", stdout); + decodefield(fields[FieldLink], stdout, fputc); + fputs("\">", stdout); + decodefield(fields[FieldLink], stdout, fputc); + fputs("</a></p>\n\n", stdout); + decodefield(fields[FieldContent], stdout, fputc); } else { - printf("Link: %s\n\n", fields[FieldLink]); - printcontent(fields[FieldContent], stdout); + fputs("Link: ", stdout); + decodefield(fields[FieldLink], stdout, fputc); + fputs("\n\n", stdout); + decodefield(fields[FieldContent], stdout, fputc); } fputs("\n\n", stdout); } diff --git a/sfeed_plain.c b/sfeed_plain.c @@ -1,8 +1,10 @@ +#include <ctype.h> #include <err.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <time.h> +#include <wchar.h> #include "util.h" @@ -10,6 +12,32 @@ static time_t comparetime; static char *line = NULL; static size_t size = 0; +/* print `len' columns of characters. If string is shorter pad the rest + * with characters `pad`. 
*/ +static void +printutf8pad(FILE *fp, const char *s, size_t len, int pad) +{ + wchar_t w; + size_t n = 0, i; + int r; + + for (i = 0; *s && n < len; i++, s++) { + /* skip control characters */ + if (iscntrl(*s)) + continue; + if (ISUTF8(*s)) { + if ((r = mbtowc(&w, s, 4)) == -1) + break; + if ((r = wcwidth(w)) == -1) + r = 1; + n += (size_t)r; + } + putc(*s, fp); + } + for (; n < len; n++) + putc(pad, fp); +} + static void printfeed(FILE *fp, const char *feedname) { @@ -27,9 +55,7 @@ printfeed(FILE *fp, const char *feedname) printf("%-15.15s ", feedname); printf(" %-30.30s ", fields[FieldTimeFormatted]); printutf8pad(stdout, fields[FieldTitle], 70, ' '); - fputs(" ", stdout); - fputs(fields[FieldLink], stdout); - putchar('\n'); + printf(" %s\n", fields[FieldLink]); } } diff --git a/util.c b/util.c @@ -10,7 +10,6 @@ #include <stdlib.h> #include <string.h> #include <time.h> -#include <wchar.h> #include "util.h" @@ -73,7 +72,8 @@ readpath: return strlcat(u->path, p, sizeof(u->path)) >= sizeof(u->path) ? -1 : 0; } -/* get absolute uri; if `link` is relative use `base` to make it absolute. */ +/* get absolute uri; if `link` is relative use `base` to make it absolute. + * the returned string in `buf` is uri encoded, see: encodeuri(). 
*/ int absuri(const char *link, const char *base, char *buf, size_t bufsiz) { @@ -185,63 +185,6 @@ parseline(char **line, size_t *size, char **fields, return (int)i; } -char * -trimend(const char *s) -{ - size_t len = strlen(s); - - for (; len > 0 && isspace((int)s[len - 1]); len--) - ; - return (char*)&s[len]; -} - -char * -trimstart(const char *s) -{ - for (; *s && isspace((int)*s); s++) - ; - return (char *)s; -} - -void -printxmlencoded(const char *s, FILE *fp) -{ - for (; *s; s++) { - switch(*s) { - case '<': fputs("&lt;", fp); break; - case '>': fputs("&gt;", fp); break; - case '\'': fputs("&apos;", fp); break; - case '&': fputs("&amp;", fp); break; - case '"': fputs("&quot;", fp); break; - default: - fputc((int)*s, fp); - } - } -} - -/* print `len` columns of characters. If string is shorter pad the rest - * with characters `pad`. */ -void -printutf8pad(FILE *fp, const char *s, size_t len, int pad) -{ - wchar_t w; - size_t n = 0, i; - int r; - - for (i = 0; *s && n < len; i++, s++) { - if (ISUTF8(*s)) { - if ((r = mbtowc(&w, s, 4)) == -1) - break; - if ((r = wcwidth(w)) == -1) - r = 1; - n += (size_t)r; - } - putc(*s, fp); - } - for (; n < len; n++) - putc(pad, fp); -} - /* parse time to time_t, assumes time_t is signed */ int strtotime(const char *s, time_t *t) @@ -257,27 +200,47 @@ strtotime(const char *s, time_t *t) return 0; } -/* print text, ignore tabs, newline and carriage return etc - * print some HTML 2.0 / XML 1.0 as normal text */ void -printcontent(const char *s, FILE *fp) +print(const char *s, FILE *fp, int (*fn)(int, FILE *)) { - const char *p; + for (; *s; s++) + fn((int)*s, fp); +} - for (p = s; *p; p++) { - if (*p == '\\') { - switch (*(++p)) { - case '\\': fputc('\\', fp); break; - case 't': fputc('\t', fp); break; - case 'n': fputc('\n', fp); break; - default: fputc(*p, fp); +/* unescape / decode fields printed by string_print_encode() + * "\\" to "\", "\t", to TAB, "\n" to newline. Unrecognised escape sequences + * are ignored: "\z" etc. 
Call `fn` on each escaped character. */ +void +decodefield(const char *s, FILE *fp, int (*fn)(int, FILE *)) +{ + for (; *s; s++) { + if (*s == '\\') { + switch (*(++s)) { + case '\\': fn('\\', fp); break; + case 't': fn('\t', fp); break; + case 'n': fn('\n', fp); break; + case '\0': return; } } else { - fputc(*p, fp); + fn((int)*s, fp); } } } +/* print some HTML 2.0 / XML 1.0 as normal text */ +int +xmlencode(int c, FILE *fp) +{ + switch(c) { + case '<': return fputs("&lt;", fp); + case '>': return fputs("&gt;", fp); + case '\'': return fputs("&apos;", fp); + case '&': return fputs("&amp;", fp); + case '"': return fputs("&quot;", fp); + } + return fputc(c, fp); +} + /* Some implementations of basename(3) return a pointer to a static * internal buffer (OpenBSD). Others modify the contents of `path` (POSIX). * This is a wrapper function that is compatible with both versions. diff --git a/util.h b/util.h @@ -26,15 +26,13 @@ enum { FieldUnixTimestamp = 0, FieldTimeFormatted, FieldTitle, FieldLink, FieldLast }; int absuri(const char *, const char *, char *, size_t); +void decodefield(const char *, FILE *, int (*)(int, FILE *)); int encodeuri(const char *, char *, size_t); int parseline(char **, size_t *, char **, unsigned int, int, FILE *); int parseuri(const char *, struct uri *, int); -void printcontent(const char *, FILE *); -void printxmlencoded(const char *, FILE *); -void printutf8pad(FILE *, const char *, size_t, int); +void print(const char *, FILE *, int (*)(int, FILE *)); int strtotime(const char *, time_t *); -char * trimstart(const char *); -char * trimend(const char *); char * xbasename(const char *); +int xmlencode(int, FILE *);