sfeed

Simple RSS and Atom feed parser
git clone https://git.sinitax.com/codemadness/sfeed
Log | Files | Refs | README | LICENSE | Upstream | sfeed.txt

commit 13927fc6083c3d134e456ccfafb953c6cea17662
parent fc6c2a381742aba4deaf8538fa2c85750361a2d9
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Sat, 27 Feb 2016 16:21:30 +0100

various improvements

- pledge(2) the tools and add a define (USE_PLEDGE) to enable it on platforms that
  support it, currently only OpenBSD 5.9+
- separate getline and parseline functionality.
- use murmur3 hash instead of jenkins1: faster and fewer collisions.
- make some error messages a bit more clear, for example with path truncation.
- some small cleanups, move printutf8pad to util.

Diffstat:
Mconfig.mk | 3+++
Msfeed.c | 3+++
Msfeed_frames.c | 9+++++++--
Msfeed_html.c | 7++++++-
Msfeed_mbox.c | 27+++++++++------------------
Msfeed_opml_import.c | 4++++
Msfeed_plain.c | 31++++++-------------------------
Msfeed_tail.c | 92++++++++++++++++++++-----------------------------------------------------------
Msfeed_web.c | 7+++++--
Msfeed_xmlenc.c | 5+++++
Mutil.c | 101++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
Mutil.h | 12+++++++++++-
12 files changed, 175 insertions(+), 126 deletions(-)

diff --git a/config.mk b/config.mk @@ -22,5 +22,8 @@ LDFLAGS = -s ${LIBS} # -D_POSIX_C_SOURCE=200809L -D_XOPEN_SOURCE=700 -D_BSD_SOURCE #LDFLAGS = -static -s ${LIBS} +# OpenBSD 5.9+: use pledge(2) +#CFLAGS += -DUSE_PLEDGE + # compiler and linker #CC = cc diff --git a/sfeed.c b/sfeed.c @@ -709,6 +709,9 @@ xml_handler_end_el(XMLParser *p, const char *name, size_t namelen, int isshort) int main(int argc, char *argv[]) { + if (pledge("stdio", NULL) == -1) + err(1, "pledge"); + if (argc > 1) baseurl = argv[1]; diff --git a/sfeed_frames.c b/sfeed_frames.c @@ -138,13 +138,15 @@ printfeed(FILE *fpitems, FILE *fpin, struct feed *f) } fputs("<table cellpadding=\"0\" cellspacing=\"0\">\n", fpitems); - while (parseline(&line, &linesize, fields, fpin) > 0) { + while (getline(&line, &linesize, fpin) > 0) { + if (!parseline(line, fields)) + break; /* write content */ if (!(namelen = normalizepath(fields[FieldTitle], name, sizeof(name)))) continue; r = snprintf(filepath, sizeof(filepath), "%s/%s.html", dirpath, name); if (r == -1 || (size_t)r >= sizeof(filepath)) - errx(1, "snprintf: path truncation"); + errx(1, "snprintf: path truncation: '%s/%s.html'", dirpath, name); /* content file doesn't exist yet and has write access */ if (access(filepath, F_OK) != 0) { @@ -215,6 +217,9 @@ main(int argc, char *argv[]) int i; struct feed *f; + if (pledge("stdio rpath wpath cpath", NULL) == -1) + err(1, "pledge"); + if (!(feeds = calloc(argc, sizeof(struct feed *)))) err(1, "calloc"); diff --git a/sfeed_html.c b/sfeed_html.c @@ -31,7 +31,9 @@ printfeed(FILE *fp, struct feed *f) } fputs("<table cellpadding=\"0\" cellspacing=\"0\">\n", stdout); - while (parseline(&line, &linesize, fields, fp) > 0) { + while (getline(&line, &linesize, fp) > 0) { + if (!parseline(line, fields)) + break; parsedtime = 0; strtotime(fields[FieldUnixTimestamp], &parsedtime); @@ -73,6 +75,9 @@ main(int argc, char *argv[]) FILE *fp; int i; + if (pledge(argc == 1 ? 
"stdio" : "stdio rpath", NULL) == -1) + err(1, "pledge"); + if (!(feeds = calloc(argc, sizeof(struct feed *)))) err(1, "calloc"); if ((comparetime = time(NULL)) == -1) diff --git a/sfeed_mbox.c b/sfeed_mbox.c @@ -15,22 +15,7 @@ static char *line; static size_t linesize; static char host[256], *user, mtimebuf[32]; -/* jenkins one-at-a-time hash, used for Message-Id */ -static uint32_t -jenkins1(const char *s) -{ - uint32_t hash = 0; - - for (; *s; s++) { - hash += (int)*s; - hash += (hash << 10); - hash ^= (hash >> 6); - } - hash += (hash << 3); - hash ^= (hash >> 11); - - return hash + (hash << 15); -} +static const uint32_t seed = 0x45931287; /* Unescape / decode fields printed by string_print_encoded() * "\\" to "\", "\t", to TAB, "\n" to newline. Unrecognised escape sequences @@ -79,8 +64,11 @@ printfeed(FILE *fp, const char *feedname) struct tm tm; char *fields[FieldLast], timebuf[32]; time_t parsedtime; + ssize_t linelen; - while (parseline(&line, &linesize, fields, fp) > 0) { + while ((linelen = getline(&line, &linesize, fp)) > 0) { + if (!parseline(line, fields)) + break; parsedtime = 0; strtotime(fields[FieldUnixTimestamp], &parsedtime); /* can't convert: default to formatted time for time_t 0. */ @@ -106,7 +94,7 @@ printfeed(FILE *fp, const char *feedname) user, user, host, fields[FieldTitle], fields[FieldUnixTimestamp], fields[FieldUnixTimestamp][0] ? "." : "", - jenkins1(fields[FieldTitle]), + murmur3_32(line, (size_t)linelen, seed), feedname[0] ? feedname : "unnamed", fields[FieldContentType], feedname); @@ -134,6 +122,9 @@ main(int argc, char *argv[]) char *name; int i; + if (pledge(argc == 1 ? 
"stdio" : "stdio rpath", NULL) == -1) + err(1, "pledge"); + if (!(user = getenv("USER"))) user = "you"; if (gethostname(host, sizeof(host)) == -1) diff --git a/sfeed_opml_import.c b/sfeed_opml_import.c @@ -1,4 +1,5 @@ #include <ctype.h> +#include <err.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> @@ -84,6 +85,9 @@ xml_handler_attrentity(XMLParser *p, const char *tag, size_t taglen, int main(void) { + if (pledge("stdio", NULL) == -1) + err(1, "pledge"); + parser.xmlattr = xml_handler_attr; parser.xmlattrentity = xml_handler_attrentity; parser.xmltagend = xml_handler_end_element; diff --git a/sfeed_plain.c b/sfeed_plain.c @@ -4,7 +4,6 @@ #include <stdlib.h> #include <string.h> #include <time.h> -#include <wchar.h> #include "util.h" @@ -12,36 +11,15 @@ static time_t comparetime; static char *line; static size_t linesize; -/* print `len' columns of characters. If string is shorter pad the rest - * with characters `pad`. */ -static void -printutf8pad(FILE *fp, const char *s, size_t len, int pad) -{ - wchar_t w; - size_t n = 0, i; - int r; - - for (i = 0; *s && n < len; i++, s++) { - if (ISUTF8(*s)) { - if ((r = mbtowc(&w, s, 4)) == -1) - break; - if ((r = wcwidth(w)) == -1) - r = 1; - n += (size_t)r; - } - putc(*s, fp); - } - for (; n < len; n++) - putc(pad, fp); -} - static void printfeed(FILE *fp, const char *feedname) { char *fields[FieldLast]; time_t parsedtime; - while (parseline(&line, &linesize, fields, fp) > 0) { + while (getline(&line, &linesize, fp) > 0) { + if (!parseline(line, fields)) + break; parsedtime = 0; strtotime(fields[FieldUnixTimestamp], &parsedtime); @@ -65,6 +43,9 @@ main(int argc, char *argv[]) char *name; int i; + if (pledge(argc == 1 ? 
"stdio" : "stdio rpath", NULL) == -1) + err(1, "pledge"); + if ((comparetime = time(NULL)) == -1) err(1, "time"); /* 1 day is old news */ diff --git a/sfeed_tail.c b/sfeed_tail.c @@ -27,65 +27,7 @@ struct bucket { static struct bucket *buckets; static struct bucket *bucket; -static char * -estrdup(const char *s) -{ - char *p; - - if (!(p = strdup(s))) - err(1, "strdup"); - return p; -} - -static void * -ecalloc(size_t nmemb, size_t size) -{ - void *p; - - if (!(p = calloc(nmemb, size))) - err(1, "calloc"); - return p; -} - -/* jenkins one-at-a-time hash */ -static uint32_t -jenkins1(const char *s) -{ - uint32_t hash = 0; - - for (; *s; s++) { - hash += (int)*s; - hash += (hash << 10); - hash ^= (hash >> 6); - } - hash += (hash << 3); - hash ^= (hash >> 11); - - return hash + (hash << 15); -} - -/* print `len' columns of characters. If string is shorter pad the rest - * with characters `pad`. */ -static void -printutf8pad(FILE *fp, const char *s, size_t len, int pad) -{ - wchar_t w; - size_t n = 0, i; - int r; - - for (i = 0; *s && n < len; i++, s++) { - if (ISUTF8(*s)) { - if ((r = mbtowc(&w, s, 4)) == -1) - break; - if ((r = wcwidth(w)) == -1) - r = 1; - n += (size_t)r; - } - putc(*s, fp); - } - for (; n < len; n++) - putc(pad, fp); -} +static const uint32_t seed = 0x45931287; static void printfeed(FILE *fp, const char *feedname) @@ -94,29 +36,37 @@ printfeed(FILE *fp, const char *feedname) char *fields[FieldLast]; uint32_t hash; int uniq; + ssize_t n; + + while ((n = getline(&line, &linesize, fp)) > 0) { + if (line[n] == '\n') + line[--n] = '\0'; + hash = murmur3_32(line, n, seed) % BUCKET_SIZE; - while (parseline(&line, &linesize, fields, fp) > 0) { - hash = (jenkins1(fields[FieldUnixTimestamp]) + - jenkins1(fields[FieldId])) % BUCKET_SIZE; for (uniq = 1, match = &(bucket->cols[hash]); match; match = match->next) { /* check for collision, can still be unique. 
*/ - if (match->id && !strcmp(match->id, fields[FieldId]) && - match->timestamp && !strcmp(match->timestamp, fields[FieldUnixTimestamp])) { + if (match->s && match->len == (size_t)n && + !strcmp(line, match->s)) { uniq = 0; break; } /* nonexistent or no collision */ if (!match->next) { - match = match->next = ecalloc(1, sizeof(struct line)); - match->id = estrdup(fields[FieldId]); - match->timestamp = estrdup(fields[FieldUnixTimestamp]); - break; + if (!(match = match->next = calloc(1, sizeof(struct line)))) + err(1, "calloc"); + if (!(match->s = strdup(line))) + err(1, "strdup"); + match->len = (size_t)n; + break; } } + if (!uniq || firsttime) continue; + if (!parseline(line, fields)) + break; if (feedname[0]) printf("%-15.15s %-30.30s", feedname, fields[FieldTimeFormatted]); @@ -132,7 +82,11 @@ main(int argc, char *argv[]) FILE *fp; int i; - bucket = buckets = ecalloc(argc, sizeof(struct bucket)); + if (pledge("stdio", NULL) == -1) + err(1, "pledge"); + + if (!(bucket = buckets = calloc(argc, sizeof(struct bucket)))) + err(1, "calloc"); for (firsttime = (argc > 1); ; firsttime = 0) { if (argc == 1) { printfeed(stdin, ""); diff --git a/sfeed_web.c b/sfeed_web.c @@ -1,4 +1,5 @@ #include <ctype.h> +#include <err.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> @@ -19,10 +20,9 @@ static char abslink[4096], feedlink[4096], basehref[4096], feedtype[256]; static void printfeedtype(const char *s, FILE *fp) { - for (; *s; s++) { + for (; *s; s++) if (!isspace((int)*s)) fputc(*s, fp); - } } static void @@ -90,6 +90,9 @@ xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name, int main(int argc, char *argv[]) { + if (pledge("stdio", NULL) == -1) + err(1, "pledge"); + if (argc > 1) strlcpy(basehref, argv[1], sizeof(basehref)); diff --git a/sfeed_xmlenc.c b/sfeed_xmlenc.c @@ -1,10 +1,12 @@ #include <ctype.h> +#include <err.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <strings.h> +#include "util.h" 
#include "xml.h" static XMLParser parser; @@ -57,6 +59,9 @@ xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name, int main(void) { + if (pledge("stdio", NULL) == -1) + err(1, "pledge"); + parser.xmlattr = xmlattr; parser.xmltagend = xmltagend; parser.xmltagstart = xmltagstart; diff --git a/util.c b/util.c @@ -7,12 +7,25 @@ #include <limits.h> #include <stdarg.h> #include <stdio.h> +#include <stdint.h> #include <stdlib.h> #include <string.h> #include <time.h> +#include <wchar.h> #include "util.h" +#ifndef USE_PLEDGE +int +pledge(const char *promises, const char *paths[]) +{ + (void)promises; + (void)paths; + + return 0; +} +#endif + static void encodehex(unsigned char c, char *s) { @@ -130,7 +143,7 @@ absuri(const char *link, const char *base, char *buf, size_t bufsiz) port[0] ? ":" : "", port); if (r == -1 || (size_t)r >= sizeof(tmp)) - return -1; + return -1; /* error or truncation */ /* relative to root */ if (!ulink.host[0] && ulink.path[0] != '/') { @@ -191,16 +204,13 @@ encodeuri(const char *s, char *buf, size_t bufsiz) * 'line' buffer is allocated using malloc, 'size' will contain the allocated * buffer size. * returns: amount of fields read (>0) or -1 on error. */ -ssize_t -parseline(char **line, size_t *size, char *fields[FieldLast], FILE *fp) +size_t +parseline(char *line, char *fields[FieldLast]) { char *prev, *s; size_t i; - if (getline(line, size, fp) <= 0) - return -1; - - for (prev = *line, i = 0; + for (prev = line, i = 0; (s = strchr(prev, '\t')) && i < FieldLast - 1; i++) { *s = '\0'; @@ -212,7 +222,7 @@ parseline(char **line, size_t *size, char *fields[FieldLast], FILE *fp) for (; i < FieldLast; i++) fields[i] = ""; - return (ssize_t)i; + return i; } /* Parse time to time_t, assumes time_t is signed. */ @@ -267,3 +277,78 @@ xbasename(const char *path) free(p); return b; } + +/* print `len' columns of characters. If string is shorter pad the rest + * with characters `pad`. 
*/ +void +printutf8pad(FILE *fp, const char *s, size_t len, int pad) +{ + wchar_t w; + size_t n = 0, i; + int r; + + for (i = 0; *s && n < len; i++, s++) { + if (ISUTF8(*s)) { + if ((r = mbtowc(&w, s, 4)) == -1) + break; + if ((r = wcwidth(w)) == -1) + r = 1; + n += (size_t)r; + } + putc(*s, fp); + } + for (; n < len; n++) + putc(pad, fp); +} + +uint32_t +murmur3_32(const char *key, uint32_t len, uint32_t seed) +{ + static const uint32_t c1 = 0xcc9e2d51; + static const uint32_t c2 = 0x1b873593; + static const uint32_t r1 = 15; + static const uint32_t r2 = 13; + static const uint32_t m = 5; + static const uint32_t n = 0xe6546b64; + uint32_t hash = seed; + const int nblocks = len / 4; + const uint32_t *blocks = (const uint32_t *) key; + int i; + uint32_t k, k1; + const uint8_t *tail; + + for (i = 0; i < nblocks; i++) { + k = blocks[i]; + k *= c1; + k = ROT32(k, r1); + k *= c2; + + hash ^= k; + hash = ROT32(hash, r2) * m + n; + } + tail = (const uint8_t *) (key + nblocks * 4); + + k1 = 0; + switch (len & 3) { + case 3: + k1 ^= tail[2] << 16; + case 2: + k1 ^= tail[1] << 8; + case 1: + k1 ^= tail[0]; + + k1 *= c1; + k1 = ROT32(k1, r1); + k1 *= c2; + hash ^= k1; + } + + hash ^= len; + hash ^= (hash >> 16); + hash *= 0x85ebca6b; + hash ^= (hash >> 13); + hash *= 0xc2b2ae35; + hash ^= (hash >> 16); + + return hash; +} diff --git a/util.h b/util.h @@ -31,8 +31,18 @@ enum { int absuri(const char *, const char *, char *, size_t); int encodeuri(const char *, char *, size_t); -ssize_t parseline(char **, size_t *, char *[FieldLast], FILE *); +size_t parseline(char *, char *[FieldLast]); int parseuri(const char *, struct uri *, int); +void printutf8pad(FILE *, const char *, size_t, int); int strtotime(const char *, time_t *); char * xbasename(const char *); void xmlencode(const char *, FILE *); + +#ifdef USE_PLEDGE +#include <unistd.h> +#else +int pledge(const char *, const char *[]); +#endif + +#define ROT32(x, y) ((x << y) | (x >> (32 - y))) +uint32_t murmur3_32(const char *, 
uint32_t, uint32_t);