sfeed

Simple RSS and Atom feed parser
git clone https://git.sinitax.com/codemadness/sfeed
Log | Files | Refs | README | LICENSE | Upstream | sfeed.txt

commit e92d374581061255f6fdb70c68843bc077c33825
parent 847a78083f4437d5110e3877fe8f57e7da2a40ae
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Tue, 29 Mar 2016 10:21:06 +0200

add time parsing to sfeed itself, remove time field

- less overhead (we only need GMT time) so no setenv("TZ", ...) tzset() crap.
- timezone format (for example %z in strptime) is non-standard,

this will add some lines of code and some complexity to our code though, but
the trade-off is worth it imho.

Diffstat:
Msfeed.c | 417++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
Msfeed_frames.c | 10+++++++++-
Msfeed_html.c | 8+++++++-
Msfeed_plain.c | 10++++++++--
Msfeed_tail.c | 15+++++++++++++--
5 files changed, 290 insertions(+), 170 deletions(-)

diff --git a/sfeed.c b/sfeed.c @@ -85,30 +85,31 @@ typedef struct feedcontext { int attrcount; /* count item HTML element attributes */ } FeedContext; +static long long datetounix(long long, int, int, int, int, int); static enum TagId gettag(enum FeedType, const char *, size_t); -static int gettimetz(const char *, int *); -static int isattr(const char *, size_t, const char *, size_t); -static int istag(const char *, size_t, const char *, size_t); -static int parsetime(const char *, time_t *); -static void printfields(void); -static void string_append(String *, const char *, size_t); -static void string_buffer_realloc(String *, size_t); -static void string_clear(String *); -static void string_print_encoded(String *); -static void string_print_trimmed(String *); -static void xml_handler_attr(XMLParser *, const char *, size_t, - const char *, size_t, const char *, size_t); -static void xml_handler_attr_end(XMLParser *, const char *, size_t, +static long long gettzoffset(const char *); +static int isattr(const char *, size_t, const char *, size_t); +static int istag(const char *, size_t, const char *, size_t); +static int parsetime(const char *, time_t *); +static void printfields(void); +static void string_append(String *, const char *, size_t); +static void string_buffer_realloc(String *, size_t); +static void string_clear(String *); +static void string_print_encoded(String *); +static void string_print_trimmed(String *); +static void xml_handler_attr(XMLParser *, const char *, size_t, + const char *, size_t, const char *, size_t); +static void xml_handler_attr_end(XMLParser *, const char *, size_t, + const char *, size_t); +static void xml_handler_attr_start(XMLParser *, const char *, size_t, const char *, size_t); -static void xml_handler_attr_start(XMLParser *, const char *, size_t, - const char *, size_t); -static void xml_handler_cdata(XMLParser *, const char *, size_t); -static void xml_handler_data(XMLParser *, const char *, size_t); -static void xml_handler_data_entity(XMLParser *, const char *, size_t); -static void xml_handler_end_el(XMLParser *, const char *, size_t, int); -static void xml_handler_start_el(XMLParser *, const char *, size_t); -static void xml_handler_start_el_parsed(XMLParser *, const char *, - size_t, int); +static void xml_handler_cdata(XMLParser *, const char *, size_t); +static void xml_handler_data(XMLParser *, const char *, size_t); +static void xml_handler_data_entity(XMLParser *, const char *, size_t); +static void xml_handler_end_el(XMLParser *, const char *, size_t, int); +static void xml_handler_start_el(XMLParser *, const char *, size_t); +static void xml_handler_start_el_parsed(XMLParser *, const char *, + size_t, int); /* map tag name to tagid */ /* RSS, alphabetical order */ @@ -166,7 +167,7 @@ static int fieldmap[TagLast] = { [AtomTagAuthor] = FeedFieldAuthor }; -static const int FieldSeparator = '\t'; /* output field seperator character */ +static const int FieldSeparator = '\t'; static const char *baseurl = ""; static FeedContext ctx; @@ -189,6 +190,7 @@ gettag(enum FeedType feedtype, const char *name, size_t namelen) default: return TagUnknown; } + /* TODO: test if checking for sort order matters performance-wise */ for (i = 0; tags[i].name; i++) if (istag(tags[i].name, tags[i].len, name, namelen)) return tags[i].id; @@ -220,7 +222,7 @@ string_buffer_realloc(String *s, size_t newlen) static void string_append(String *s, const char *data, size_t len) { - if (!len || *data == '\0') + if (!len) return; /* check if allocation is necesary, don't shrink buffer, * should be more than bufsiz ofcourse. */ @@ -231,141 +233,6 @@ string_append(String *s, const char *data, size_t len) s->data[s->len] = '\0'; } -/* Get timezone from string, return as formatted string and time offset, - * for the offset it assumes UTC. - * NOTE: only parses timezones in RFC-822, other timezones are ambiguous - * anyway. If needed you can add some yourself, like "cest", "cet" etc. */ -static int -gettimetz(const char *s, int *tzoffset) -{ - static struct tzone { - char *name; - int offhour; - int offmin; - } tzones[] = { - { "CDT", -5, 0 }, - { "CST", -6, 0 }, - { "EDT", -4, 0 }, - { "EST", -5, 0 }, - { "GMT", 0, 0 }, - { "MDT", -6, 0 }, - { "MST", -7, 0 }, - { "PDT", -7, 0 }, - { "PST", -8, 0 }, - { "UT", 0, 0 }, - { "UTC", 0, 0 }, - { "A", -1, 0 }, - { "M", -12, 0 }, - { "N", 1, 0 }, - { "Y", 12, 0 }, - { "Z", 0, 0 } - }; - char tzbuf[5] = "", *tz = "", c = '+'; - int tzhour = 0, tzmin = 0, r; - size_t i; - - /* skip milliseconds for: %Y-%m-%dT%H:%M:%S.000Z */ - if (*s == '.') { - for (s++; *s && isdigit((int)*s); s++) - ; - } - if (!*s || *s == 'Z' || *s == 'z') - goto time_ok; - /* skip whitespace */ - s = &s[strspn(s, " \t")]; - - /* look until some common timezone delimiters are found */ - for (i = 0; s[i] && isalpha((int)s[i]); i++) - ; - /* copy tz name */ - if (i >= sizeof(tzbuf)) - return -1; /* timezone too long */ - memcpy(tzbuf, s, i); - tzbuf[i] = '\0'; - - if ((sscanf(s, "%c%02d:%02d", &c, &tzhour, &tzmin)) == 3) - ; - else if (sscanf(s, "%c%02d%02d", &c, &tzhour, &tzmin) == 3) - ; - else if (sscanf(s, "%c%d", &c, &tzhour) == 2) - tzmin = 0; - else - tzhour = tzmin = 0; - if (!tzhour && !tzmin) - c = '+'; - - /* compare tz and adjust offset relative to UTC */ - for (i = 0; i < sizeof(tzones) / sizeof(*tzones); i++) { - if (!strcmp(tzbuf, tzones[i].name)) { - tz = "UTC"; - tzhour = tzones[i].offhour; - tzmin = tzones[i].offmin; - c = tzones[i].offhour < 0 ? '-' : '+'; - break; - } - } - tzhour = abs(tzhour); - tzmin = abs(tzmin); - -time_ok: - /* timezone set but non-match */ - if (tzbuf[0] && !tz[0]) { - tzhour = tzmin = 0; - c = '+'; - } - if (tzoffset) - *tzoffset = ((tzhour * 3600) + (tzmin * 60)) * - (c == '-' ? -1 : 1); - return 0; -} - -static char * -parseformat(const char *s, struct tm *tm) -{ - const char *formats[] = { - "%a, %d %b %Y %H:%M:%S", - "%Y-%m-%d %H:%M:%S", - "%Y-%m-%dT%H:%M:%S", - NULL - }; - char *p; - size_t i; - - for (i = 0; formats[i]; i++) - if ((p = strptime(s, formats[i], tm))) - return p; - - return NULL; -} - -static int -parsetime(const char *s, time_t *tp) -{ - time_t t; - struct tm tm; - char *p; - int tzoffset, r; - - if (!(p = parseformat(s, &tm))) - return -1; - - /* TODO - parse time format to tm - get timezone offset - convert tm to UNIX timestamp (timegm) - */ - - if (gettimetz(p, &tzoffset) == -1) - return -1; - tm.tm_isdst = -1; /* don't use DST */ - if ((t = mktime(&tm)) == -1) /* error */ - return -1; - t -= tzoffset; - if (tp) - *tp = t; - return 0; -} - /* Print text, encode TABs, newlines and '\', remove other whitespace. * Remove leading and trailing whitespace. */ static void @@ -423,10 +290,236 @@ string_print_trimmed(String *s) } } +long long +datetounix(long long year, int mon, int day, int hour, int min, int sec) +{ + static const int secs_through_month[] = { + 0, 31 * 86400, 59 * 86400, 90 * 86400, + 120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400, + 243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 }; + int is_leap = 0, cycles, centuries = 0, leaps = 0, rem; + long long t; + + if (year - 2ULL <= 136) { + leaps = (year - 68) >> 2; + if (!((year - 68) & 3)) { + leaps--; + is_leap = 1; + } else { + is_leap = 0; + } + t = 31536000 * (year - 70) + 86400 * leaps; + } else { + cycles = (year - 100) / 400; + rem = (year - 100) % 400; + if (rem < 0) { + cycles--; + rem += 400; + } + if (!rem) { + is_leap = 1; + } else { + if (rem >= 300) + centuries = 3, rem -= 300; + else if (rem >= 200) + centuries = 2, rem -= 200; + else if (rem >= 100) + centuries = 1, rem -= 100; + if (rem) { + leaps = rem / 4U; + rem %= 4U; + is_leap = !rem; + } + } + leaps += 97 * cycles + 24 * centuries - is_leap; + t = (year - 100) * 31536000LL + leaps * 86400LL + 946684800 + 86400; + } + t += secs_through_month[mon]; + if (is_leap && mon >= 2) + t += 86400; + t += 86400LL * (day - 1); + t += 3600LL * hour; + t += 60LL * min; + t += sec; + + return t; +} + +/* Get timezone from string, return time offset in seconds from UTC. + * NOTE: only parses timezones in RFC-822, other timezones are ambiguous + * anyway. If needed you can add some yourself, like "cest", "cet" etc. */ +static long long +gettzoffset(const char *s) +{ + static struct { + char *name; + size_t len; + const int offhour; + } tzones[] = { + { STRP("A"), -1 * 3600 }, + { STRP("CDT"), -5 * 3600 }, + { STRP("CST"), -6 * 3600 }, + { STRP("EDT"), -4 * 3600 }, + { STRP("EST"), -5 * 3600 }, + { STRP("GMT"), 0 }, + { STRP("MDT"), -6 * 3600 }, + { STRP("MST"), -7 * 3600 }, + { STRP("PDT"), -7 * 3600 }, + { STRP("PST"), -8 * 3600 }, + { STRP("UT"), 0 }, + { STRP("UTC"), 0 }, + { STRP("M"), -2 * 3600 }, + { STRP("N"), 1 * 3600 }, + { STRP("Y"), 12 * 3600 }, + { STRP("Z"), 0 }, + }; + const char *p; + int tzhour = 0, tzmin = 0; + size_t i, namelen; + + for (; *s && isspace((int)*s); s++) + ; + switch (s[0]) { + case '-': /* offset */ + case '+': + for (i = 0, p = s + 1; i < 2 && *p && isdigit(*p); i++, p++) + tzhour = (tzhour * 10) + (*p - '0'); + if (*p && !isdigit(*p)) + p++; + for (i = 0; i < 2 && *p && isdigit(*p); i++, p++) + tzmin = (tzmin * 10) + (*p - '0'); + return ((tzhour * 3600) + (tzmin * 60)) * (s[0] == '-' ? -1 : 1); + default: /* timezone name */ + for (i = 0; *s && isalpha((int)s[i]); i++) + ; + namelen = i; /* end of name */ + /* optimization: these are always non-matching */ + if (namelen < 1 || namelen > 3) + return 0; + /* compare tz and adjust offset relative to UTC */ + for (i = 0; i < sizeof(tzones) / sizeof(*tzones); i++) { + if (tzones[i].len == namelen && + !strncmp(s, tzones[i].name, namelen)) + return tzones[i].offhour; + } + } + return 0; +} + +static int +parsetime(const char *s, time_t *tp) +{ + static struct { + char *name; + int len; + } mons[] = { + { STRP("January"), }, + { STRP("February"), }, + { STRP("March"), }, + { STRP("April"), }, + { STRP("May"), }, + { STRP("June"), }, + { STRP("July"), }, + { STRP("August"), }, + { STRP("September"), }, + { STRP("October"), }, + { STRP("November"), }, + { STRP("December"), }, + }; + const char *end = NULL; + int va[6], i, j, v, vi; + size_t m; + + for (; *s && isspace((int)*s); s++) + ; + if (!isdigit((int)*s) && !isalpha((int)*s)) + return -1; + + memset(&va, 0, sizeof(va)); + if (isdigit((int)*s)) { + /* format "%Y-%m-%d %H:%M:%S" or "%Y-%m-%dT%H:%M:%S" */ + vi = 0; +time: + for (; *s && vi < 6; vi++) { + for (i = 0, v = 0; *s && i < 4 && isdigit((int)*s); s++, i++) + v = (v * 10) + (*s - '0'); + va[vi] = v; + if ((vi < 2 && *s == '-') || + (vi == 2 && (*s == 'T' || isspace((int)*s))) || + (vi > 2 && *s == ':')) + s++; + } + /* TODO: only if seconds are parsed (vi == 5)? */ + /* skip milliseconds for: %Y-%m-%dT%H:%M:%S.000Z */ + if (*s == '.') { + for (s++; *s && isdigit((int)*s); s++) + ; + } + end = s; + } else if (isalpha((int)*s)) { + /* format: "%a, %d %b %Y %H:%M:%S" */ + /* parse "%a, %d %b %Y " part, then use time parsing as above */ + for (; *s && isalpha((int)*s); s++) + ; + for (; *s && isspace((int)*s); s++) + ; + if (*s != ',') + return -1; + for (s++; *s && isspace((int)*s); s++) + ; + for (v = 0, i = 0; *s && i < 4 && isdigit((int)*s); s++, i++) + v = (v * 10) + (*s - '0'); + va[2] = v; /* day */ + for (; *s && isspace((int)*s); s++) + ; + /* end of word month */ + for (j = 0; *s && isalpha((int)s[j]); j++) + ; + /* check month name */ + if (j < 3 || j > 9) + return -1; /* month cannot match */ + for (m = 0; m < sizeof(mons) / sizeof(*mons); m++) { + /* abbreviation (3 length) or long name */ + if ((j == 3 || j == mons[m].len) && + !strncasecmp(mons[m].name, s, j)) { + va[1] = m + 1; + s += j; + break; + } + } + if (m >= 12) + return -1; /* no month found */ + for (; *s && isspace((int)*s); s++) + ; + for (v = 0, i = 0; *s && i < 4 && isdigit((int)*s); s++, i++) + v = (v * 10) + (*s - '0'); + va[0] = v; /* year */ + for (; *s && isspace((int)*s); s++) + ; + /* parse regular time, see above */ + vi = 3; + goto time; + } + + /* invalid range */ + if (va[0] < 0 || va[0] > 9999 || + va[1] < 1 || va[1] > 12 || + va[2] < 1 || va[2] > 31 || + va[3] < 0 || va[3] > 23 || + va[4] < 0 || va[4] > 59 || + va[5] < 0 || va[5] > 59) + return -1; + + if (tp) + *tp = datetounix(va[0] - 1900, va[1] - 1, va[2], va[3], va[4], va[5]) - + gettzoffset(end); + return 0; +} + static void printfields(void) { - char link[4096], timebuf[64]; + char link[4096]; time_t t; int r = -1; @@ -697,10 +790,6 @@ main(int argc, char *argv[]) if (argc > 1) baseurl = argv[1]; - if (setenv("TZ", "UTC", 1) == -1) - err(1, "setenv"); - tzset(); - parser.xmlattr = xml_handler_attr; parser.xmlattrend = xml_handler_attr_end; parser.xmlattrstart = xml_handler_attr_start; diff --git a/sfeed_frames.c b/sfeed_frames.c @@ -108,6 +108,7 @@ printfeed(FILE *fpitems, FILE *fpin, struct feed *f) ssize_t linelen; FILE *fpcontent = NULL; unsigned int isnew; + struct tm *tm; time_t parsedtime; int fd, r; @@ -153,6 +154,9 @@ printfeed(FILE *fpitems, FILE *fpin, struct feed *f) parsedtime = 0; strtotime(fields[FieldUnixTimestamp], &parsedtime); + if (!(tm = localtime(&parsedtime))) + err(1, "localtime"); + /* content file doesn't exist yet and has error? */ if ((fd = open(filepath, O_CREAT | O_EXCL | O_WRONLY, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) == -1) { @@ -208,7 +212,11 @@ printfeed(FILE *fpitems, FILE *fpin, struct feed *f) else fputs("<tr>", fpitems); fputs("<td nowrap valign=\"top\">", fpitems); - xmlencode(fields[FieldTimeFormatted], fpitems); + + fprintf(fpitems, "%04d-%02d-%02d %02d:%02d ", + tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, + tm->tm_hour, tm->tm_min); + fputs("</td><td nowrap valign=\"top\">", fpitems); if (isnew) fputs("<b><u>", fpitems); diff --git a/sfeed_html.c b/sfeed_html.c @@ -17,6 +17,7 @@ static void printfeed(FILE *fp, struct feed *f) { char *fields[FieldLast]; + struct tm *tm; time_t parsedtime; unsigned int islink, isnew; ssize_t linelen; @@ -37,8 +38,11 @@ printfeed(FILE *fp, struct feed *f) line[--linelen] = '\0'; if (!parseline(line, fields)) break; + parsedtime = 0; strtotime(fields[FieldUnixTimestamp], &parsedtime); + if (!(tm = localtime(&parsedtime))) + err(1, "localtime"); isnew = (parsedtime >= comparetime) ? 1 : 0; islink = (fields[FieldLink][0] != '\0') ? 1 : 0; @@ -52,7 +56,9 @@ printfeed(FILE *fp, struct feed *f) else fputs("<tr>", stdout); fputs("<td nowrap valign=\"top\">", stdout); - fputs(fields[FieldTimeFormatted], stdout); + fprintf(stdout, "%04d-%02d-%02d&nbsp;%02d:%02d", + tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, + tm->tm_hour, tm->tm_min); fputs("</td><td nowrap valign=\"top\">", stdout); if (isnew) fputs("<b><u>", stdout); diff --git a/sfeed_plain.c b/sfeed_plain.c @@ -15,6 +15,7 @@ static void printfeed(FILE *fp, const char *feedname) { char *fields[FieldLast]; + struct tm *tm; time_t parsedtime; ssize_t linelen; @@ -26,6 +27,8 @@ printfeed(FILE *fp, const char *feedname) parsedtime = 0; strtotime(fields[FieldUnixTimestamp], &parsedtime); + if (!(tm = localtime(&parsedtime))) + err(1, "localtime"); if (parsedtime >= comparetime) fputs("N ", stdout); @@ -33,8 +36,11 @@ printfeed(FILE *fp, const char *feedname) fputs(" ", stdout); if (feedname[0]) - printf("%-15.15s ", feedname); - printf("%-30.30s ", fields[FieldTimeFormatted]); + printf("%-15.15s ", feedname); + + fprintf(stdout, "%04d-%02d-%02d %02d:%02d ", + tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, + tm->tm_hour, tm->tm_min); printutf8pad(stdout, fields[FieldTitle], 70, ' '); printf(" %s\n", fields[FieldLink]); } diff --git a/sfeed_tail.c b/sfeed_tail.c @@ -35,6 +35,8 @@ printfeed(FILE *fp, const char *feedname) uint32_t hash; int uniq; ssize_t linelen; + time_t parsedtime; + struct tm *tm; while ((linelen = getline(&line, &linesize, fp)) > 0) { if (line[linelen - 1] == '\n') @@ -65,9 +67,18 @@ printfeed(FILE *fp, const char *feedname) continue; if (!parseline(line, fields)) break; + + parsedtime = 0; + strtotime(fields[FieldUnixTimestamp], &parsedtime); + if (!(tm = localtime(&parsedtime))) + err(1, "localtime"); + if (feedname[0]) - printf("%-15.15s %-30.30s", - feedname, fields[FieldTimeFormatted]); + printf("%-15.15s ", feedname); + + printf("%04d-%02d-%02d %02d:%02d ", + tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, + tm->tm_hour, tm->tm_min); printutf8pad(stdout, fields[FieldTitle], 70, ' '); printf(" %s\n", fields[FieldLink]); }