util.c - sfeed - Simple RSS and Atom feed parser

	sfeed Simple RSS and Atom feed parser
	git clone https://git.sinitax.com/codemadness/sfeed
	Log \| Files \| Refs \| README \| LICENSE \| Upstream \| sfeed.txt
util.c (9301B)
      1#include <errno.h>
      2#include <stdarg.h>
      3#include <stdio.h>
      4#include <stdlib.h>
      5#include <string.h>
      6#include <wchar.h>
      7
      8#include "util.h"
      9
     10/* print to stderr, print error message of errno and exit().
     11   Unlike BSD err() it does not prefix __progname */
     12__dead void
     13err(int exitstatus, const char *fmt, ...)
     14{
     15	va_list ap;
     16	int saved_errno;
     17
     18	saved_errno = errno;
     19
     20	if (fmt) {
     21		va_start(ap, fmt);
     22		vfprintf(stderr, fmt, ap);
     23		va_end(ap);
     24		fputs(": ", stderr);
     25	}
     26	fprintf(stderr, "%s\n", strerror(saved_errno));
     27
     28	exit(exitstatus);
     29}
     30
     31/* print to stderr and exit().
     32   Unlike BSD errx() it does not prefix __progname */
     33__dead void
     34errx(int exitstatus, const char *fmt, ...)
     35{
     36	va_list ap;
     37
     38	if (fmt) {
     39		va_start(ap, fmt);
     40		vfprintf(stderr, fmt, ap);
     41		va_end(ap);
     42	}
     43	fputs("\n", stderr);
     44
     45	exit(exitstatus);
     46}
     47
     48/* Handle read or write errors for a FILE * stream */
     49void
     50checkfileerror(FILE *fp, const char *name, int mode)
     51{
     52	if (mode == 'r' && ferror(fp))
     53		errx(1, "read error: %s", name);
     54	else if (mode == 'w' && (fflush(fp) || ferror(fp)))
     55		errx(1, "write error: %s", name);
     56}
     57
     58/* strcasestr() included for portability */
     59char *
     60strcasestr(const char *h, const char *n)
     61{
     62	size_t i;
     63
     64	if (!n[0])
     65		return (char *)h;
     66
     67	for (; *h; ++h) {
     68		for (i = 0; n[i] && TOLOWER((unsigned char)n[i]) ==
     69		            TOLOWER((unsigned char)h[i]); ++i)
     70			;
     71		if (n[i] == '\0')
     72			return (char *)h;
     73	}
     74
     75	return NULL;
     76}
     77
     78/* Check if string has a non-empty scheme / protocol part. */
     79int
     80uri_hasscheme(const char *s)
     81{
     82	const char *p = s;
     83
     84	for (; ISALPHA((unsigned char)*p) || ISDIGIT((unsigned char)*p) ||
     85		       *p == '+' || *p == '-' || *p == '.'; p++)
     86		;
     87	/* scheme, except if empty and starts with ":" then it is a path */
     88	return (*p == ':' && p != s);
     89}
     90
     91/* Parse URI string `s` into an uri structure `u`.
     92   Returns 0 on success or -1 on failure */
     93int
     94uri_parse(const char *s, struct uri *u)
     95{
     96	const char *p = s;
     97	char *endptr;
     98	size_t i;
     99	long l;
    100
    101	u->proto[0] = u->userinfo[0] = u->host[0] = u->port[0] = '\0';
    102	u->path[0] = u->query[0] = u->fragment[0] = '\0';
    103
    104	/* protocol-relative */
    105	if (*p == '/' && *(p + 1) == '/') {
    106		p += 2; /* skip "//" */
    107		goto parseauth;
    108	}
    109
    110	/* scheme / protocol part */
    111	for (; ISALPHA((unsigned char)*p) || ISDIGIT((unsigned char)*p) ||
    112		       *p == '+' || *p == '-' || *p == '.'; p++)
    113		;
    114	/* scheme, except if empty and starts with ":" then it is a path */
    115	if (*p == ':' && p != s) {
    116		if (*(p + 1) == '/' && *(p + 2) == '/')
    117			p += 3; /* skip "://" */
    118		else
    119			p++; /* skip ":" */
    120
    121		if ((size_t)(p - s) >= sizeof(u->proto))
    122			return -1; /* protocol too long */
    123		memcpy(u->proto, s, p - s);
    124		u->proto[p - s] = '\0';
    125
    126		if (*(p - 1) != '/')
    127			goto parsepath;
    128	} else {
    129		p = s; /* no scheme format, reset to start */
    130		goto parsepath;
    131	}
    132
    133parseauth:
    134	/* userinfo (username:password) */
    135	i = strcspn(p, "@/?#");
    136	if (p[i] == '@') {
    137		if (i >= sizeof(u->userinfo))
    138			return -1; /* userinfo too long */
    139		memcpy(u->userinfo, p, i);
    140		u->userinfo[i] = '\0';
    141		p += i + 1;
    142	}
    143
    144	/* IPv6 address */
    145	if (*p == '[') {
    146		/* bracket not found, host too short or too long */
    147		i = strcspn(p, "]");
    148		if (p[i] != ']' || i < 3)
    149			return -1;
    150		i++; /* including "]" */
    151	} else {
    152		/* domain / host part, skip until port, path or end. */
    153		i = strcspn(p, ":/?#");
    154	}
    155	if (i >= sizeof(u->host))
    156		return -1; /* host too long */
    157	memcpy(u->host, p, i);
    158	u->host[i] = '\0';
    159	p += i;
    160
    161	/* port */
    162	if (*p == ':') {
    163		p++;
    164		if ((i = strcspn(p, "/?#")) >= sizeof(u->port))
    165			return -1; /* port too long */
    166		memcpy(u->port, p, i);
    167		u->port[i] = '\0';
    168		/* check for valid port: range 1 - 65535, may be empty */
    169		errno = 0;
    170		l = strtol(u->port, &endptr, 10);
    171		if (i && (errno || *endptr || l <= 0 || l > 65535))
    172			return -1;
    173		p += i;
    174	}
    175
    176parsepath:
    177	/* path */
    178	if ((i = strcspn(p, "?#")) >= sizeof(u->path))
    179		return -1; /* path too long */
    180	memcpy(u->path, p, i);
    181	u->path[i] = '\0';
    182	p += i;
    183
    184	/* query */
    185	if (*p == '?') {
    186		p++;
    187		if ((i = strcspn(p, "#")) >= sizeof(u->query))
    188			return -1; /* query too long */
    189		memcpy(u->query, p, i);
    190		u->query[i] = '\0';
    191		p += i;
    192	}
    193
    194	/* fragment */
    195	if (*p == '#') {
    196		p++;
    197		if ((i = strlen(p)) >= sizeof(u->fragment))
    198			return -1; /* fragment too long */
    199		memcpy(u->fragment, p, i);
    200		u->fragment[i] = '\0';
    201	}
    202
    203	return 0;
    204}
    205
    206/* Transform and try to make the URI `u` absolute using base URI `b` into `a`.
    207   Follows some of the logic from "RFC 3986 - 5.2.2. Transform References".
    208   Returns 0 on success, -1 on error or truncation. */
    209int
    210uri_makeabs(struct uri *a, struct uri *u, struct uri *b)
    211{
    212	char *p;
    213	int c;
    214
    215	strlcpy(a->fragment, u->fragment, sizeof(a->fragment));
    216
    217	if (u->proto[0] || u->host[0]) {
    218		strlcpy(a->proto, u->proto[0] ? u->proto : b->proto, sizeof(a->proto));
    219		strlcpy(a->host, u->host, sizeof(a->host));
    220		strlcpy(a->userinfo, u->userinfo, sizeof(a->userinfo));
    221		strlcpy(a->host, u->host, sizeof(a->host));
    222		strlcpy(a->port, u->port, sizeof(a->port));
    223		strlcpy(a->path, u->path, sizeof(a->path));
    224		strlcpy(a->query, u->query, sizeof(a->query));
    225		return 0;
    226	}
    227
    228	strlcpy(a->proto, b->proto, sizeof(a->proto));
    229	strlcpy(a->host, b->host, sizeof(a->host));
    230	strlcpy(a->userinfo, b->userinfo, sizeof(a->userinfo));
    231	strlcpy(a->host, b->host, sizeof(a->host));
    232	strlcpy(a->port, b->port, sizeof(a->port));
    233
    234	if (!u->path[0]) {
    235		strlcpy(a->path, b->path, sizeof(a->path));
    236	} else if (u->path[0] == '/') {
    237		strlcpy(a->path, u->path, sizeof(a->path));
    238	} else {
    239		a->path[0] = (b->host[0] && b->path[0] != '/') ? '/' : '\0';
    240		a->path[1] = '\0';
    241
    242		if ((p = strrchr(b->path, '/'))) {
    243			c = *(++p);
    244			*p = '\0'; /* temporary NUL-terminate */
    245			if (strlcat(a->path, b->path, sizeof(a->path)) >= sizeof(a->path))
    246				return -1;
    247			*p = c; /* restore */
    248		}
    249		if (strlcat(a->path, u->path, sizeof(a->path)) >= sizeof(a->path))
    250			return -1;
    251	}
    252
    253	if (u->path[0] || u->query[0])
    254		strlcpy(a->query, u->query, sizeof(a->query));
    255	else
    256		strlcpy(a->query, b->query, sizeof(a->query));
    257
    258	return 0;
    259}
    260
    261int
    262uri_format(char *buf, size_t bufsiz, struct uri *u)
    263{
    264	return snprintf(buf, bufsiz, "%s%s%s%s%s%s%s%s%s%s%s%s",
    265		u->proto,
    266		u->userinfo[0] ? u->userinfo : "",
    267		u->userinfo[0] ? "@" : "",
    268		u->host,
    269		u->port[0] ? ":" : "",
    270		u->port,
    271		u->host[0] && u->path[0] && u->path[0] != '/' ? "/" : "",
    272		u->path,
    273		u->query[0] ? "?" : "",
    274		u->query,
    275		u->fragment[0] ? "#" : "",
    276		u->fragment);
    277}
    278
    279/* Splits fields in the line buffer by replacing TAB separators with NUL ('\0')
    280 * terminators and assign these fields as pointers. If there are less fields
    281 * than expected then the field is an empty string constant. */
    282void
    283parseline(char *line, char *fields[FieldLast])
    284{
    285	char *prev, *s;
    286	size_t i;
    287
    288	for (prev = line, i = 0;
    289	    (s = strchr(prev, '\t')) && i < FieldLast - 1;
    290	    i++) {
    291		*s = '\0';
    292		fields[i] = prev;
    293		prev = s + 1;
    294	}
    295	fields[i++] = prev;
    296	/* make non-parsed fields empty. */
    297	for (; i < FieldLast; i++)
    298		fields[i] = "";
    299}
    300
    301/* Parse time to time_t, assumes time_t is signed, ignores fractions. */
    302int
    303strtotime(const char *s, time_t *t)
    304{
    305	long long l;
    306	char *e;
    307
    308	errno = 0;
    309	l = strtoll(s, &e, 10);
    310	if (errno || *s == '\0' || *e)
    311		return -1;
    312
    313	/* NOTE: the type long long supports the 64-bit range. If time_t is
    314	   64-bit it is "2038-ready", otherwise it is truncated/wrapped. */
    315	if (t)
    316		*t = (time_t)l;
    317
    318	return 0;
    319}
    320
    321time_t
    322getcomparetime(void)
    323{
    324	time_t now, t;
    325	char *p;
    326
    327	if ((now = time(NULL)) == (time_t)-1)
    328		return (time_t)-1;
    329
    330	if ((p = getenv("SFEED_NEW_AGE"))) {
    331		if (strtotime(p, &t) == -1)
    332			return (time_t)-1;
    333		return now - t;
    334	}
    335
    336	return now - 86400; /* 1 day is old news */
    337}
    338
    339/* Escape characters below as HTML 2.0 / XML 1.0. */
    340void
    341xmlencode(const char *s, FILE *fp)
    342{
    343	for (; *s; ++s) {
    344		switch (*s) {
    345		case '<':  fputs("&lt;",   fp); break;
    346		case '>':  fputs("&gt;",   fp); break;
    347		case '\'': fputs("&#39;",  fp); break;
    348		case '&':  fputs("&amp;",  fp); break;
    349		case '"':  fputs("&quot;", fp); break;
    350		default:   putc(*s, fp);
    351		}
    352	}
    353}
    354
    355/* print `len` columns of characters. If string is shorter pad the rest with
    356 * characters `pad`. */
    357void
    358printutf8pad(FILE *fp, const char *s, size_t len, int pad)
    359{
    360	wchar_t wc;
    361	size_t col = 0, i, slen;
    362	int inc, rl, w;
    363
    364	if (!len)
    365		return;
    366
    367	slen = strlen(s);
    368	for (i = 0; i < slen; i += inc) {
    369		inc = 1; /* next byte */
    370		if ((unsigned char)s[i] < 32) {
    371			continue; /* skip control characters */
    372		} else if ((unsigned char)s[i] >= 127) {
    373			rl = mbtowc(&wc, s + i, slen - i < 4 ? slen - i : 4);
    374			inc = rl;
    375			if (rl < 0) {
    376				mbtowc(NULL, NULL, 0); /* reset state */
    377				inc = 1; /* invalid, seek next byte */
    378				w = 1; /* replacement char is one width */
    379			} else if ((w = wcwidth(wc)) == -1) {
    380				continue;
    381			}
    382
    383			if (col + w > len || (col + w == len && s[i + inc])) {
    384				fputs(PAD_TRUNCATE_SYMBOL, fp); /* ellipsis */
    385				col++;
    386				break;
    387			} else if (rl < 0) {
    388				fputs(UTF_INVALID_SYMBOL, fp); /* replacement */
    389				col++;
    390				continue;
    391			}
    392			fwrite(&s[i], 1, rl, fp);
    393			col += w;
    394		} else {
    395			/* optimization: simple ASCII character */
    396			if (col + 1 > len || (col + 1 == len && s[i + 1])) {
    397				fputs(PAD_TRUNCATE_SYMBOL, fp); /* ellipsis */
    398				col++;
    399				break;
    400			}
    401			putc(s[i], fp);
    402			col++;
    403		}
    404
    405	}
    406	for (; col < len; ++col)
    407		putc(pad, fp);
    408}