sfeed.c - sfeed - Simple RSS and Atom feed parser

	sfeed Simple RSS and Atom feed parser
	git clone https://git.sinitax.com/codemadness/sfeed
	Log \| Files \| Refs \| README \| LICENSE \| Upstream \| sfeed.txt
sfeed.c (30040B)
      1#include <errno.h>
      2#include <stdint.h>
      3#include <stdio.h>
      4#include <stdlib.h>
      5#include <string.h>
      6#include <strings.h>
      7
      8#include "util.h"
      9#include "xml.h"
     10
     11#define ISINCONTENT(ctx)  ((ctx).iscontent && !((ctx).iscontenttag))
     12#define ISCONTENTTAG(ctx) (!((ctx).iscontent) && (ctx).iscontenttag)
     13
     14/* these feed fields support multiple separated values */
     15#define ISFEEDFIELDMULTI(t) ((t) == FeedFieldCategory)
     16
     17/* string and byte-length */
     18#define STRP(s)           s,sizeof(s)-1
     19
     20enum FeedType {
     21	FeedTypeNone = 0,
     22	FeedTypeRSS  = 1,
     23	FeedTypeAtom = 2
     24};
     25
     26enum ContentType {
     27	ContentTypeNone  = 0,
     28	ContentTypePlain = 1,
     29	ContentTypeHTML  = 2
     30};
     31static const char *contenttypes[] = { "", "plain", "html" };
     32
     33/* String data / memory pool */
     34typedef struct string {
     35	char   *data;   /* data */
     36	size_t  len;    /* string length */
     37	size_t  bufsiz; /* allocated size */
     38} String;
     39
     40/* NOTE: the order of these fields (content, date, author) indicate the
     41 *       priority to use them, from least important to high. */
     42enum TagId {
     43	TagUnknown = 0,
     44	/* RSS */
     45	RSSTagDcdate, RSSTagPubdate, /* creation date has higher priority */
     46	RSSTagTitle,
     47	RSSTagMediaDescription, RSSTagDescription, RSSTagContentEncoded,
     48	RSSTagGuid,
     49	RSSTagGuidPermalinkFalse,
     50	RSSTagGuidPermalinkTrue,
     51	/* must be defined after GUID, because it can be a link (isPermaLink) */
     52	RSSTagLink,
     53	RSSTagEnclosure,
     54	RSSTagAuthor, RSSTagDccreator,
     55	RSSTagCategory,
     56	/* Atom */
     57	/* creation date has higher priority */
     58	AtomTagModified, AtomTagUpdated, AtomTagIssued, AtomTagPublished,
     59	AtomTagTitle,
     60	AtomTagMediaDescription, AtomTagSummary, AtomTagContent,
     61	AtomTagId,
     62	AtomTagLink,
     63	AtomTagLinkAlternate,
     64	AtomTagLinkEnclosure,
     65	AtomTagAuthor, AtomTagAuthorName,
     66	AtomTagCategory,
     67	TagLast
     68};
     69
     70typedef struct feedtag {
     71	char       *name; /* name of tag to match */
     72	size_t      len;  /* len of `name` */
     73	enum TagId  id;   /* unique ID */
     74} FeedTag;
     75
     76typedef struct field {
     77	String     str;
     78	enum TagId tagid; /* tagid set previously, used for tag priority */
     79} FeedField;
     80
     81enum {
     82	FeedFieldTime = 0, FeedFieldTitle, FeedFieldLink, FeedFieldContent,
     83	FeedFieldId, FeedFieldAuthor, FeedFieldEnclosure, FeedFieldCategory,
     84	FeedFieldLast
     85};
     86
     87typedef struct feedcontext {
     88	String          *field;        /* current FeedItem field String */
     89	FeedField        fields[FeedFieldLast]; /* data for current item */
     90	FeedTag          tag;          /* unique current parsed tag */
     91	int              iscontent;    /* in content data */
     92	int              iscontenttag; /* in content tag */
     93	enum ContentType contenttype;  /* content-type for item */
     94	enum FeedType    feedtype;
     95	int              attrcount;    /* count item HTML element attributes */
     96} FeedContext;
     97
     98static long long datetounix(long long, int, int, int, int, int);
     99static FeedTag * gettag(enum FeedType, const char *, size_t);
    100static long gettzoffset(const char *);
    101static int  isattr(const char *, size_t, const char *, size_t);
    102static int  istag(const char *, size_t, const char *, size_t);
    103static int  parsetime(const char *, long long *);
    104static void printfields(void);
    105static void string_append(String *, const char *, size_t);
    106static void string_buffer_realloc(String *, size_t);
    107static void string_clear(String *);
    108static void string_print_encoded(String *);
    109static void string_print_timestamp(String *);
    110static void string_print_trimmed(String *);
    111static void string_print_trimmed_multi(String *);
    112static void string_print_uri(String *);
    113static void xmlattr(XMLParser *, const char *, size_t, const char *, size_t,
    114                    const char *, size_t);
    115static void xmlattrentity(XMLParser *, const char *, size_t, const char *,
    116                          size_t, const char *, size_t);
    117static void xmlattrend(XMLParser *, const char *, size_t, const char *,
    118                       size_t);
    119static void xmlattrstart(XMLParser *, const char *, size_t, const char *,
    120                         size_t);
    121static void xmldata(XMLParser *, const char *, size_t);
    122static void xmldataentity(XMLParser *, const char *, size_t);
    123static void xmltagend(XMLParser *, const char *, size_t, int);
    124static void xmltagstart(XMLParser *, const char *, size_t);
    125static void xmltagstartparsed(XMLParser *, const char *, size_t, int);
    126
    127/* map tag name to TagId type */
    128/* RSS, must be alphabetical order */
    129static const FeedTag rsstags[] = {
    130	{ STRP("author"),            RSSTagAuthor            },
    131	{ STRP("category"),          RSSTagCategory          },
    132	{ STRP("content:encoded"),   RSSTagContentEncoded    },
    133	{ STRP("dc:creator"),        RSSTagDccreator         },
    134	{ STRP("dc:date"),           RSSTagDcdate            },
    135	{ STRP("description"),       RSSTagDescription       },
    136	/* RSS: <enclosure url="" />, Atom has <link rel="enclosure" /> */
    137	{ STRP("enclosure"),         RSSTagEnclosure         },
    138	{ STRP("guid"),              RSSTagGuid              },
    139	{ STRP("link"),              RSSTagLink              },
    140	{ STRP("media:description"), RSSTagMediaDescription  },
    141	{ STRP("pubdate"),           RSSTagPubdate           },
    142	{ STRP("title"),             RSSTagTitle             }
    143};
    144
    145/* Atom, must be alphabetical order */
    146static const FeedTag atomtags[] = {
    147	{ STRP("author"),            AtomTagAuthor           },
    148	{ STRP("category"),          AtomTagCategory         },
    149	{ STRP("content"),           AtomTagContent          },
    150	{ STRP("id"),                AtomTagId               },
    151	{ STRP("issued"),            AtomTagIssued           }, /* Atom 0.3 */
    152	/* Atom: <link href="" />, RSS has <link></link> */
    153	{ STRP("link"),              AtomTagLink             },
    154	{ STRP("media:description"), AtomTagMediaDescription },
    155	{ STRP("modified"),          AtomTagModified         }, /* Atom 0.3 */
    156	{ STRP("published"),         AtomTagPublished        },
    157	{ STRP("summary"),           AtomTagSummary          },
    158	{ STRP("title"),             AtomTagTitle            },
    159	{ STRP("updated"),           AtomTagUpdated          }
    160};
    161
    162/* special case: nested <author><name> */
    163static const FeedTag atomtagauthor = { STRP("author"), AtomTagAuthor };
    164static const FeedTag atomtagauthorname = { STRP("name"), AtomTagAuthorName };
    165
    166/* reference to no / unknown tag */
    167static const FeedTag notag = { STRP(""), TagUnknown };
    168
    169/* map TagId type to RSS/Atom field, all tags must be defined */
    170static const int fieldmap[TagLast] = {
    171	[TagUnknown]               = -1,
    172	/* RSS */
    173	[RSSTagDcdate]             = FeedFieldTime,
    174	[RSSTagPubdate]            = FeedFieldTime,
    175	[RSSTagTitle]              = FeedFieldTitle,
    176	[RSSTagMediaDescription]   = FeedFieldContent,
    177	[RSSTagDescription]        = FeedFieldContent,
    178	[RSSTagContentEncoded]     = FeedFieldContent,
    179	[RSSTagGuid]               = -1,
    180	[RSSTagGuidPermalinkFalse] = FeedFieldId,
    181	[RSSTagGuidPermalinkTrue]  = FeedFieldId, /* special-case: both a link and an id */
    182	[RSSTagLink]               = FeedFieldLink,
    183	[RSSTagEnclosure]          = FeedFieldEnclosure,
    184	[RSSTagAuthor]             = FeedFieldAuthor,
    185	[RSSTagDccreator]          = FeedFieldAuthor,
    186	[RSSTagCategory]           = FeedFieldCategory,
    187	/* Atom */
    188	[AtomTagModified]          = FeedFieldTime,
    189	[AtomTagUpdated]           = FeedFieldTime,
    190	[AtomTagIssued]            = FeedFieldTime,
    191	[AtomTagPublished]         = FeedFieldTime,
    192	[AtomTagTitle]             = FeedFieldTitle,
    193	[AtomTagMediaDescription]  = FeedFieldContent,
    194	[AtomTagSummary]           = FeedFieldContent,
    195	[AtomTagContent]           = FeedFieldContent,
    196	[AtomTagId]                = FeedFieldId,
    197	[AtomTagLink]              = -1,
    198	[AtomTagLinkAlternate]     = FeedFieldLink,
    199	[AtomTagLinkEnclosure]     = FeedFieldEnclosure,
    200	[AtomTagAuthor]            = -1,
    201	[AtomTagAuthorName]        = FeedFieldAuthor,
    202	[AtomTagCategory]          = FeedFieldCategory
    203};
    204
    205static const int FieldSeparator = '\t';
    206/* separator for multiple values in a field, separator should be 1 byte */
    207static const char FieldMultiSeparator[] = "|";
    208static struct uri baseuri;
    209static const char *baseurl;
    210
    211static FeedContext ctx;
    212static XMLParser parser; /* XML parser state */
    213static String attrispermalink, attrrel, attrtype, tmpstr;
    214
    215static int
    216tagcmp(const void *v1, const void *v2)
    217{
    218	return strcasecmp(((FeedTag *)v1)->name, ((FeedTag *)v2)->name);
    219}
    220
    221/* Unique tagid for parsed tag name. */
    222static FeedTag *
    223gettag(enum FeedType feedtype, const char *name, size_t namelen)
    224{
    225	FeedTag f, *r = NULL;
    226
    227	f.name = (char *)name;
    228
    229	switch (feedtype) {
    230	case FeedTypeRSS:
    231		r = bsearch(&f, rsstags, sizeof(rsstags) / sizeof(rsstags[0]),
    232		        sizeof(rsstags[0]), tagcmp);
    233		break;
    234	case FeedTypeAtom:
    235		r = bsearch(&f, atomtags, sizeof(atomtags) / sizeof(atomtags[0]),
    236		        sizeof(atomtags[0]), tagcmp);
    237		break;
    238	default:
    239		break;
    240	}
    241
    242	return r;
    243}
    244
    245static char *
    246ltrim(const char *s)
    247{
    248	for (; ISSPACE((unsigned char)*s); s++)
    249		;
    250	return (char *)s;
    251}
    252
    253static char *
    254rtrim(const char *s)
    255{
    256	const char *e;
    257
    258	for (e = s + strlen(s); e > s && ISSPACE((unsigned char)*(e - 1)); e--)
    259		;
    260	return (char *)e;
    261}
    262
    263/* Clear string only; don't free, prevents unnecessary reallocation. */
    264static void
    265string_clear(String *s)
    266{
    267	if (s->data)
    268		s->data[0] = '\0';
    269	s->len = 0;
    270}
    271
    272static void
    273string_buffer_realloc(String *s, size_t newlen)
    274{
    275	size_t alloclen;
    276
    277	if (newlen > SIZE_MAX / 2) {
    278		alloclen = SIZE_MAX;
    279	} else {
    280		for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
    281			;
    282	}
    283	if (!(s->data = realloc(s->data, alloclen)))
    284		err(1, "realloc");
    285	s->bufsiz = alloclen;
    286}
    287
    288/* Append data to String, s->data and data may not overlap. */
    289static void
    290string_append(String *s, const char *data, size_t len)
    291{
    292	if (!len)
    293		return;
    294
    295	if (s->len >= SIZE_MAX - len) {
    296		errno = ENOMEM;
    297		err(1, "realloc");
    298	}
    299
    300	/* check if allocation is necessary, never shrink the buffer. */
    301	if (s->len + len >= s->bufsiz)
    302		string_buffer_realloc(s, s->len + len + 1);
    303	memcpy(s->data + s->len, data, len);
    304	s->len += len;
    305	s->data[s->len] = '\0';
    306}
    307
    308/* Print text, encode TABs, newlines and '\', remove other whitespace.
    309 * Remove leading and trailing whitespace. */
    310static void
    311string_print_encoded(String *s)
    312{
    313	const char *p, *e;
    314
    315	if (!s->data || !s->len)
    316		return;
    317
    318	p = ltrim(s->data);
    319	e = rtrim(p);
    320
    321	for (; *p && p != e; p++) {
    322		switch (*p) {
    323		case '\n': putchar('\\'); putchar('n'); break;
    324		case '\\': putchar('\\'); putchar('\\'); break;
    325		case '\t': putchar('\\'); putchar('t'); break;
    326		default:
    327			/* ignore control chars */
    328			if (!ISCNTRL((unsigned char)*p))
    329				putchar(*p);
    330			break;
    331		}
    332	}
    333}
    334
    335static void
    336printtrimmed(const char *s)
    337{
    338	char *p, *e;
    339
    340	p = ltrim(s);
    341	e = rtrim(p);
    342	for (; *p && p != e; p++) {
    343		if (ISSPACE((unsigned char)*p))
    344			putchar(' '); /* any whitespace to space */
    345		else if (!ISCNTRL((unsigned char)*p))
    346			/* ignore other control chars */
    347			putchar(*p);
    348	}
    349}
    350
    351/* Print text, replace TABs, carriage return and other whitespace with ' '.
    352 * Other control chars are removed. Remove leading and trailing whitespace. */
    353static void
    354string_print_trimmed(String *s)
    355{
    356	if (!s->data || !s->len)
    357		return;
    358
    359	printtrimmed(s->data);
    360}
    361
    362/* Print each field with trimmed whitespace, separated by '|'. */
    363static void
    364string_print_trimmed_multi(String *s)
    365{
    366	char *p, *e;
    367	int c;
    368
    369	if (!s->data || !s->len)
    370		return;
    371
    372	for (p = s->data; ; p = e + 1) {
    373		if ((e = strstr(p, FieldMultiSeparator))) {
    374			c = *e;
    375			*e = '\0';
    376			printtrimmed(p);
    377			*e = c; /* restore NUL byte to original character */
    378			fputs(FieldMultiSeparator, stdout);
    379		} else {
    380			printtrimmed(p);
    381			break;
    382		}
    383	}
    384}
    385
    386/* Print URL, if it is a relative URL then it uses the global `baseurl`. */
    387static void
    388printuri(char *s)
    389{
    390	char link[4096], *p, *e;
    391	struct uri newuri, olduri;
    392	int c, r = -1;
    393
    394	p = ltrim(s);
    395	e = rtrim(p);
    396	c = *e;
    397	*e = '\0';
    398
    399	if (baseurl && !uri_hasscheme(p) &&
    400	    uri_parse(p, &olduri) != -1 && !olduri.proto[0] &&
    401	    uri_makeabs(&newuri, &olduri, &baseuri) != -1 && newuri.proto[0])
    402		r = uri_format(link, sizeof(link), &newuri);
    403
    404	if (r >= 0 && (size_t)r < sizeof(link))
    405		printtrimmed(link);
    406	else
    407		printtrimmed(p);
    408
    409	*e = c; /* restore NUL byte to original character */
    410}
    411
    412/* Print URL, if it is a relative URL then it uses the global `baseurl`. */
    413static void
    414string_print_uri(String *s)
    415{
    416	if (!s->data || !s->len)
    417		return;
    418
    419	printuri(s->data);
    420}
    421
    422/* Print as UNIX timestamp, print nothing if the time is empty or invalid. */
    423static void
    424string_print_timestamp(String *s)
    425{
    426	long long t;
    427
    428	if (!s->data || !s->len)
    429		return;
    430
    431	if (parsetime(s->data, &t) != -1)
    432		printf("%lld", t);
    433}
    434
    435/* Convert time fields. Returns a signed (at least) 64-bit UNIX timestamp.
    436   Parameters should be passed as they are in a struct tm:
    437   that is: year = year - 1900, month = month - 1. */
    438static long long
    439datetounix(long long year, int mon, int day, int hour, int min, int sec)
    440{
    441	/* seconds in a month in a regular (non-leap) year */
    442	static const long secs_through_month[] = {
    443		0, 31 * 86400, 59 * 86400, 90 * 86400,
    444		120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
    445		243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
    446	int is_leap = 0, cycles, centuries = 0, leaps = 0, rem;
    447	long long t;
    448
    449	/* optimization: handle common range year 1902 up to and including 2038 */
    450	if (year - 2ULL <= 136) {
    451		/* amount of leap days relative to 1970: every 4 years */
    452		leaps = (year - 68) >> 2;
    453		if (!((year - 68) & 3)) {
    454			leaps--;
    455			is_leap = 1;
    456		} else {
    457			is_leap = 0;
    458		}
    459		t = 31536000 * (year - 70) + (86400 * leaps); /* 365 * 86400 = 31536000 */
    460	} else {
    461		/* general leap year calculation:
    462		   leap years occur mostly every 4 years but every 100 years
    463		   a leap year is skipped unless the year is divisible by 400 */
    464		cycles = (year - 100) / 400;
    465		rem = (year - 100) % 400;
    466		if (rem < 0) {
    467			cycles--;
    468			rem += 400;
    469		}
    470		if (!rem) {
    471			is_leap = 1;
    472		} else {
    473			if (rem >= 300) {
    474				centuries = 3;
    475				rem -= 300;
    476			} else if (rem >= 200) {
    477				centuries = 2;
    478				rem -= 200;
    479			} else if (rem >= 100) {
    480				centuries = 1;
    481				rem -= 100;
    482			}
    483			if (rem) {
    484				leaps = rem / 4U;
    485				rem %= 4U;
    486				is_leap = !rem;
    487			}
    488		}
    489		leaps += (97 * cycles) + (24 * centuries) - is_leap;
    490
    491		/* adjust 8 leap days from 1970 up to and including 2000:
    492		   ((30 * 365) + 8) * 86400 = 946771200 */
    493		t = ((year - 100) * 31536000LL) + (leaps * 86400LL) + 946771200LL;
    494	}
    495	t += secs_through_month[mon];
    496	if (is_leap && mon >= 2)
    497		t += 86400;
    498	t += 86400LL * (day - 1);
    499	t += 3600LL * hour;
    500	t += 60LL * min;
    501	t += sec;
    502
    503	return t;
    504}
    505
    506/* Get timezone from string, return time offset in seconds from UTC.
    507 * NOTE: only parses timezones in RFC 822, many other timezone names are
    508 * ambiguous anyway.
    509 * ANSI and military zones are defined wrong in RFC 822 and are unsupported,
    510 * see note on RFC 2822 4.3 page 32. */
    511static long
    512gettzoffset(const char *s)
    513{
    514	static const struct {
    515		char *name;
    516		int offhour;
    517	} tzones[] = {
    518		{ "CDT", -5 * 3600 },
    519		{ "CST", -6 * 3600 },
    520		{ "EDT", -4 * 3600 },
    521		{ "EST", -5 * 3600 },
    522		{ "MDT", -6 * 3600 },
    523		{ "MST", -7 * 3600 },
    524		{ "PDT", -7 * 3600 },
    525		{ "PST", -8 * 3600 },
    526	};
    527	const char *p;
    528	long tzhour = 0, tzmin = 0;
    529	size_t i;
    530
    531	for (; ISSPACE((unsigned char)*s); s++)
    532		;
    533	switch (*s) {
    534	case '-': /* offset */
    535	case '+':
    536		for (i = 0, p = s + 1; i < 2 && ISDIGIT((unsigned char)*p); i++, p++)
    537			tzhour = (tzhour * 10) + (*p - '0');
    538		if (*p == ':')
    539			p++;
    540		for (i = 0; i < 2 && ISDIGIT((unsigned char)*p); i++, p++)
    541			tzmin = (tzmin * 10) + (*p - '0');
    542		return ((tzhour * 3600) + (tzmin * 60)) * (s[0] == '-' ? -1 : 1);
    543	default: /* timezone name */
    544		for (i = 0; ISALPHA((unsigned char)s[i]); i++)
    545			;
    546		if (i != 3)
    547			return 0;
    548		/* compare timezone and adjust offset relative to UTC */
    549		for (i = 0; i < sizeof(tzones) / sizeof(*tzones); i++) {
    550			if (!memcmp(s, tzones[i].name, 3))
    551				return tzones[i].offhour;
    552		}
    553	}
    554	return 0;
    555}
    556
    557/* Parse time string `s` into the UNIX timestamp `tp`.
    558   Returns 0 on success or -1 on failure. */
    559static int
    560parsetime(const char *s, long long *tp)
    561{
    562	static const struct {
    563		char *name;
    564		int len;
    565	} mons[] = {
    566		{ STRP("January"),   },
    567		{ STRP("February"),  },
    568		{ STRP("March"),     },
    569		{ STRP("April"),     },
    570		{ STRP("May"),       },
    571		{ STRP("June"),      },
    572		{ STRP("July"),      },
    573		{ STRP("August"),    },
    574		{ STRP("September"), },
    575		{ STRP("October"),   },
    576		{ STRP("November"),  },
    577		{ STRP("December"),  },
    578	};
    579	int va[6] = { 0 }, i, j, v, vi;
    580	size_t m;
    581
    582	for (; ISSPACE((unsigned char)*s); s++)
    583		;
    584	if (!ISDIGIT((unsigned char)*s) && !ISALPHA((unsigned char)*s))
    585		return -1;
    586
    587	if (ISDIGIT((unsigned char)s[0]) &&
    588	    ISDIGIT((unsigned char)s[1]) &&
    589	    ISDIGIT((unsigned char)s[2]) &&
    590	    ISDIGIT((unsigned char)s[3])) {
    591		/* formats "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or "%Y%m%d%H%M%S" */
    592		vi = 0;
    593	} else {
    594		/* format: "[%a, ]%d %b %Y %H:%M:%S" */
    595		/* parse "[%a, ]%d %b %Y " part, then use time parsing as above */
    596		for (; ISALPHA((unsigned char)*s); s++)
    597			;
    598		for (; ISSPACE((unsigned char)*s); s++)
    599			;
    600		if (*s == ',')
    601			s++;
    602		for (; ISSPACE((unsigned char)*s); s++)
    603			;
    604		for (v = 0, i = 0; i < 2 && ISDIGIT((unsigned char)*s); s++, i++)
    605			v = (v * 10) + (*s - '0');
    606		va[2] = v; /* day */
    607		for (; ISSPACE((unsigned char)*s); s++)
    608			;
    609		/* end of word month */
    610		for (j = 0; ISALPHA((unsigned char)s[j]); j++)
    611			;
    612		/* check month name */
    613		if (j < 3 || j > 9)
    614			return -1; /* month cannot match */
    615		for (m = 0; m < sizeof(mons) / sizeof(*mons); m++) {
    616			/* abbreviation (3 length) or long name */
    617			if ((j == 3 || j == mons[m].len) &&
    618			    !strncasecmp(mons[m].name, s, j)) {
    619				va[1] = m + 1;
    620				s += j;
    621				break;
    622			}
    623		}
    624		if (m >= 12)
    625			return -1; /* no month found */
    626		for (; ISSPACE((unsigned char)*s); s++)
    627			;
    628		for (v = 0, i = 0; i < 4 && ISDIGIT((unsigned char)*s); s++, i++)
    629			v = (v * 10) + (*s - '0');
    630		/* obsolete short year: RFC 2822 4.3 */
    631		if (i == 2 || i == 3)
    632			v += (i == 2 && v >= 0 && v <= 49) ? 2000 : 1900;
    633		va[0] = v; /* year */
    634		for (; ISSPACE((unsigned char)*s); s++)
    635			;
    636		/* parse only regular time part, see below */
    637		vi = 3;
    638	}
    639
    640	/* parse time parts (and possibly remaining date parts) */
    641	for (; *s && vi < 6; vi++) {
    642		for (i = 0, v = 0; i < ((vi == 0) ? 4 : 2) &&
    643		                   ISDIGIT((unsigned char)*s); s++, i++) {
    644			v = (v * 10) + (*s - '0');
    645		}
    646		va[vi] = v;
    647
    648		if ((vi < 2 && *s == '-') ||
    649		    (vi == 2 && (*s == 'T' || ISSPACE((unsigned char)*s))) ||
    650		    (vi > 2 && *s == ':'))
    651			s++;
    652	}
    653
    654	/* skip milliseconds in for example: "%Y-%m-%dT%H:%M:%S.000Z" */
    655	if (*s == '.') {
    656		for (s++; ISDIGIT((unsigned char)*s); s++)
    657			;
    658	}
    659
    660	/* invalid range */
    661	if (va[0] < 0 || va[0] > 9999 ||
    662	    va[1] < 1 || va[1] > 12 ||
    663	    va[2] < 1 || va[2] > 31 ||
    664	    va[3] < 0 || va[3] > 23 ||
    665	    va[4] < 0 || va[4] > 59 ||
    666	    va[5] < 0 || va[5] > 60) /* allow leap second */
    667		return -1;
    668
    669	*tp = datetounix(va[0] - 1900, va[1] - 1, va[2], va[3], va[4], va[5]) -
    670	      gettzoffset(s);
    671
    672	return 0;
    673}
    674
    675static void
    676printfields(void)
    677{
    678	string_print_timestamp(&ctx.fields[FeedFieldTime].str);
    679	putchar(FieldSeparator);
    680	string_print_trimmed(&ctx.fields[FeedFieldTitle].str);
    681	putchar(FieldSeparator);
    682	string_print_uri(&ctx.fields[FeedFieldLink].str);
    683	putchar(FieldSeparator);
    684	string_print_encoded(&ctx.fields[FeedFieldContent].str);
    685	putchar(FieldSeparator);
    686	fputs(contenttypes[ctx.contenttype], stdout);
    687	putchar(FieldSeparator);
    688	string_print_trimmed(&ctx.fields[FeedFieldId].str);
    689	putchar(FieldSeparator);
    690	string_print_trimmed(&ctx.fields[FeedFieldAuthor].str);
    691	putchar(FieldSeparator);
    692	string_print_uri(&ctx.fields[FeedFieldEnclosure].str);
    693	putchar(FieldSeparator);
    694	string_print_trimmed_multi(&ctx.fields[FeedFieldCategory].str);
    695	putchar('\n');
    696
    697	if (ferror(stdout)) /* check for errors but do not flush */
    698		checkfileerror(stdout, "<stdout>", 'w');
    699}
    700
    701static int
    702istag(const char *name, size_t len, const char *name2, size_t len2)
    703{
    704	return (len == len2 && !strcasecmp(name, name2));
    705}
    706
    707static int
    708isattr(const char *name, size_t len, const char *name2, size_t len2)
    709{
    710	return (len == len2 && !strcasecmp(name, name2));
    711}
    712
    713static void
    714xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
    715	const char *v, size_t vl)
    716{
    717	/* handles transforming inline XML to data */
    718	if (ISINCONTENT(ctx)) {
    719		if (ctx.contenttype == ContentTypeHTML)
    720			xmldata(p, v, vl);
    721		return;
    722	}
    723
    724	if (!ctx.tag.id)
    725		return;
    726
    727	/* content-type may be for Atom: text, xhtml, html or a mime-type.
    728	   for MRSS (media:description): plain, html. */
    729	if (ISCONTENTTAG(ctx)) {
    730		if (isattr(n, nl, STRP("type")))
    731			string_append(&attrtype, v, vl);
    732		return;
    733	}
    734
    735	if (ctx.feedtype == FeedTypeRSS) {
    736		if (ctx.tag.id == RSSTagEnclosure &&
    737		    isattr(n, nl, STRP("url"))) {
    738			string_append(&tmpstr, v, vl);
    739		} else if (ctx.tag.id == RSSTagGuid &&
    740		           isattr(n, nl, STRP("ispermalink"))) {
    741			string_append(&attrispermalink, v, vl);
    742		}
    743	} else if (ctx.feedtype == FeedTypeAtom) {
    744		if (ctx.tag.id == AtomTagLink) {
    745			if (isattr(n, nl, STRP("rel"))) {
    746				string_append(&attrrel, v, vl);
    747			} else if (isattr(n, nl, STRP("href"))) {
    748				string_append(&tmpstr, v, vl);
    749			}
    750		} else if (ctx.tag.id == AtomTagCategory &&
    751			   isattr(n, nl, STRP("term"))) {
    752			string_append(&tmpstr, v, vl);
    753		}
    754	}
    755}
    756
    757static void
    758xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
    759              const char *data, size_t datalen)
    760{
    761	char buf[8];
    762	int len;
    763
    764	/* handles transforming inline XML to data */
    765	if (ISINCONTENT(ctx)) {
    766		if (ctx.contenttype == ContentTypeHTML)
    767			xmldata(p, data, datalen);
    768		return;
    769	}
    770
    771	if (!ctx.tag.id)
    772		return;
    773
    774	/* try to translate entity, else just pass as data to
    775	 * xmlattr handler. */
    776	if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
    777		xmlattr(p, t, tl, n, nl, buf, (size_t)len);
    778	else
    779		xmlattr(p, t, tl, n, nl, data, datalen);
    780}
    781
    782static void
    783xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
    784{
    785	if (ISINCONTENT(ctx)) {
    786		if (ctx.contenttype == ContentTypeHTML) {
    787			/* handles transforming inline XML to data */
    788			xmldata(p, "\"", 1);
    789			ctx.attrcount = 0;
    790		}
    791		return;
    792	}
    793}
    794
    795static void
    796xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
    797{
    798	if (ISINCONTENT(ctx)) {
    799		if (ctx.contenttype == ContentTypeHTML) {
    800			/* handles transforming inline XML to data */
    801			if (!ctx.attrcount)
    802				xmldata(p, " ", 1);
    803			ctx.attrcount++;
    804			xmldata(p, n, nl);
    805			xmldata(p, "=\"", 2);
    806		}
    807		return;
    808	}
    809
    810	if (attrispermalink.len && isattr(n, nl, STRP("ispermalink")))
    811		string_clear(&attrispermalink);
    812	else if (attrrel.len && isattr(n, nl, STRP("rel")))
    813		string_clear(&attrrel);
    814	else if (attrtype.len && isattr(n, nl, STRP("type")))
    815		string_clear(&attrtype);
    816	else if (tmpstr.len &&
    817	    (isattr(n, nl, STRP("href")) ||
    818	     isattr(n, nl, STRP("term")) ||
    819	     isattr(n, nl, STRP("url"))))
    820		string_clear(&tmpstr); /* use the last value for multiple attribute values */
    821}
    822
    823static void
    824xmldata(XMLParser *p, const char *s, size_t len)
    825{
    826	if (!ctx.field)
    827		return;
    828
    829	if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
    830		string_append(&tmpstr, s, len);
    831	else
    832		string_append(ctx.field, s, len);
    833}
    834
    835static void
    836xmldataentity(XMLParser *p, const char *data, size_t datalen)
    837{
    838	char buf[8];
    839	int len;
    840
    841	if (!ctx.field)
    842		return;
    843
    844	/* try to translate entity, else just pass as data to
    845	 * xmldata handler. */
    846	if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
    847		xmldata(p, buf, (size_t)len);
    848	else
    849		xmldata(p, data, datalen);
    850}
    851
    852static void
    853xmltagstart(XMLParser *p, const char *t, size_t tl)
    854{
    855	const FeedTag *f;
    856
    857	if (ISINCONTENT(ctx)) {
    858		if (ctx.contenttype == ContentTypeHTML) {
    859			ctx.attrcount = 0;
    860			xmldata(p, "<", 1);
    861			xmldata(p, t, tl);
    862		}
    863		return;
    864	}
    865
    866	/* start of RSS or Atom item / entry */
    867	if (ctx.feedtype == FeedTypeNone) {
    868		if (istag(t, tl, STRP("entry")))
    869			ctx.feedtype = FeedTypeAtom;
    870		else if (istag(t, tl, STRP("item")))
    871			ctx.feedtype = FeedTypeRSS;
    872		return;
    873	}
    874
    875	/* field tagid already set or nested tags. */
    876	if (ctx.tag.id) {
    877		/* nested <author><name> for Atom */
    878		if (ctx.tag.id == AtomTagAuthor &&
    879		    istag(t, tl, STRP("name"))) {
    880			memcpy(&(ctx.tag), &atomtagauthorname, sizeof(ctx.tag));
    881		} else {
    882			return; /* other nested tags are not allowed: return */
    883		}
    884	}
    885
    886	/* in item */
    887	if (ctx.tag.id == TagUnknown) {
    888		if (!(f = gettag(ctx.feedtype, t, tl)))
    889			f = &notag;
    890		memcpy(&(ctx.tag), f, sizeof(ctx.tag));
    891	}
    892
    893	ctx.iscontenttag = (fieldmap[ctx.tag.id] == FeedFieldContent);
    894	string_clear(&attrispermalink);
    895	string_clear(&attrrel);
    896	string_clear(&attrtype);
    897}
    898
    899static void
    900xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
    901{
    902	enum TagId tagid;
    903
    904	if (ISINCONTENT(ctx)) {
    905		if (ctx.contenttype == ContentTypeHTML) {
    906			if (isshort)
    907				xmldata(p, "/>", 2);
    908			else
    909				xmldata(p, ">", 1);
    910		}
    911		return;
    912	}
    913
    914	/* set tag type based on its attribute value */
    915	if (ctx.tag.id == RSSTagGuid) {
    916		/* if empty the default is "true" */
    917		if (!attrispermalink.len ||
    918		    isattr(attrispermalink.data, attrispermalink.len, STRP("true")))
    919			ctx.tag.id = RSSTagGuidPermalinkTrue;
    920		else
    921			ctx.tag.id = RSSTagGuidPermalinkFalse;
    922	} else if (ctx.tag.id == AtomTagLink) {
    923		/* empty or "alternate": other types could be
    924		   "enclosure", "related", "self" or "via" */
    925		if (!attrrel.len || isattr(attrrel.data, attrrel.len, STRP("alternate")))
    926			ctx.tag.id = AtomTagLinkAlternate;
    927		else if (isattr(attrrel.data, attrrel.len, STRP("enclosure")))
    928			ctx.tag.id = AtomTagLinkEnclosure;
    929		else
    930			ctx.tag.id = AtomTagLink; /* unknown */
    931	}
    932
    933	tagid = ctx.tag.id;
    934
    935	/* map tag type to field: unknown or lesser priority is ignored,
    936	   when tags of the same type are repeated only the first is used. */
    937	if (fieldmap[tagid] == -1 ||
    938	    (!ISFEEDFIELDMULTI(fieldmap[tagid]) &&
    939	     tagid <= ctx.fields[fieldmap[tagid]].tagid)) {
    940		return;
    941	}
    942
    943	if (ctx.iscontenttag) {
    944		ctx.iscontent = 1;
    945		ctx.iscontenttag = 0;
    946
    947		/* detect content-type based on type attribute */
    948		if (attrtype.len) {
    949			if (isattr(attrtype.data, attrtype.len, STRP("html")) ||
    950			    isattr(attrtype.data, attrtype.len, STRP("xhtml")) ||
    951			    isattr(attrtype.data, attrtype.len, STRP("text/html")) ||
    952			    isattr(attrtype.data, attrtype.len, STRP("text/xhtml")) ||
    953			    isattr(attrtype.data, attrtype.len, STRP("application/xhtml+xml")))
    954				ctx.contenttype = ContentTypeHTML;
    955			else /* unknown: handle as base64 text data */
    956				ctx.contenttype = ContentTypePlain;
    957		} else {
    958			/* default content-type */
    959			if (tagid == RSSTagContentEncoded || tagid == RSSTagDescription)
    960				ctx.contenttype = ContentTypeHTML;
    961			else
    962				ctx.contenttype = ContentTypePlain;
    963		}
    964	}
    965
    966	ctx.field = &(ctx.fields[fieldmap[tagid]].str);
    967	ctx.fields[fieldmap[tagid]].tagid = tagid;
    968
    969	/* clear field if it is overwritten (with a priority order) for the new
    970	   value, if the field can have multiple values then do not clear it. */
    971	if (!ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
    972		string_clear(ctx.field);
    973}
    974
    975static void
    976xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
    977{
    978	size_t i;
    979
    980	if (ctx.feedtype == FeedTypeNone)
    981		return;
    982
    983	if (ISINCONTENT(ctx)) {
    984		/* not a closed content field */
    985		if (!istag(ctx.tag.name, ctx.tag.len, t, tl)) {
    986			if (!isshort && ctx.contenttype == ContentTypeHTML) {
    987				xmldata(p, "</", 2);
    988				xmldata(p, t, tl);
    989				xmldata(p, ">", 1);
    990			}
    991			return;
    992		}
    993	} else if (ctx.tag.id && istag(ctx.tag.name, ctx.tag.len, t, tl)) {
    994		/* matched tag end: close it */
    995		/* copy also to the link field if the attribute isPermaLink="true"
    996		   and it is not set by a tag with higher priority. */
    997		if (ctx.tag.id == RSSTagGuidPermalinkTrue && ctx.field &&
    998		    ctx.tag.id > ctx.fields[FeedFieldLink].tagid) {
    999			string_clear(&ctx.fields[FeedFieldLink].str);
   1000			string_append(&ctx.fields[FeedFieldLink].str,
   1001			              ctx.field->data, ctx.field->len);
   1002			ctx.fields[FeedFieldLink].tagid = ctx.tag.id;
   1003		}
   1004	} else if (!ctx.tag.id && ((ctx.feedtype == FeedTypeAtom &&
   1005	   istag(t, tl, STRP("entry"))) || /* Atom */
   1006	   (ctx.feedtype == FeedTypeRSS &&
   1007	   istag(t, tl, STRP("item"))))) /* RSS */
   1008	{
   1009		/* end of RSS or Atom entry / item */
   1010		printfields();
   1011
   1012		/* clear strings */
   1013		for (i = 0; i < FeedFieldLast; i++) {
   1014			string_clear(&ctx.fields[i].str);
   1015			ctx.fields[i].tagid = TagUnknown;
   1016		}
   1017		ctx.contenttype = ContentTypeNone;
   1018		/* allow parsing of Atom and RSS concatenated in one XML stream. */
   1019		ctx.feedtype = FeedTypeNone;
   1020	} else {
   1021		return; /* not end of field */
   1022	}
   1023
   1024	/* temporary string: for fields that cannot be processed
   1025	   directly and need more context, for example by its tag
   1026	   attributes, like the Atom link rel="alternate|enclosure". */
   1027	if (tmpstr.len && ctx.field) {
   1028		if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) {
   1029			if (ctx.field->len)
   1030				string_append(ctx.field, FieldMultiSeparator, 1);
   1031			string_append(ctx.field, tmpstr.data, tmpstr.len);
   1032		} else {
   1033			string_clear(ctx.field);
   1034			string_append(ctx.field, tmpstr.data, tmpstr.len);
   1035		}
   1036	}
   1037
   1038	/* close field */
   1039	string_clear(&tmpstr); /* reuse and clear temporary string */
   1040
   1041	if (ctx.tag.id == AtomTagAuthorName)
   1042		memcpy(&(ctx.tag), &atomtagauthor, sizeof(ctx.tag)); /* outer tag */
   1043	else
   1044		memcpy(&(ctx.tag), &notag, sizeof(ctx.tag));
   1045
   1046	ctx.iscontent = 0;
   1047	ctx.field = NULL;
   1048}
   1049
   1050int
   1051main(int argc, char *argv[])
   1052{
   1053	if (pledge("stdio", NULL) == -1)
   1054		err(1, "pledge");
   1055
   1056	if (argc > 1) {
   1057		if (uri_parse(argv[1], &baseuri) != -1 && baseuri.proto[0])
   1058			baseurl = argv[1];
   1059		else
   1060			errx(1, "baseurl incorrect or too long");
   1061	}
   1062
   1063	memcpy(&(ctx.tag), &notag, sizeof(ctx.tag));
   1064
   1065	parser.xmlattr = xmlattr;
   1066	parser.xmlattrentity = xmlattrentity;
   1067	parser.xmlattrend = xmlattrend;
   1068	parser.xmlattrstart = xmlattrstart;
   1069	parser.xmlcdata = xmldata;
   1070	parser.xmldata = xmldata;
   1071	parser.xmldataentity = xmldataentity;
   1072	parser.xmltagend = xmltagend;
   1073	parser.xmltagstart = xmltagstart;
   1074	parser.xmltagstartparsed = xmltagstartparsed;
   1075
   1076	/* NOTE: GETNEXT is defined in xml.h for inline optimization */
   1077	xml_parse(&parser);
   1078
   1079	checkfileerror(stdin, "<stdin>", 'r');
   1080	checkfileerror(stdout, "<stdout>", 'w');
   1081
   1082	return 0;
   1083}