/* sfeed.c (30040B) */
1#include <errno.h> 2#include <stdint.h> 3#include <stdio.h> 4#include <stdlib.h> 5#include <string.h> 6#include <strings.h> 7 8#include "util.h" 9#include "xml.h" 10 11#define ISINCONTENT(ctx) ((ctx).iscontent && !((ctx).iscontenttag)) 12#define ISCONTENTTAG(ctx) (!((ctx).iscontent) && (ctx).iscontenttag) 13 14/* these feed fields support multiple separated values */ 15#define ISFEEDFIELDMULTI(t) ((t) == FeedFieldCategory) 16 17/* string and byte-length */ 18#define STRP(s) s,sizeof(s)-1 19 20enum FeedType { 21 FeedTypeNone = 0, 22 FeedTypeRSS = 1, 23 FeedTypeAtom = 2 24}; 25 26enum ContentType { 27 ContentTypeNone = 0, 28 ContentTypePlain = 1, 29 ContentTypeHTML = 2 30}; 31static const char *contenttypes[] = { "", "plain", "html" }; 32 33/* String data / memory pool */ 34typedef struct string { 35 char *data; /* data */ 36 size_t len; /* string length */ 37 size_t bufsiz; /* allocated size */ 38} String; 39 40/* NOTE: the order of these fields (content, date, author) indicate the 41 * priority to use them, from least important to high. 
*/ 42enum TagId { 43 TagUnknown = 0, 44 /* RSS */ 45 RSSTagDcdate, RSSTagPubdate, /* creation date has higher priority */ 46 RSSTagTitle, 47 RSSTagMediaDescription, RSSTagDescription, RSSTagContentEncoded, 48 RSSTagGuid, 49 RSSTagGuidPermalinkFalse, 50 RSSTagGuidPermalinkTrue, 51 /* must be defined after GUID, because it can be a link (isPermaLink) */ 52 RSSTagLink, 53 RSSTagEnclosure, 54 RSSTagAuthor, RSSTagDccreator, 55 RSSTagCategory, 56 /* Atom */ 57 /* creation date has higher priority */ 58 AtomTagModified, AtomTagUpdated, AtomTagIssued, AtomTagPublished, 59 AtomTagTitle, 60 AtomTagMediaDescription, AtomTagSummary, AtomTagContent, 61 AtomTagId, 62 AtomTagLink, 63 AtomTagLinkAlternate, 64 AtomTagLinkEnclosure, 65 AtomTagAuthor, AtomTagAuthorName, 66 AtomTagCategory, 67 TagLast 68}; 69 70typedef struct feedtag { 71 char *name; /* name of tag to match */ 72 size_t len; /* len of `name` */ 73 enum TagId id; /* unique ID */ 74} FeedTag; 75 76typedef struct field { 77 String str; 78 enum TagId tagid; /* tagid set previously, used for tag priority */ 79} FeedField; 80 81enum { 82 FeedFieldTime = 0, FeedFieldTitle, FeedFieldLink, FeedFieldContent, 83 FeedFieldId, FeedFieldAuthor, FeedFieldEnclosure, FeedFieldCategory, 84 FeedFieldLast 85}; 86 87typedef struct feedcontext { 88 String *field; /* current FeedItem field String */ 89 FeedField fields[FeedFieldLast]; /* data for current item */ 90 FeedTag tag; /* unique current parsed tag */ 91 int iscontent; /* in content data */ 92 int iscontenttag; /* in content tag */ 93 enum ContentType contenttype; /* content-type for item */ 94 enum FeedType feedtype; 95 int attrcount; /* count item HTML element attributes */ 96} FeedContext; 97 98static long long datetounix(long long, int, int, int, int, int); 99static FeedTag * gettag(enum FeedType, const char *, size_t); 100static long gettzoffset(const char *); 101static int isattr(const char *, size_t, const char *, size_t); 102static int istag(const char *, size_t, const char 
*, size_t); 103static int parsetime(const char *, long long *); 104static void printfields(void); 105static void string_append(String *, const char *, size_t); 106static void string_buffer_realloc(String *, size_t); 107static void string_clear(String *); 108static void string_print_encoded(String *); 109static void string_print_timestamp(String *); 110static void string_print_trimmed(String *); 111static void string_print_trimmed_multi(String *); 112static void string_print_uri(String *); 113static void xmlattr(XMLParser *, const char *, size_t, const char *, size_t, 114 const char *, size_t); 115static void xmlattrentity(XMLParser *, const char *, size_t, const char *, 116 size_t, const char *, size_t); 117static void xmlattrend(XMLParser *, const char *, size_t, const char *, 118 size_t); 119static void xmlattrstart(XMLParser *, const char *, size_t, const char *, 120 size_t); 121static void xmldata(XMLParser *, const char *, size_t); 122static void xmldataentity(XMLParser *, const char *, size_t); 123static void xmltagend(XMLParser *, const char *, size_t, int); 124static void xmltagstart(XMLParser *, const char *, size_t); 125static void xmltagstartparsed(XMLParser *, const char *, size_t, int); 126 127/* map tag name to TagId type */ 128/* RSS, must be alphabetical order */ 129static const FeedTag rsstags[] = { 130 { STRP("author"), RSSTagAuthor }, 131 { STRP("category"), RSSTagCategory }, 132 { STRP("content:encoded"), RSSTagContentEncoded }, 133 { STRP("dc:creator"), RSSTagDccreator }, 134 { STRP("dc:date"), RSSTagDcdate }, 135 { STRP("description"), RSSTagDescription }, 136 /* RSS: <enclosure url="" />, Atom has <link rel="enclosure" /> */ 137 { STRP("enclosure"), RSSTagEnclosure }, 138 { STRP("guid"), RSSTagGuid }, 139 { STRP("link"), RSSTagLink }, 140 { STRP("media:description"), RSSTagMediaDescription }, 141 { STRP("pubdate"), RSSTagPubdate }, 142 { STRP("title"), RSSTagTitle } 143}; 144 145/* Atom, must be alphabetical order */ 146static const FeedTag 
atomtags[] = { 147 { STRP("author"), AtomTagAuthor }, 148 { STRP("category"), AtomTagCategory }, 149 { STRP("content"), AtomTagContent }, 150 { STRP("id"), AtomTagId }, 151 { STRP("issued"), AtomTagIssued }, /* Atom 0.3 */ 152 /* Atom: <link href="" />, RSS has <link></link> */ 153 { STRP("link"), AtomTagLink }, 154 { STRP("media:description"), AtomTagMediaDescription }, 155 { STRP("modified"), AtomTagModified }, /* Atom 0.3 */ 156 { STRP("published"), AtomTagPublished }, 157 { STRP("summary"), AtomTagSummary }, 158 { STRP("title"), AtomTagTitle }, 159 { STRP("updated"), AtomTagUpdated } 160}; 161 162/* special case: nested <author><name> */ 163static const FeedTag atomtagauthor = { STRP("author"), AtomTagAuthor }; 164static const FeedTag atomtagauthorname = { STRP("name"), AtomTagAuthorName }; 165 166/* reference to no / unknown tag */ 167static const FeedTag notag = { STRP(""), TagUnknown }; 168 169/* map TagId type to RSS/Atom field, all tags must be defined */ 170static const int fieldmap[TagLast] = { 171 [TagUnknown] = -1, 172 /* RSS */ 173 [RSSTagDcdate] = FeedFieldTime, 174 [RSSTagPubdate] = FeedFieldTime, 175 [RSSTagTitle] = FeedFieldTitle, 176 [RSSTagMediaDescription] = FeedFieldContent, 177 [RSSTagDescription] = FeedFieldContent, 178 [RSSTagContentEncoded] = FeedFieldContent, 179 [RSSTagGuid] = -1, 180 [RSSTagGuidPermalinkFalse] = FeedFieldId, 181 [RSSTagGuidPermalinkTrue] = FeedFieldId, /* special-case: both a link and an id */ 182 [RSSTagLink] = FeedFieldLink, 183 [RSSTagEnclosure] = FeedFieldEnclosure, 184 [RSSTagAuthor] = FeedFieldAuthor, 185 [RSSTagDccreator] = FeedFieldAuthor, 186 [RSSTagCategory] = FeedFieldCategory, 187 /* Atom */ 188 [AtomTagModified] = FeedFieldTime, 189 [AtomTagUpdated] = FeedFieldTime, 190 [AtomTagIssued] = FeedFieldTime, 191 [AtomTagPublished] = FeedFieldTime, 192 [AtomTagTitle] = FeedFieldTitle, 193 [AtomTagMediaDescription] = FeedFieldContent, 194 [AtomTagSummary] = FeedFieldContent, 195 [AtomTagContent] = FeedFieldContent, 
196 [AtomTagId] = FeedFieldId, 197 [AtomTagLink] = -1, 198 [AtomTagLinkAlternate] = FeedFieldLink, 199 [AtomTagLinkEnclosure] = FeedFieldEnclosure, 200 [AtomTagAuthor] = -1, 201 [AtomTagAuthorName] = FeedFieldAuthor, 202 [AtomTagCategory] = FeedFieldCategory 203}; 204 205static const int FieldSeparator = '\t'; 206/* separator for multiple values in a field, separator should be 1 byte */ 207static const char FieldMultiSeparator[] = "|"; 208static struct uri baseuri; 209static const char *baseurl; 210 211static FeedContext ctx; 212static XMLParser parser; /* XML parser state */ 213static String attrispermalink, attrrel, attrtype, tmpstr; 214 215static int 216tagcmp(const void *v1, const void *v2) 217{ 218 return strcasecmp(((FeedTag *)v1)->name, ((FeedTag *)v2)->name); 219} 220 221/* Unique tagid for parsed tag name. */ 222static FeedTag * 223gettag(enum FeedType feedtype, const char *name, size_t namelen) 224{ 225 FeedTag f, *r = NULL; 226 227 f.name = (char *)name; 228 229 switch (feedtype) { 230 case FeedTypeRSS: 231 r = bsearch(&f, rsstags, sizeof(rsstags) / sizeof(rsstags[0]), 232 sizeof(rsstags[0]), tagcmp); 233 break; 234 case FeedTypeAtom: 235 r = bsearch(&f, atomtags, sizeof(atomtags) / sizeof(atomtags[0]), 236 sizeof(atomtags[0]), tagcmp); 237 break; 238 default: 239 break; 240 } 241 242 return r; 243} 244 245static char * 246ltrim(const char *s) 247{ 248 for (; ISSPACE((unsigned char)*s); s++) 249 ; 250 return (char *)s; 251} 252 253static char * 254rtrim(const char *s) 255{ 256 const char *e; 257 258 for (e = s + strlen(s); e > s && ISSPACE((unsigned char)*(e - 1)); e--) 259 ; 260 return (char *)e; 261} 262 263/* Clear string only; don't free, prevents unnecessary reallocation. 
*/ 264static void 265string_clear(String *s) 266{ 267 if (s->data) 268 s->data[0] = '\0'; 269 s->len = 0; 270} 271 272static void 273string_buffer_realloc(String *s, size_t newlen) 274{ 275 size_t alloclen; 276 277 if (newlen > SIZE_MAX / 2) { 278 alloclen = SIZE_MAX; 279 } else { 280 for (alloclen = 64; alloclen <= newlen; alloclen *= 2) 281 ; 282 } 283 if (!(s->data = realloc(s->data, alloclen))) 284 err(1, "realloc"); 285 s->bufsiz = alloclen; 286} 287 288/* Append data to String, s->data and data may not overlap. */ 289static void 290string_append(String *s, const char *data, size_t len) 291{ 292 if (!len) 293 return; 294 295 if (s->len >= SIZE_MAX - len) { 296 errno = ENOMEM; 297 err(1, "realloc"); 298 } 299 300 /* check if allocation is necessary, never shrink the buffer. */ 301 if (s->len + len >= s->bufsiz) 302 string_buffer_realloc(s, s->len + len + 1); 303 memcpy(s->data + s->len, data, len); 304 s->len += len; 305 s->data[s->len] = '\0'; 306} 307 308/* Print text, encode TABs, newlines and '\', remove other whitespace. 309 * Remove leading and trailing whitespace. 
*/ 310static void 311string_print_encoded(String *s) 312{ 313 const char *p, *e; 314 315 if (!s->data || !s->len) 316 return; 317 318 p = ltrim(s->data); 319 e = rtrim(p); 320 321 for (; *p && p != e; p++) { 322 switch (*p) { 323 case '\n': putchar('\\'); putchar('n'); break; 324 case '\\': putchar('\\'); putchar('\\'); break; 325 case '\t': putchar('\\'); putchar('t'); break; 326 default: 327 /* ignore control chars */ 328 if (!ISCNTRL((unsigned char)*p)) 329 putchar(*p); 330 break; 331 } 332 } 333} 334 335static void 336printtrimmed(const char *s) 337{ 338 char *p, *e; 339 340 p = ltrim(s); 341 e = rtrim(p); 342 for (; *p && p != e; p++) { 343 if (ISSPACE((unsigned char)*p)) 344 putchar(' '); /* any whitespace to space */ 345 else if (!ISCNTRL((unsigned char)*p)) 346 /* ignore other control chars */ 347 putchar(*p); 348 } 349} 350 351/* Print text, replace TABs, carriage return and other whitespace with ' '. 352 * Other control chars are removed. Remove leading and trailing whitespace. */ 353static void 354string_print_trimmed(String *s) 355{ 356 if (!s->data || !s->len) 357 return; 358 359 printtrimmed(s->data); 360} 361 362/* Print each field with trimmed whitespace, separated by '|'. */ 363static void 364string_print_trimmed_multi(String *s) 365{ 366 char *p, *e; 367 int c; 368 369 if (!s->data || !s->len) 370 return; 371 372 for (p = s->data; ; p = e + 1) { 373 if ((e = strstr(p, FieldMultiSeparator))) { 374 c = *e; 375 *e = '\0'; 376 printtrimmed(p); 377 *e = c; /* restore NUL byte to original character */ 378 fputs(FieldMultiSeparator, stdout); 379 } else { 380 printtrimmed(p); 381 break; 382 } 383 } 384} 385 386/* Print URL, if it is a relative URL then it uses the global `baseurl`. 
*/ 387static void 388printuri(char *s) 389{ 390 char link[4096], *p, *e; 391 struct uri newuri, olduri; 392 int c, r = -1; 393 394 p = ltrim(s); 395 e = rtrim(p); 396 c = *e; 397 *e = '\0'; 398 399 if (baseurl && !uri_hasscheme(p) && 400 uri_parse(p, &olduri) != -1 && !olduri.proto[0] && 401 uri_makeabs(&newuri, &olduri, &baseuri) != -1 && newuri.proto[0]) 402 r = uri_format(link, sizeof(link), &newuri); 403 404 if (r >= 0 && (size_t)r < sizeof(link)) 405 printtrimmed(link); 406 else 407 printtrimmed(p); 408 409 *e = c; /* restore NUL byte to original character */ 410} 411 412/* Print URL, if it is a relative URL then it uses the global `baseurl`. */ 413static void 414string_print_uri(String *s) 415{ 416 if (!s->data || !s->len) 417 return; 418 419 printuri(s->data); 420} 421 422/* Print as UNIX timestamp, print nothing if the time is empty or invalid. */ 423static void 424string_print_timestamp(String *s) 425{ 426 long long t; 427 428 if (!s->data || !s->len) 429 return; 430 431 if (parsetime(s->data, &t) != -1) 432 printf("%lld", t); 433} 434 435/* Convert time fields. Returns a signed (at least) 64-bit UNIX timestamp. 436 Parameters should be passed as they are in a struct tm: 437 that is: year = year - 1900, month = month - 1. 
*/ 438static long long 439datetounix(long long year, int mon, int day, int hour, int min, int sec) 440{ 441 /* seconds in a month in a regular (non-leap) year */ 442 static const long secs_through_month[] = { 443 0, 31 * 86400, 59 * 86400, 90 * 86400, 444 120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400, 445 243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 }; 446 int is_leap = 0, cycles, centuries = 0, leaps = 0, rem; 447 long long t; 448 449 /* optimization: handle common range year 1902 up to and including 2038 */ 450 if (year - 2ULL <= 136) { 451 /* amount of leap days relative to 1970: every 4 years */ 452 leaps = (year - 68) >> 2; 453 if (!((year - 68) & 3)) { 454 leaps--; 455 is_leap = 1; 456 } else { 457 is_leap = 0; 458 } 459 t = 31536000 * (year - 70) + (86400 * leaps); /* 365 * 86400 = 31536000 */ 460 } else { 461 /* general leap year calculation: 462 leap years occur mostly every 4 years but every 100 years 463 a leap year is skipped unless the year is divisible by 400 */ 464 cycles = (year - 100) / 400; 465 rem = (year - 100) % 400; 466 if (rem < 0) { 467 cycles--; 468 rem += 400; 469 } 470 if (!rem) { 471 is_leap = 1; 472 } else { 473 if (rem >= 300) { 474 centuries = 3; 475 rem -= 300; 476 } else if (rem >= 200) { 477 centuries = 2; 478 rem -= 200; 479 } else if (rem >= 100) { 480 centuries = 1; 481 rem -= 100; 482 } 483 if (rem) { 484 leaps = rem / 4U; 485 rem %= 4U; 486 is_leap = !rem; 487 } 488 } 489 leaps += (97 * cycles) + (24 * centuries) - is_leap; 490 491 /* adjust 8 leap days from 1970 up to and including 2000: 492 ((30 * 365) + 8) * 86400 = 946771200 */ 493 t = ((year - 100) * 31536000LL) + (leaps * 86400LL) + 946771200LL; 494 } 495 t += secs_through_month[mon]; 496 if (is_leap && mon >= 2) 497 t += 86400; 498 t += 86400LL * (day - 1); 499 t += 3600LL * hour; 500 t += 60LL * min; 501 t += sec; 502 503 return t; 504} 505 506/* Get timezone from string, return time offset in seconds from UTC. 
507 * NOTE: only parses timezones in RFC 822, many other timezone names are 508 * ambiguous anyway. 509 * ANSI and military zones are defined wrong in RFC 822 and are unsupported, 510 * see note on RFC 2822 4.3 page 32. */ 511static long 512gettzoffset(const char *s) 513{ 514 static const struct { 515 char *name; 516 int offhour; 517 } tzones[] = { 518 { "CDT", -5 * 3600 }, 519 { "CST", -6 * 3600 }, 520 { "EDT", -4 * 3600 }, 521 { "EST", -5 * 3600 }, 522 { "MDT", -6 * 3600 }, 523 { "MST", -7 * 3600 }, 524 { "PDT", -7 * 3600 }, 525 { "PST", -8 * 3600 }, 526 }; 527 const char *p; 528 long tzhour = 0, tzmin = 0; 529 size_t i; 530 531 for (; ISSPACE((unsigned char)*s); s++) 532 ; 533 switch (*s) { 534 case '-': /* offset */ 535 case '+': 536 for (i = 0, p = s + 1; i < 2 && ISDIGIT((unsigned char)*p); i++, p++) 537 tzhour = (tzhour * 10) + (*p - '0'); 538 if (*p == ':') 539 p++; 540 for (i = 0; i < 2 && ISDIGIT((unsigned char)*p); i++, p++) 541 tzmin = (tzmin * 10) + (*p - '0'); 542 return ((tzhour * 3600) + (tzmin * 60)) * (s[0] == '-' ? -1 : 1); 543 default: /* timezone name */ 544 for (i = 0; ISALPHA((unsigned char)s[i]); i++) 545 ; 546 if (i != 3) 547 return 0; 548 /* compare timezone and adjust offset relative to UTC */ 549 for (i = 0; i < sizeof(tzones) / sizeof(*tzones); i++) { 550 if (!memcmp(s, tzones[i].name, 3)) 551 return tzones[i].offhour; 552 } 553 } 554 return 0; 555} 556 557/* Parse time string `s` into the UNIX timestamp `tp`. 558 Returns 0 on success or -1 on failure. 
*/ 559static int 560parsetime(const char *s, long long *tp) 561{ 562 static const struct { 563 char *name; 564 int len; 565 } mons[] = { 566 { STRP("January"), }, 567 { STRP("February"), }, 568 { STRP("March"), }, 569 { STRP("April"), }, 570 { STRP("May"), }, 571 { STRP("June"), }, 572 { STRP("July"), }, 573 { STRP("August"), }, 574 { STRP("September"), }, 575 { STRP("October"), }, 576 { STRP("November"), }, 577 { STRP("December"), }, 578 }; 579 int va[6] = { 0 }, i, j, v, vi; 580 size_t m; 581 582 for (; ISSPACE((unsigned char)*s); s++) 583 ; 584 if (!ISDIGIT((unsigned char)*s) && !ISALPHA((unsigned char)*s)) 585 return -1; 586 587 if (ISDIGIT((unsigned char)s[0]) && 588 ISDIGIT((unsigned char)s[1]) && 589 ISDIGIT((unsigned char)s[2]) && 590 ISDIGIT((unsigned char)s[3])) { 591 /* formats "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or "%Y%m%d%H%M%S" */ 592 vi = 0; 593 } else { 594 /* format: "[%a, ]%d %b %Y %H:%M:%S" */ 595 /* parse "[%a, ]%d %b %Y " part, then use time parsing as above */ 596 for (; ISALPHA((unsigned char)*s); s++) 597 ; 598 for (; ISSPACE((unsigned char)*s); s++) 599 ; 600 if (*s == ',') 601 s++; 602 for (; ISSPACE((unsigned char)*s); s++) 603 ; 604 for (v = 0, i = 0; i < 2 && ISDIGIT((unsigned char)*s); s++, i++) 605 v = (v * 10) + (*s - '0'); 606 va[2] = v; /* day */ 607 for (; ISSPACE((unsigned char)*s); s++) 608 ; 609 /* end of word month */ 610 for (j = 0; ISALPHA((unsigned char)s[j]); j++) 611 ; 612 /* check month name */ 613 if (j < 3 || j > 9) 614 return -1; /* month cannot match */ 615 for (m = 0; m < sizeof(mons) / sizeof(*mons); m++) { 616 /* abbreviation (3 length) or long name */ 617 if ((j == 3 || j == mons[m].len) && 618 !strncasecmp(mons[m].name, s, j)) { 619 va[1] = m + 1; 620 s += j; 621 break; 622 } 623 } 624 if (m >= 12) 625 return -1; /* no month found */ 626 for (; ISSPACE((unsigned char)*s); s++) 627 ; 628 for (v = 0, i = 0; i < 4 && ISDIGIT((unsigned char)*s); s++, i++) 629 v = (v * 10) + (*s - '0'); 630 /* obsolete short 
year: RFC 2822 4.3 */ 631 if (i == 2 || i == 3) 632 v += (i == 2 && v >= 0 && v <= 49) ? 2000 : 1900; 633 va[0] = v; /* year */ 634 for (; ISSPACE((unsigned char)*s); s++) 635 ; 636 /* parse only regular time part, see below */ 637 vi = 3; 638 } 639 640 /* parse time parts (and possibly remaining date parts) */ 641 for (; *s && vi < 6; vi++) { 642 for (i = 0, v = 0; i < ((vi == 0) ? 4 : 2) && 643 ISDIGIT((unsigned char)*s); s++, i++) { 644 v = (v * 10) + (*s - '0'); 645 } 646 va[vi] = v; 647 648 if ((vi < 2 && *s == '-') || 649 (vi == 2 && (*s == 'T' || ISSPACE((unsigned char)*s))) || 650 (vi > 2 && *s == ':')) 651 s++; 652 } 653 654 /* skip milliseconds in for example: "%Y-%m-%dT%H:%M:%S.000Z" */ 655 if (*s == '.') { 656 for (s++; ISDIGIT((unsigned char)*s); s++) 657 ; 658 } 659 660 /* invalid range */ 661 if (va[0] < 0 || va[0] > 9999 || 662 va[1] < 1 || va[1] > 12 || 663 va[2] < 1 || va[2] > 31 || 664 va[3] < 0 || va[3] > 23 || 665 va[4] < 0 || va[4] > 59 || 666 va[5] < 0 || va[5] > 60) /* allow leap second */ 667 return -1; 668 669 *tp = datetounix(va[0] - 1900, va[1] - 1, va[2], va[3], va[4], va[5]) - 670 gettzoffset(s); 671 672 return 0; 673} 674 675static void 676printfields(void) 677{ 678 string_print_timestamp(&ctx.fields[FeedFieldTime].str); 679 putchar(FieldSeparator); 680 string_print_trimmed(&ctx.fields[FeedFieldTitle].str); 681 putchar(FieldSeparator); 682 string_print_uri(&ctx.fields[FeedFieldLink].str); 683 putchar(FieldSeparator); 684 string_print_encoded(&ctx.fields[FeedFieldContent].str); 685 putchar(FieldSeparator); 686 fputs(contenttypes[ctx.contenttype], stdout); 687 putchar(FieldSeparator); 688 string_print_trimmed(&ctx.fields[FeedFieldId].str); 689 putchar(FieldSeparator); 690 string_print_trimmed(&ctx.fields[FeedFieldAuthor].str); 691 putchar(FieldSeparator); 692 string_print_uri(&ctx.fields[FeedFieldEnclosure].str); 693 putchar(FieldSeparator); 694 string_print_trimmed_multi(&ctx.fields[FeedFieldCategory].str); 695 putchar('\n'); 696 697 
if (ferror(stdout)) /* check for errors but do not flush */ 698 checkfileerror(stdout, "<stdout>", 'w'); 699} 700 701static int 702istag(const char *name, size_t len, const char *name2, size_t len2) 703{ 704 return (len == len2 && !strcasecmp(name, name2)); 705} 706 707static int 708isattr(const char *name, size_t len, const char *name2, size_t len2) 709{ 710 return (len == len2 && !strcasecmp(name, name2)); 711} 712 713static void 714xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl, 715 const char *v, size_t vl) 716{ 717 /* handles transforming inline XML to data */ 718 if (ISINCONTENT(ctx)) { 719 if (ctx.contenttype == ContentTypeHTML) 720 xmldata(p, v, vl); 721 return; 722 } 723 724 if (!ctx.tag.id) 725 return; 726 727 /* content-type may be for Atom: text, xhtml, html or a mime-type. 728 for MRSS (media:description): plain, html. */ 729 if (ISCONTENTTAG(ctx)) { 730 if (isattr(n, nl, STRP("type"))) 731 string_append(&attrtype, v, vl); 732 return; 733 } 734 735 if (ctx.feedtype == FeedTypeRSS) { 736 if (ctx.tag.id == RSSTagEnclosure && 737 isattr(n, nl, STRP("url"))) { 738 string_append(&tmpstr, v, vl); 739 } else if (ctx.tag.id == RSSTagGuid && 740 isattr(n, nl, STRP("ispermalink"))) { 741 string_append(&attrispermalink, v, vl); 742 } 743 } else if (ctx.feedtype == FeedTypeAtom) { 744 if (ctx.tag.id == AtomTagLink) { 745 if (isattr(n, nl, STRP("rel"))) { 746 string_append(&attrrel, v, vl); 747 } else if (isattr(n, nl, STRP("href"))) { 748 string_append(&tmpstr, v, vl); 749 } 750 } else if (ctx.tag.id == AtomTagCategory && 751 isattr(n, nl, STRP("term"))) { 752 string_append(&tmpstr, v, vl); 753 } 754 } 755} 756 757static void 758xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl, 759 const char *data, size_t datalen) 760{ 761 char buf[8]; 762 int len; 763 764 /* handles transforming inline XML to data */ 765 if (ISINCONTENT(ctx)) { 766 if (ctx.contenttype == ContentTypeHTML) 767 xmldata(p, data, datalen); 768 
return; 769 } 770 771 if (!ctx.tag.id) 772 return; 773 774 /* try to translate entity, else just pass as data to 775 * xmlattr handler. */ 776 if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0) 777 xmlattr(p, t, tl, n, nl, buf, (size_t)len); 778 else 779 xmlattr(p, t, tl, n, nl, data, datalen); 780} 781 782static void 783xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl) 784{ 785 if (ISINCONTENT(ctx)) { 786 if (ctx.contenttype == ContentTypeHTML) { 787 /* handles transforming inline XML to data */ 788 xmldata(p, "\"", 1); 789 ctx.attrcount = 0; 790 } 791 return; 792 } 793} 794 795static void 796xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl) 797{ 798 if (ISINCONTENT(ctx)) { 799 if (ctx.contenttype == ContentTypeHTML) { 800 /* handles transforming inline XML to data */ 801 if (!ctx.attrcount) 802 xmldata(p, " ", 1); 803 ctx.attrcount++; 804 xmldata(p, n, nl); 805 xmldata(p, "=\"", 2); 806 } 807 return; 808 } 809 810 if (attrispermalink.len && isattr(n, nl, STRP("ispermalink"))) 811 string_clear(&attrispermalink); 812 else if (attrrel.len && isattr(n, nl, STRP("rel"))) 813 string_clear(&attrrel); 814 else if (attrtype.len && isattr(n, nl, STRP("type"))) 815 string_clear(&attrtype); 816 else if (tmpstr.len && 817 (isattr(n, nl, STRP("href")) || 818 isattr(n, nl, STRP("term")) || 819 isattr(n, nl, STRP("url")))) 820 string_clear(&tmpstr); /* use the last value for multiple attribute values */ 821} 822 823static void 824xmldata(XMLParser *p, const char *s, size_t len) 825{ 826 if (!ctx.field) 827 return; 828 829 if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) 830 string_append(&tmpstr, s, len); 831 else 832 string_append(ctx.field, s, len); 833} 834 835static void 836xmldataentity(XMLParser *p, const char *data, size_t datalen) 837{ 838 char buf[8]; 839 int len; 840 841 if (!ctx.field) 842 return; 843 844 /* try to translate entity, else just pass as data to 845 * xmldata handler. 
*/ 846 if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0) 847 xmldata(p, buf, (size_t)len); 848 else 849 xmldata(p, data, datalen); 850} 851 852static void 853xmltagstart(XMLParser *p, const char *t, size_t tl) 854{ 855 const FeedTag *f; 856 857 if (ISINCONTENT(ctx)) { 858 if (ctx.contenttype == ContentTypeHTML) { 859 ctx.attrcount = 0; 860 xmldata(p, "<", 1); 861 xmldata(p, t, tl); 862 } 863 return; 864 } 865 866 /* start of RSS or Atom item / entry */ 867 if (ctx.feedtype == FeedTypeNone) { 868 if (istag(t, tl, STRP("entry"))) 869 ctx.feedtype = FeedTypeAtom; 870 else if (istag(t, tl, STRP("item"))) 871 ctx.feedtype = FeedTypeRSS; 872 return; 873 } 874 875 /* field tagid already set or nested tags. */ 876 if (ctx.tag.id) { 877 /* nested <author><name> for Atom */ 878 if (ctx.tag.id == AtomTagAuthor && 879 istag(t, tl, STRP("name"))) { 880 memcpy(&(ctx.tag), &atomtagauthorname, sizeof(ctx.tag)); 881 } else { 882 return; /* other nested tags are not allowed: return */ 883 } 884 } 885 886 /* in item */ 887 if (ctx.tag.id == TagUnknown) { 888 if (!(f = gettag(ctx.feedtype, t, tl))) 889 f = ¬ag; 890 memcpy(&(ctx.tag), f, sizeof(ctx.tag)); 891 } 892 893 ctx.iscontenttag = (fieldmap[ctx.tag.id] == FeedFieldContent); 894 string_clear(&attrispermalink); 895 string_clear(&attrrel); 896 string_clear(&attrtype); 897} 898 899static void 900xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) 901{ 902 enum TagId tagid; 903 904 if (ISINCONTENT(ctx)) { 905 if (ctx.contenttype == ContentTypeHTML) { 906 if (isshort) 907 xmldata(p, "/>", 2); 908 else 909 xmldata(p, ">", 1); 910 } 911 return; 912 } 913 914 /* set tag type based on its attribute value */ 915 if (ctx.tag.id == RSSTagGuid) { 916 /* if empty the default is "true" */ 917 if (!attrispermalink.len || 918 isattr(attrispermalink.data, attrispermalink.len, STRP("true"))) 919 ctx.tag.id = RSSTagGuidPermalinkTrue; 920 else 921 ctx.tag.id = RSSTagGuidPermalinkFalse; 922 } else if (ctx.tag.id == 
AtomTagLink) { 923 /* empty or "alternate": other types could be 924 "enclosure", "related", "self" or "via" */ 925 if (!attrrel.len || isattr(attrrel.data, attrrel.len, STRP("alternate"))) 926 ctx.tag.id = AtomTagLinkAlternate; 927 else if (isattr(attrrel.data, attrrel.len, STRP("enclosure"))) 928 ctx.tag.id = AtomTagLinkEnclosure; 929 else 930 ctx.tag.id = AtomTagLink; /* unknown */ 931 } 932 933 tagid = ctx.tag.id; 934 935 /* map tag type to field: unknown or lesser priority is ignored, 936 when tags of the same type are repeated only the first is used. */ 937 if (fieldmap[tagid] == -1 || 938 (!ISFEEDFIELDMULTI(fieldmap[tagid]) && 939 tagid <= ctx.fields[fieldmap[tagid]].tagid)) { 940 return; 941 } 942 943 if (ctx.iscontenttag) { 944 ctx.iscontent = 1; 945 ctx.iscontenttag = 0; 946 947 /* detect content-type based on type attribute */ 948 if (attrtype.len) { 949 if (isattr(attrtype.data, attrtype.len, STRP("html")) || 950 isattr(attrtype.data, attrtype.len, STRP("xhtml")) || 951 isattr(attrtype.data, attrtype.len, STRP("text/html")) || 952 isattr(attrtype.data, attrtype.len, STRP("text/xhtml")) || 953 isattr(attrtype.data, attrtype.len, STRP("application/xhtml+xml"))) 954 ctx.contenttype = ContentTypeHTML; 955 else /* unknown: handle as base64 text data */ 956 ctx.contenttype = ContentTypePlain; 957 } else { 958 /* default content-type */ 959 if (tagid == RSSTagContentEncoded || tagid == RSSTagDescription) 960 ctx.contenttype = ContentTypeHTML; 961 else 962 ctx.contenttype = ContentTypePlain; 963 } 964 } 965 966 ctx.field = &(ctx.fields[fieldmap[tagid]].str); 967 ctx.fields[fieldmap[tagid]].tagid = tagid; 968 969 /* clear field if it is overwritten (with a priority order) for the new 970 value, if the field can have multiple values then do not clear it. 
*/ 971 if (!ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) 972 string_clear(ctx.field); 973} 974 975static void 976xmltagend(XMLParser *p, const char *t, size_t tl, int isshort) 977{ 978 size_t i; 979 980 if (ctx.feedtype == FeedTypeNone) 981 return; 982 983 if (ISINCONTENT(ctx)) { 984 /* not a closed content field */ 985 if (!istag(ctx.tag.name, ctx.tag.len, t, tl)) { 986 if (!isshort && ctx.contenttype == ContentTypeHTML) { 987 xmldata(p, "</", 2); 988 xmldata(p, t, tl); 989 xmldata(p, ">", 1); 990 } 991 return; 992 } 993 } else if (ctx.tag.id && istag(ctx.tag.name, ctx.tag.len, t, tl)) { 994 /* matched tag end: close it */ 995 /* copy also to the link field if the attribute isPermaLink="true" 996 and it is not set by a tag with higher priority. */ 997 if (ctx.tag.id == RSSTagGuidPermalinkTrue && ctx.field && 998 ctx.tag.id > ctx.fields[FeedFieldLink].tagid) { 999 string_clear(&ctx.fields[FeedFieldLink].str); 1000 string_append(&ctx.fields[FeedFieldLink].str, 1001 ctx.field->data, ctx.field->len); 1002 ctx.fields[FeedFieldLink].tagid = ctx.tag.id; 1003 } 1004 } else if (!ctx.tag.id && ((ctx.feedtype == FeedTypeAtom && 1005 istag(t, tl, STRP("entry"))) || /* Atom */ 1006 (ctx.feedtype == FeedTypeRSS && 1007 istag(t, tl, STRP("item"))))) /* RSS */ 1008 { 1009 /* end of RSS or Atom entry / item */ 1010 printfields(); 1011 1012 /* clear strings */ 1013 for (i = 0; i < FeedFieldLast; i++) { 1014 string_clear(&ctx.fields[i].str); 1015 ctx.fields[i].tagid = TagUnknown; 1016 } 1017 ctx.contenttype = ContentTypeNone; 1018 /* allow parsing of Atom and RSS concatenated in one XML stream. */ 1019 ctx.feedtype = FeedTypeNone; 1020 } else { 1021 return; /* not end of field */ 1022 } 1023 1024 /* temporary string: for fields that cannot be processed 1025 directly and need more context, for example by its tag 1026 attributes, like the Atom link rel="alternate|enclosure". 
*/ 1027 if (tmpstr.len && ctx.field) { 1028 if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) { 1029 if (ctx.field->len) 1030 string_append(ctx.field, FieldMultiSeparator, 1); 1031 string_append(ctx.field, tmpstr.data, tmpstr.len); 1032 } else { 1033 string_clear(ctx.field); 1034 string_append(ctx.field, tmpstr.data, tmpstr.len); 1035 } 1036 } 1037 1038 /* close field */ 1039 string_clear(&tmpstr); /* reuse and clear temporary string */ 1040 1041 if (ctx.tag.id == AtomTagAuthorName) 1042 memcpy(&(ctx.tag), &atomtagauthor, sizeof(ctx.tag)); /* outer tag */ 1043 else 1044 memcpy(&(ctx.tag), ¬ag, sizeof(ctx.tag)); 1045 1046 ctx.iscontent = 0; 1047 ctx.field = NULL; 1048} 1049 1050int 1051main(int argc, char *argv[]) 1052{ 1053 if (pledge("stdio", NULL) == -1) 1054 err(1, "pledge"); 1055 1056 if (argc > 1) { 1057 if (uri_parse(argv[1], &baseuri) != -1 && baseuri.proto[0]) 1058 baseurl = argv[1]; 1059 else 1060 errx(1, "baseurl incorrect or too long"); 1061 } 1062 1063 memcpy(&(ctx.tag), ¬ag, sizeof(ctx.tag)); 1064 1065 parser.xmlattr = xmlattr; 1066 parser.xmlattrentity = xmlattrentity; 1067 parser.xmlattrend = xmlattrend; 1068 parser.xmlattrstart = xmlattrstart; 1069 parser.xmlcdata = xmldata; 1070 parser.xmldata = xmldata; 1071 parser.xmldataentity = xmldataentity; 1072 parser.xmltagend = xmltagend; 1073 parser.xmltagstart = xmltagstart; 1074 parser.xmltagstartparsed = xmltagstartparsed; 1075 1076 /* NOTE: GETNEXT is defined in xml.h for inline optimization */ 1077 xml_parse(&parser); 1078 1079 checkfileerror(stdin, "<stdin>", 'r'); 1080 checkfileerror(stdout, "<stdout>", 'w'); 1081 1082 return 0; 1083}