sfeed

Simple RSS and Atom feed parser
git clone https://git.sinitax.com/codemadness/sfeed
Log | Files | Refs | README | LICENSE | Upstream | sfeed.txt

sfeed_update (6655B)


      1#!/bin/sh
      2# update feeds, merge with old feeds.
      3# NOTE: assumes "sfeed_*" executables are in $PATH.
      4
      5# defaults
      6sfeedpath="$HOME/.sfeed/feeds"
      7
      8# used for processing feeds concurrently: wait until ${maxjobs} amount of
      9# feeds are finished at a time.
     10maxjobs=16
     11
     12# load config (evaluate shellscript).
     13# loadconfig(configfile)
     14loadconfig() {
     15	# allow to specify config via argv[1].
     16	if [ "$1" != "" ]; then
     17		# get absolute path of config file required for including.
     18		config="$1"
     19		configpath=$(readlink -f "${config}" 2>/dev/null)
     20	else
     21		# default config location.
     22		config="$HOME/.sfeed/sfeedrc"
     23		configpath="${config}"
     24	fi
     25
     26	# config is loaded here to be able to override $sfeedpath or functions.
     27	if [ -r "${configpath}" ] && [ -f "${configpath}" ]; then
     28		. "${configpath}"
     29	else
     30		printf "Configuration file \"%s\" cannot be read.\n" "${config}" >&2
     31		echo "See the sfeedrc.example file or the sfeedrc(5) man page for an example." >&2
     32		die
     33	fi
     34}
     35
     36# log(name, s)
     37log() {
     38	printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2"
     39}
     40
     41# log_error(name, s)
     42log_error() {
     43	printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
     44	# set error exit status indicator for parallel jobs.
     45	rm -f "${sfeedtmpdir}/ok"
     46}
     47
     48# fetch a feed via HTTP/HTTPS etc.
     49# fetch(name, url, feedfile)
     50fetch() {
     51	# fail on redirects, hide User-Agent, timeout is 15 seconds.
     52	curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
     53		"$2" 2>/dev/null
     54}
     55
     56# convert encoding from one encoding to another.
     57# convertencoding(name, from, to)
     58convertencoding() {
     59	if [ "$2" != "" ] && [ "$3" != "" ] && [ "$2" != "$3" ]; then
     60		iconv -cs -f "$2" -t "$3" 2> /dev/null
     61	else
     62		# else no convert, just output.
     63		cat
     64	fi
     65}
     66
     67# parse and convert input, by default XML to the sfeed(5) TSV format.
     68# parse(name, feedurl, basesiteurl)
     69parse() {
     70	sfeed "$3"
     71}
     72
     73# filter fields.
     74# filter(name, url)
     75filter() {
     76	cat
     77}
     78
     79# merge raw files: unique sort by id, title, link.
     80# merge(name, oldfile, newfile)
     81merge() {
     82	sort -t '	' -u -k6,6 -k2,2 -k3,3 "$2" "$3" 2>/dev/null
     83}
     84
     85# order by timestamp (descending).
     86# order(name, url)
     87order() {
     88	sort -t '	' -k1rn,1 2>/dev/null
     89}
     90
     91# internal handler to fetch and process a feed.
     92# _feed(name, feedurl, [basesiteurl], [encoding])
     93_feed() {
     94	name="$1"
     95	feedurl="$2"
     96	basesiteurl="$3"
     97	encoding="$4"
     98
     99	filename="$(printf '%s' "${name}" | tr '/' '_')"
    100	sfeedfile="${sfeedpath}/${filename}"
    101	tmpfeedfile="${sfeedtmpdir}/feeds/${filename}"
    102
    103	# if file does not exist yet create it.
    104	[ -e "${sfeedfile}" ] || touch "${sfeedfile}" 2>/dev/null
    105
    106	if ! fetch "${name}" "${feedurl}" "${sfeedfile}" > "${tmpfeedfile}.fetch"; then
    107		log_error "${name}" "FAIL (FETCH)"
    108		return 1
    109	fi
    110
    111	# try to detect encoding (if not specified). if detecting the encoding fails assume utf-8.
    112	[ "${encoding}" = "" ] && encoding=$(sfeed_xmlenc < "${tmpfeedfile}.fetch")
    113
    114	if ! convertencoding "${name}" "${encoding}" "utf-8" < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.utf8"; then
    115		log_error "${name}" "FAIL (ENCODING)"
    116		return 1
    117	fi
    118	rm -f "${tmpfeedfile}.fetch"
    119
    120	# if baseurl is empty then use feedurl.
    121	if ! parse "${name}" "${feedurl}" "${basesiteurl:-${feedurl}}" < "${tmpfeedfile}.utf8" > "${tmpfeedfile}.tsv"; then
    122		log_error "${name}" "FAIL (PARSE)"
    123		return 1
    124	fi
    125	rm -f "${tmpfeedfile}.utf8"
    126
    127	if ! filter "${name}" "${feedurl}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then
    128		log_error "${name}" "FAIL (FILTER)"
    129		return 1
    130	fi
    131	rm -f "${tmpfeedfile}.tsv"
    132
    133	# new feed data is empty: no need for below stages.
    134	if [ ! -s "${tmpfeedfile}.filter" ]; then
    135		log "${name}" "OK"
    136		return 0
    137	fi
    138
    139	if ! merge "${name}" "${sfeedfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge"; then
    140		log_error "${name}" "FAIL (MERGE)"
    141		return 1
    142	fi
    143	rm -f "${tmpfeedfile}.filter"
    144
    145	if ! order "${name}" "${feedurl}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then
    146		log_error "${name}" "FAIL (ORDER)"
    147		return 1
    148	fi
    149	rm -f "${tmpfeedfile}.merge"
    150
    151	# copy
    152	if ! cp "${tmpfeedfile}.order" "${sfeedfile}"; then
    153		log_error "${name}" "FAIL (COPY)"
    154		return 1
    155	fi
    156	rm -f "${tmpfeedfile}.order"
    157
    158	# OK
    159	log "${name}" "OK"
    160	return 0
    161}
    162
    163# fetch and process a feed in parallel.
    164# feed(name, feedurl, [basesiteurl], [encoding])
    165feed() {
    166	# Output job parameters for xargs.
    167	# Specify fields as a single parameter separated by a NUL byte.
    168	# The parameter is split into fields later by the child process, this
    169	# allows using xargs with empty fields across many implementations.
    170	printf '%s\037%s\037%s\037%s\037%s\037%s\0' \
    171		"${config}" "${sfeedtmpdir}" "$1" "$2" "$3" "$4"
    172}
    173
    174# cleanup()
    175cleanup() {
    176	# remove temporary directory with feed files.
    177	rm -rf "${sfeedtmpdir}"
    178}
    179
    180# die(statuscode)
    181die() {
    182	statuscode="${1:-1}" # default: exit 1
    183	# cleanup temporary files etc.
    184	cleanup
    185	exit "${statuscode}"
    186}
    187
    188# sighandler(signo)
    189sighandler() {
    190	signo="$1"
    191	# ignore TERM signal for myself.
    192	trap -- "" TERM
    193	# kill all running children >:D
    194	kill -TERM -$$
    195}
    196
    197# feeds()
    198feeds() {
    199	printf "Configuration file \"%s\" is invalid or does not contain a \"feeds\" function.\n" "${config}" >&2
    200	echo "See sfeedrc.example for an example." >&2
    201	die
    202}
    203
    204# main(args...)
    205main() {
    206	# signal number received for parent.
    207	signo=0
    208	# SIGINT: signal to interrupt parent.
    209	trap -- "sighandler 2" "INT"
    210	# SIGTERM: signal to terminate parent.
    211	trap -- "sighandler 15" "TERM"
    212	# load config file.
    213	loadconfig "$1"
    214	# fetch feeds and store in temporary directory.
    215	sfeedtmpdir="$(mktemp -d "${TMPDIR:-/tmp}/sfeed_XXXXXX")" || die
    216	mkdir -p "${sfeedtmpdir}/feeds"
    217	touch "${sfeedtmpdir}/ok" || die
    218	# make sure path exists.
    219	mkdir -p "${sfeedpath}"
    220
    221	# print feeds for parallel processing with xargs.
    222	feeds > "${sfeedtmpdir}/jobs" || die
    223	SFEED_UPDATE_CHILD="1" xargs -x -0 -P "${maxjobs}" -n 1 \
    224		"$(readlink -f "${argv0}")" < "${sfeedtmpdir}/jobs"
    225	statuscode=$?
    226
    227	# check error exit status indicator for parallel jobs.
    228	[ -f "${sfeedtmpdir}/ok" ] || statuscode=1
    229	# on signal SIGINT and SIGTERM exit with signal number + 128.
    230	[ ${signo} -ne 0 ] && die $((signo+128))
    231	die ${statuscode}
    232}
    233
    234# process a single feed.
    235# parameters are: config, tmpdir, name, feedurl, basesiteurl, encoding
    236if [ "${SFEED_UPDATE_CHILD}" = "1" ]; then
    237	IFS="" # "\037"
    238	[ "$1" = "" ] && exit 0 # must have an argument set
    239	printf '%s\n' "$1" | \
    240	while read -r _config _tmpdir _name _feedurl _basesiteurl _encoding; do
    241		loadconfig "${_config}"
    242		sfeedtmpdir="${_tmpdir}"
    243		_feed "${_name}" "${_feedurl}" "${_basesiteurl}" "${_encoding}"
    244		exit "$?"
    245	done
    246	exit 0
    247fi
    248
    249# ...else parent mode:
    250argv0="$0" # store $0, in the zsh shell $0 is the name of the function.
    251[ "${SFEED_UPDATE_INCLUDE}" = "1" ] || main "$@"