sfeed_update (6655B)
#!/bin/sh
# update feeds, merge with old feeds.
# NOTE: assumes "sfeed_*" executables are in $PATH.

# defaults
sfeedpath="$HOME/.sfeed/feeds"

# used for processing feeds concurrently: wait until ${maxjobs} amount of
# feeds are finished at a time.
maxjobs=16

# load config (evaluate shellscript).
# loadconfig(configfile)
loadconfig() {
	# allow to specify config via argv[1].
	if [ "$1" != "" ]; then
		# get absolute path of config file required for including.
		config="$1"
		configpath=$(readlink -f "${config}" 2>/dev/null)
	else
		# default config location.
		config="$HOME/.sfeed/sfeedrc"
		configpath="${config}"
	fi

	# config is loaded here to be able to override $sfeedpath or functions.
	if [ -r "${configpath}" ] && [ -f "${configpath}" ]; then
		. "${configpath}"
	else
		printf "Configuration file \"%s\" cannot be read.\n" "${config}" >&2
		echo "See the sfeedrc.example file or the sfeedrc(5) man page for an example." >&2
		die
	fi
}

# log a status line for a feed to stdout.
# log(name, s)
log() {
	printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2"
}

# log an error for a feed to stderr and flag the run as failed.
# log_error(name, s)
log_error() {
	printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
	# set error exit status indicator for parallel jobs:
	# the parent checks for the absence of this file after xargs finishes.
	rm -f "${sfeedtmpdir}/ok"
}

# fetch a feed via HTTP/HTTPS etc.
# fetch(name, url, feedfile)
fetch() {
	# fail on redirects, hide User-Agent, timeout is 15 seconds.
	curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
		"$2" 2>/dev/null
}

# convert encoding from one encoding to another.
# convertencoding(name, from, to)
convertencoding() {
	if [ "$2" != "" ] && [ "$3" != "" ] && [ "$2" != "$3" ]; then
		iconv -cs -f "$2" -t "$3" 2> /dev/null
	else
		# else no convert, just output.
		cat
	fi
}

# parse and convert input, by default XML to the sfeed(5) TSV format.
# parse(name, feedurl, basesiteurl)
parse() {
	sfeed "$3"
}

# filter fields.
# filter(name, url)
filter() {
	cat
}

# merge raw files: unique sort by id, title, link.
# sfeed(5) data is TAB-separated: the separator must be a literal tab byte,
# written via printf so it is visible and survives whitespace mangling.
# merge(name, oldfile, newfile)
merge() {
	sort -t "$(printf '\t')" -u -k6,6 -k2,2 -k3,3 "$2" "$3" 2>/dev/null
}

# order by timestamp (descending). field 1 of sfeed(5) TSV is the UNIX timestamp.
# order(name, url)
order() {
	sort -t "$(printf '\t')" -k1rn,1 2>/dev/null
}

# internal handler to fetch and process a feed through the pipeline stages:
# fetch -> convertencoding -> parse -> filter -> merge -> order -> copy.
# each stage writes a temporary file and removes its input on success.
# _feed(name, feedurl, [basesiteurl], [encoding])
_feed() {
	name="$1"
	feedurl="$2"
	basesiteurl="$3"
	encoding="$4"

	# sanitize the feed name for use as a filename.
	filename="$(printf '%s' "${name}" | tr '/' '_')"
	sfeedfile="${sfeedpath}/${filename}"
	tmpfeedfile="${sfeedtmpdir}/feeds/${filename}"

	# if file does not exist yet create it.
	[ -e "${sfeedfile}" ] || touch "${sfeedfile}" 2>/dev/null

	if ! fetch "${name}" "${feedurl}" "${sfeedfile}" > "${tmpfeedfile}.fetch"; then
		log_error "${name}" "FAIL (FETCH)"
		return 1
	fi

	# try to detect encoding (if not specified). if detecting the encoding fails assume utf-8.
	[ "${encoding}" = "" ] && encoding=$(sfeed_xmlenc < "${tmpfeedfile}.fetch")

	if ! convertencoding "${name}" "${encoding}" "utf-8" < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.utf8"; then
		log_error "${name}" "FAIL (ENCODING)"
		return 1
	fi
	rm -f "${tmpfeedfile}.fetch"

	# if baseurl is empty then use feedurl.
	if ! parse "${name}" "${feedurl}" "${basesiteurl:-${feedurl}}" < "${tmpfeedfile}.utf8" > "${tmpfeedfile}.tsv"; then
		log_error "${name}" "FAIL (PARSE)"
		return 1
	fi
	rm -f "${tmpfeedfile}.utf8"

	if ! filter "${name}" "${feedurl}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then
		log_error "${name}" "FAIL (FILTER)"
		return 1
	fi
	rm -f "${tmpfeedfile}.tsv"

	# new feed data is empty: no need for below stages.
	if [ ! -s "${tmpfeedfile}.filter" ]; then
		log "${name}" "OK"
		return 0
	fi

	if ! merge "${name}" "${sfeedfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge"; then
		log_error "${name}" "FAIL (MERGE)"
		return 1
	fi
	rm -f "${tmpfeedfile}.filter"

	if ! order "${name}" "${feedurl}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then
		log_error "${name}" "FAIL (ORDER)"
		return 1
	fi
	rm -f "${tmpfeedfile}.merge"

	# copy the fully processed result over the stored feed file.
	if ! cp "${tmpfeedfile}.order" "${sfeedfile}"; then
		log_error "${name}" "FAIL (COPY)"
		return 1
	fi
	rm -f "${tmpfeedfile}.order"

	# OK
	log "${name}" "OK"
	return 0
}

# fetch and process a feed in parallel.
# feed(name, feedurl, [basesiteurl], [encoding])
feed() {
	# Output job parameters for xargs.
	# Specify fields as a single parameter separated by a NUL byte.
	# The parameter is split into fields later by the child process, this
	# allows using xargs with empty fields across many implementations.
	printf '%s\037%s\037%s\037%s\037%s\037%s\0' \
		"${config}" "${sfeedtmpdir}" "$1" "$2" "$3" "$4"
}

# cleanup()
cleanup() {
	# remove temporary directory with feed files.
	rm -rf "${sfeedtmpdir}"
}

# die(statuscode)
die() {
	statuscode="${1:-1}" # default: exit 1
	# cleanup temporary files etc.
	cleanup
	exit "${statuscode}"
}

# sighandler(signo)
sighandler() {
	signo="$1"
	# ignore TERM signal for myself.
	trap -- "" TERM
	# kill all running children >:D
	kill -TERM -$$
}

# default feeds() that errors out; a valid sfeedrc must override it.
# feeds()
feeds() {
	printf "Configuration file \"%s\" is invalid or does not contain a \"feeds\" function.\n" "${config}" >&2
	echo "See sfeedrc.example for an example." >&2
	die
}

# main(args...)
main() {
	# signal number received for parent.
	signo=0
	# SIGINT: signal to interrupt parent.
	trap -- "sighandler 2" "INT"
	# SIGTERM: signal to terminate parent.
	trap -- "sighandler 15" "TERM"
	# load config file.
	loadconfig "$1"
	# fetch feeds and store in temporary directory.
	sfeedtmpdir="$(mktemp -d "${TMPDIR:-/tmp}/sfeed_XXXXXX")" || die
	mkdir -p "${sfeedtmpdir}/feeds"
	# "ok" marker: removed by log_error in any child to signal failure.
	touch "${sfeedtmpdir}/ok" || die
	# make sure path exists.
	mkdir -p "${sfeedpath}"

	# print feeds for parallel processing with xargs.
	feeds > "${sfeedtmpdir}/jobs" || die
	SFEED_UPDATE_CHILD="1" xargs -x -0 -P "${maxjobs}" -n 1 \
		"$(readlink -f "${argv0}")" < "${sfeedtmpdir}/jobs"
	statuscode=$?

	# check error exit status indicator for parallel jobs.
	[ -f "${sfeedtmpdir}/ok" ] || statuscode=1
	# on signal SIGINT and SIGTERM exit with signal number + 128.
	[ ${signo} -ne 0 ] && die $((signo+128))
	die ${statuscode}
}

# process a single feed.
# parameters are: config, tmpdir, name, feedurl, basesiteurl, encoding
if [ "${SFEED_UPDATE_CHILD}" = "1" ]; then
	# split the single xargs argument on the \037 (unit separator) bytes
	# written by feed(); printf makes the otherwise invisible byte explicit.
	IFS="$(printf '\037')"
	[ "$1" = "" ] && exit 0 # must have an argument set
	printf '%s\n' "$1" | \
	while read -r _config _tmpdir _name _feedurl _basesiteurl _encoding; do
		loadconfig "${_config}"
		sfeedtmpdir="${_tmpdir}"
		_feed "${_name}" "${_feedurl}" "${_basesiteurl}" "${_encoding}"
		exit "$?"
	done
	exit 0
fi

# ...else parent mode:
argv0="$0" # store $0, in the zsh shell $0 is the name of the function.
[ "${SFEED_UPDATE_INCLUDE}" = "1" ] || main "$@"