sfeed

Simple RSS and Atom feed parser
git clone https://git.sinitax.com/codemadness/sfeed
Log | Files | Refs | README | LICENSE | Upstream | sfeed.txt

commit 22af8f5c3a9f79f28cf2c56d9b244804a70ddcc7
parent 57a90ba638f38fa589119dd6dc6a23482c58bfda
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Sun, 14 Apr 2019 15:00:19 +0200

sfeed_update improvements

- Better checking and verbose logging (on failure) of each stage:
  fetchfeed, filter, merge, order, convertencoding. This makes sure on out-of-memory,
  disk-space or other resource limits the output is not corrupted.
  - This also has the added advantage it runs less processes (piped) at the same
    time.
  - Clear previous unneeded file to preserve space in /tmp
    (/tmp is often mounted as mfs/tmpfs).
- Add logging function (able to override), use more logical logging format (pun
  intended).
- Code-style: order overridable functions in execution order.

Diffstat:
Msfeed_update | 123+++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
1 file changed, 80 insertions(+), 43 deletions(-)

diff --git a/sfeed_update b/sfeed_update @@ -31,6 +31,20 @@ loadconfig() { fi } +# log(name,s) +log() { + printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2 +} + +# fetch a feed via HTTP/HTTPS etc. +# fetchfeed(name, url, feedfile) +fetchfeed() { + # fail on redirects, hide User-Agent, timeout is 15 seconds, + # -z for If-Modified-Since. + curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \ + -z "$3" "$2" 2>/dev/null +} + # convert encoding from one encoding to another. # convertencoding(from, to) convertencoding() { @@ -42,12 +56,6 @@ convertencoding() { fi } -# merge raw files: unique sort by id, title, link. -# merge(name, oldfile, newfile) -merge() { - sort -t ' ' -u -k6,6 -k2,2 -k3,3 "$2" "$3" 2>/dev/null -} - # filter fields. # filter(name) filter() { @@ -60,15 +68,10 @@ order() { sort -t ' ' -k1rn,1 } -# fetch a feed via HTTP/HTTPS etc. -# fetchfeed(name, url, feedfile) -fetchfeed() { - if curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \ - -z "$3" "$2" 2>/dev/null; then - printf " OK %s %s\n" "$(date +'%H:%M:%S')" "$1" >&2 - else - printf "FAIL %s %s\n" "$(date +'%H:%M:%S')" "$1" >&2 - fi +# merge raw files: unique sort by id, title, link. +# merge(name, oldfile, newfile) +merge() { + sort -t ' ' -u -k6,6 -k2,2 -k3,3 "$2" "$3" 2>/dev/null } # fetch and parse feed. @@ -85,37 +88,71 @@ feed() { filename="$(printf '%s' "$1" | tr '/' '_')" feedurl="$2" basesiteurl="$3" - tmpfeedfile="${sfeedtmpdir}/${filename}" - tmpencfile="" encoding="$4" + sfeedfile="${sfeedpath}/${filename}" + tmpfeedfile="${sfeedtmpdir}/${filename}" + + if ! fetchfeed "${name}" "${feedurl}" "${sfeedfile}" > "${tmpfeedfile}.fetch"; then + log "${name}" "FAIL (FETCH)" + return + fi + + # try to detect encoding (if not specified). if detecting the encoding fails assume utf-8. + [ "${encoding}" = "" ] && encoding=$(sfeed_xmlenc < "${tmpfeedfile}.fetch") + + if ! 
convertencoding "${encoding}" "utf-8" < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.utf8"; then + log "${name}" "FAIL (ENCODING)" + return + fi + rm -f "${tmpfeedfile}.fetch" + + if ! sfeed "${basesiteurl}" < "${tmpfeedfile}.utf8" > "${tmpfeedfile}.tsv"; then + log "${name}" "FAIL (CONVERT)" + return + fi + rm -f "${tmpfeedfile}.enc" + + if ! filter "${name}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then + log "${name}" "FAIL (FILTER)" + return + fi + rm -f "${tmpfeedfile}.tsv" + + # new feed data is empty: no need for below stages. + if [ ! -s "${tmpfeedfile}.filter" ]; then + log "${name}" "OK" + return + fi + + # if file does not exist yet "merge" with /dev/null. + if [ -e "${sfeedfile}" ]; then + oldfile="${sfeedfile}" + else + oldfile="/dev/null" + fi + + if ! merge "${name}" "${oldfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge"; then + log "${name}" "FAIL (MERGE)" + return + fi + rm -f "${tmpfeedfile}.filter" + + if ! order "${name}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then + log "${name}" "FAIL (ORDER)" + return + fi + rm -f "${tmpfeedfile}.merge" + + # atomic move. + if ! mv "${tmpfeedfile}.order" "${sfeedfile}"; then + log "${name}" "FAIL (MOVE)" + return + fi - if [ "${encoding}" != "" ]; then - fetchfeed "${name}" "${feedurl}" "${sfeedfile}" | \ - convertencoding "${encoding}" "utf-8" - else # detect encoding. - tmpencfile="${tmpfeedfile}.enc" - fetchfeed "${name}" "${feedurl}" "${sfeedfile}" > "${tmpencfile}" - detectenc=$(sfeed_xmlenc < "${tmpencfile}") - convertencoding "${detectenc}" "utf-8" < "${tmpencfile}" - fi | sfeed "${basesiteurl}" | filter "${name}" > "${tmpfeedfile}" - - # get new data and merge with old. - sfeedfilenew="${sfeedpath}/${filename}.new" - # new feed data is non-empty. 
- if [ -s "${tmpfeedfile}" ]; then - # if file exists, merge - if [ -e "${sfeedfile}" ]; then - merge "${name}" "${sfeedfile}" "${tmpfeedfile}" | \ - order "${name}" > "${sfeedfilenew}" - - # overwrite old file with updated file - mv "${sfeedfilenew}" "${sfeedfile}" - else - merge "${name}" "/dev/null" "${tmpfeedfile}" | \ - order "${name}" > "${sfeedfile}" - fi - fi) & + # OK + log "${name}" "OK" + ) & } cleanup() {