webshot.sh (1127B)
#!/bin/bash
#
# webshot.sh - archive a snapshot of a web page.
#
# Usage: webshot.sh URL [TITLE]
#
# Fetches URL and bundles the following into one gzip'd tarball:
#   url.txt               the URL as given
#   headers.txt           response headers (curl -I)
#   contents.txt          response body
#   contents.txt.sha256   sha256sum output for contents.txt
#   title.txt             the title, when one is known
#   <stamp>_<sha256>.pdf  headless-chromium rendering of the page
#
# The tarball is written to
#   $WEBSHOT_OUTPUT_DIR/<title>/<stamp>_<sha256>.tar.gz
# (directly into $WEBSHOT_OUTPUT_DIR when the title is empty), where <stamp>
# is the UTC time as YYYYmmddHHMM and <sha256> is the digest of contents.txt.
#
# Environment:
#   WEBSHOT_OUTPUT_DIR    base output directory (default: /tmp)
#   WEBSHOT_TITLE_PARSER  script that takes contents.txt as input and outputs
#                         a single utf8 string; only consulted when TITLE is
#                         not given. It is invoked from inside the temporary
#                         work directory.
#
# Exit status: 0 on success, 1 when a critical step fails, 2 on usage error.
# Fetching and rendering are best-effort: a failure there is reported on
# stderr and the archive is still produced from whatever was collected.

# possible regex for title
#   grep -e "<title>" | sed -e "s/^.*<title>\([^<]*\)<\/title>/\\1/g"
# should also convert xml entities, eg. &#8211; -> \u2013 (int -> hex) and render

set -u

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if (( $# < 1 )) || [[ -z "$1" ]]; then
  printf 'usage: %s URL [TITLE]\n' "${0##*/}" >&2
  exit 2
fi

url=$1
title=${2:-}
out_dir=${WEBSHOT_OUTPUT_DIR:-/tmp}
title_parser=${WEBSHOT_TITLE_PARSER:-}

# All work happens inside a temporary directory, so a relative output
# directory has to be anchored to the invocation directory first.
if [[ "$out_dir" != /* ]]; then
  out_dir="$PWD/$out_dir"
fi
printf 'using outdir %s\n' "$out_dir" >&2

# prepare
stamp=$(TZ=UTC date +%Y%m%d%H%M)
work_dir=$(mktemp -d) || die "mktemp failed"

# Remove the work directory on every exit path. Step out of it first so the
# shell is not sitting inside the directory being deleted.
cleanup() {
  cd / && rm -rf -- "${work_dir:?}"
}
trap cleanup EXIT

cd -- "$work_dir" || die "cannot enter $work_dir"

# store raw outputs
printf '%s\n' "$url" > url.txt
curl -s -I "$url" > headers.txt \
  || printf 'warning: could not fetch headers for %s\n' "$url" >&2
curl -s "$url" > contents.txt \
  || printf 'warning: could not fetch contents of %s\n' "$url" >&2
sha256sum contents.txt > contents.txt.sha256 || die "sha256sum failed"

# determine title to use and store it, too
if [[ -z "$title" && -n "$title_parser" ]]; then
  # Deliberately unquoted: the parser setting may be a command with arguments.
  # shellcheck disable=SC2086
  title=$($title_parser contents.txt)
fi

if [[ -n "$title" ]]; then
  printf '%s\n' "$title" > title.txt
  printf 'using title %s\n' "$title" >&2
else
  printf 'empty title!\n' >&2
fi

# The title doubles as a directory name below. Neutralise anything that would
# make it address more than one path component or leave the output directory.
title_dir=${title//\//_}
title_dir=${title_dir//$'\n'/ }
case "$title_dir" in
  . | ..) title_dir="_$title_dir" ;;
esac

# rendered snapshot
read -r digest _ < contents.txt.sha256 || die "cannot read contents.txt.sha256"
name=${stamp}_${digest}

# Without an explicit path chromium writes output.pdf to the current directory.
chromium --headless --print-to-pdf "$url"
if [[ -f output.pdf ]]; then
  mv -- output.pdf "$name.pdf"
else
  printf 'warning: chromium produced no output.pdf for %s\n' "$url" >&2
fi

# store result
dest_dir=$out_dir${title_dir:+/$title_dir}
mkdir -p -- "$dest_dir" || die "cannot create $dest_dir"
tar -zcvf "$dest_dir/$name.tar.gz" -- * \
  || die "cannot write $dest_dir/$name.tar.gz"