webshot

Snapshot and fingerprint a web resource
git clone git://git.defalsify.org/webshot.git
Info | Log | Files | Refs

webshot.sh (1127B)


      1 #!/bin/bash
      2 
      3 # possible regex for title 
      4 # grep -e "<title>" | sed -e "s/^.*<title>\([^<]*\)<\/title>/\\1/g
      5 # should also convert xml entities, eg. &#8211 -> \u2013 (int -> hex) and render
      6 
      7 f=${WEBSHOT_OUTPUT_DIR:-/tmp}
      8 title_parser=${WEBSHOT_TITLE_PARSER} # script that takes contents.txt as input and outputs a single utf8 string
      9 title=$2
     10 >&2 echo using outdir $f
     11 
     12 set +e
     13 
     14 # prepare 
     15 d=`TZ=UTC date +%Y%m%d%H%M`
     16 t=`mktemp -d`
     17 pushd $t
     18 
     19 # store raw outputs
     20 echo $1 > url.txt
     21 curl -s -I $1 > headers.txt
     22 curl -s -X GET $1 > contents.txt
     23 sha256sum contents.txt > contents.txt.sha256
     24 
     25 # determine title to use and store it, too
     26 #TODO insert title name protection for mkdir
     27 if [ -z "$title" ]; then
     28 	if [ ! -z "$title_parser" ]; then
     29 		title=`$title_parser contents.txt`
     30 	fi
     31 fi
     32 
     33 if [ ! -z "$title" ]; then
     34 	echo $title > title.txt
     35 	>&2 echo using title $title
     36 else
     37 	>&2 echo empty title!
     38 fi
     39 
     40 
     41 # rendered snapshot
     42 h=`cat contents.txt.sha256 | awk '{ print $1; }'`
     43 chromium --headless --print-to-pdf $1
     44 n=${d}_${h}
     45 mv output.pdf $n.pdf
     46 
     47 # store result
     48 mkdir -p "$f/$title"
     49 tar -zcvf "$f/$title/$n.tar.gz" *
     50 
     51 # clean up
     52 popd
     53 
     54 set -e