webshot

Snapshot and fingerprint a web resource
git clone git://git.defalsify.org/webshot.git
Log | Files | Refs

commit 16af102a09be1d0efeb1725dda153d0c7a671d05
Author: lash <dev@holbrook.no>
Date:   Tue, 11 Jan 2022 13:09:07 +0000

Initial commit

Diffstat:
Awebshot.sh | 54++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 54 insertions(+), 0 deletions(-)

diff --git a/webshot.sh b/webshot.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# possible regex for title +# grep -e "<title>" | sed -e "s/^.*<title>\([^<]*\)<\/title>/\\1/g +# should also convert xml entities, eg. &#8211 -> \u2013 (int -> hex) and render + +f=${WEBSHOT_OUTPUT_DIR:-/tmp} +title_parser=${WEBSHOT_TITLE_PARSER} # script that takes contents.txt as input and outputs a single utf8 string +title=$2 +>&2 echo using outdir $f + +set +e + +# prepare +d=`TZ=UTC date +%Y%m%d%H%M` +t=`mktemp -d` +pushd $t + +# store raw outputs +echo $1 > url.txt +curl -s -I $1 > headers.txt +curl -s -X GET $1 > contents.txt +sha256sum contents.txt > contents.txt.sha256 + +# determine title to use and store it, too +#TODO insert title name protection for mkdir +if [ -z "$title" ]; then + if [ ! -z "$title_parser" ]; then + title=`$title_parser contents.txt` + fi +fi + +if [ ! -z "$title" ]; then + echo $title > title.txt + >&2 echo using title $title +else + >&2 echo empty title! +fi + + +# rendered snapshot +h=`cat contents.txt.sha256 | awk '{ print $1; }'` +chromium --headless --print-to-pdf $1 +n=${d}_${h} +mv output.pdf $n.pdf + +# store result +mkdir -p "$f/$title" +tar -zcvf "$f/$title/$n.tar.gz" * + +# clean up +popd + +set -e