#!/bin/bash
# cens_dl1.sh: A simple script for downloading some urls, listed in a file ($1),
# with immediate deletion after the download.
# Example: ./cens_dl1.sh list1.txt
# Useful to load pages into a proxy, to test censorship, to fake site hits, etc.
# Without a proxy, in a country with strong censorship, it can be used as a
# replacement for an emergency call.
# Todo: Integration of IP spoofing, e. g. for an emergency call to someone else.
# Version 0.5, 2009-05-19
# ----------------------------------------------------------------------------
# "THE BEERWARE LICENSE" (Revision 44):
# Dr. Rolf Freitag (rolf dot freitag at email dot de) wrote this file.
# As long as you retain this notice you can do whatever
# the GPL (GNU Public License version 3) allows with this stuff.
# If you think this stuff is worth it, you can send me money via
# paypal, and get a contribution receipt if you wish, or if we met some day
# you can buy me a beer in return ;-)
# ----------------------------------------------------------------------------

# uncomment the next line for debugging or simply see the script working
#set -x

# Script body: validates the single argument (a file with one URL per line),
# shuffles the URLs into a temporary file, fetches each one with wget inside
# $WORKDIR, and deletes whatever was downloaded right after every fetch.

# To use or not to use Proxy (on or off)
PROXY="on"

if [ "$#" -ne 1 ]; then
  echo "Error: Not one parameter (the input file); exiting!" >&2
  # Exit statuses are limited to 0..255, so use 1 instead of -1.
  exit 1
fi

INPUT_FILE="$1"
if [ ! -r "$INPUT_FILE" ] || [ -d "$INPUT_FILE" ]; then
  echo "Error: Cannot read input file '$INPUT_FILE'; exiting!" >&2
  exit 1
fi

# define and make the working directory which must be a simple subdirectory
WORKDIR="download"
if ! mkdir -p -- "${WORKDIR:?}"; then
  echo "Error: Cannot create working directory '$WORKDIR'; exiting!" >&2
  exit 1
fi

# Set the proxies, e. g. a local squid with a parent proxy at your internet
# access provider or a local JAP or Tor proxy or simply a proxy
# at your internet access provider.
export  http_proxy="192.168.59.9:3128"
export https_proxy="192.168.59.9:3128"
export   ftp_proxy="192.168.59.9:3128"

# user agent version for wget (for faking a browser/bot)
#USER_AGENT="Mozilla/5.0 (compatible; Googlebot/2.2; +http://www.google.com/bot.html)"
#USER_AGENT="Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; .NET CLR 3.0.04506)"
USER_AGENT="Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)"
#USER_AGENT="Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 6.0; MSIECrawler)"

# Referer for wget
#REFERER="http://www.bundesregierung.de"
REFERER=""

# other wget option: Limits and other stuff.
# Kept as an array so every option reaches wget as exactly one word without
# depending on unquoted word splitting.
OTHER_WGET_OPTIONS=(
  -v -np --cache=off --delete-after -T 5 --quota=2345M --tries=2
  --restrict-file-names=unix -nd --ignore-length --no-passive-ftp
  --random-wait --wait=3 --limit-rate=1234k --retry-connrefused
)

# temporary file name: created with mktemp so that it is unique, has an
# absolute path (still reachable after the cd below) and does not depend on
# whether $1 contains directory components.
TMPFILE="$(mktemp "${TMPDIR:-/tmp}/cens_dl1.XXXXXX")" || {
  echo "Error: Cannot create temporary file; exiting!" >&2
  exit 1
}
# Remove the temporary file on every exit path, including errors.
trap 'rm -f -- "$TMPFILE"' EXIT

# unsort (shuffle), see e. g. http://packages.debian.org/de/lenny/unsort
#unsort "$INPUT_FILE" > "$TMPFILE"
# unsort the input file, see http://wooledge.org:8000/BashFAQ/026, but this version is
# without a useless use of cat and therefore needs 2 fewer processes per loop.
# Every line is prefixed with a random number, sorted numerically, and the
# prefix is cut off again.
# read -r keeps backslashes literal; the default IFS is kept on purpose so
# that leading/trailing blanks around a URL are trimmed.
# "|| [ -n "$l" ]" also processes a last line that lacks a trailing newline.
while read -r l || [ -n "$l" ]; do
  l=${l%$'\r'}                  # tolerate lists saved with DOS line endings
  [ -n "$l" ] || continue       # skip blank lines instead of fetching ""
  printf '%s\t%s\n' "$RANDOM" "$l"
done <"$INPUT_FILE" | sort -n | cut -f2- >"$TMPFILE"

# Download in random order with a read loop because the -i option of wget (1.11.4) does not work.
# Todo: A time limit for every single loop
# The cd MUST succeed: the find below removes every regular file under the
# current directory, so continuing after a failed cd would wipe the caller's
# directory instead of the download directory.
cd -- "$WORKDIR" || {
  echo "Error: Cannot change into working directory '$WORKDIR'; exiting!" >&2
  exit 1
}
while IFS= read -r line; do
  # "--" ends option parsing so a list entry starting with "-" is treated as
  # a URL and not as a wget option; stdin is detached so the child cannot
  # consume the remaining URL list.
  wget --proxy="$PROXY" --user-agent="$USER_AGENT" "${OTHER_WGET_OPTIONS[@]}" \
    --referer="$REFERER" -- "$line" </dev/null
  # Delete trash from bugs of wget
  find . -type f -exec rm -f -- {} +
done <"$TMPFILE"

printf '%s\n' "$PWD"

# go back, make clean
cd ..
rm -f -- "$TMPFILE"
# rm -rf -- "$WORKDIR"

exit 0

