#!/bin/bash
# cens_dns_check1.sh: A simple script for comparing the answers of two DNS
# servers for the possibly censored domains listed in the file $1.
# Useful for recovering a large part of the censorship blacklist of a DNS server.
# TODO: IPV6 support, DNSSEC support.
# Version 0.99, 2010-05-22
# ----------------------------------------------------------------------------
# "THE BEERWARE LICENSE" (Revision 44):
# Dr. Rolf Freitag (rolf dot freitag at email dot de) wrote this file.
# As long as you retain this notice you can do whatever
# the GPL (GNU Public License version 3) allows with this stuff.
# If you think this stuff is worth it, you can send me money via
# paypal, and get a contribution receipt if you wish, or if we met some day
# you can buy me a beer in return.
# ----------------------------------------------------------------------------

# Exit when a variable isn't set. Add option -x for verbosity (tracing/debug
# mode) and/or -e for exit when any statement returns a non-true return value.
set -u

# Exactly one argument is required: the file with the list of domains.
# Diagnostics go to stderr. The exit status is 255, which is what Bash makes
# of the former "exit -1"; the explicit value also works in shells that
# reject a negative exit status.
if [ "$#" -ne 1 ]; then
  echo "Error: Not one parameter (the input file); exiting!" >&2
  exit 255
fi

# Stop before anything is deleted or created if the list cannot be read.
if [ ! -r "$1" ]; then
  echo "Error: Cannot read the input file '$1'; exiting!" >&2
  exit 255
fi

# Temporary file name (for the shuffled domains). It is created in the start
# directory and later read as ../"$TMPFILE" from inside the working directory,
# so it must be a plain file name: only the last path component of $1 is used,
# which keeps a list given as e.g. lists/foo.txt from putting a "/" into it.
TMPFILE="tmpfile_dns.${1##*/}"

# The first DNS server should be an uncensored DNS server, e.g. the well-known
# 8.8.8.8 from Google or one from
# http://www.ccc.de/censorship/dns-howto/?language=de
# or
# http://server.privacyfoundation.de/
# or
# http://www.foebud.org/aboutus/gegen-internetsperren-in-einer-freien-gesellschaft-foebud-richtet-anti-zensur-dns-server-ein/
# It's also a good idea to use a server which is NOT at the standard port (53),
# e. g. 62.141.58.13, 85.25.141.60, 85.25.251.254, 94.75.228.29 or 
# 209.59.210.167 at 110.
# But you should check BOTH servers if they work, before you start the script,
# e. g. with
# dig @209.59.210.167 -p 110 google.com +short
# Reference (uncensored) DNS server and its port. Each value can be overridden
# from the environment, e.g.
#   DNS_SERVER0=8.8.8.8 DNS_SERVER0_PORT=53 ./cens_dns_check1.sh list.txt
# and falls back to the built-in default when the variable is unset or empty.
DNS_SERVER0="${DNS_SERVER0:-85.25.251.254}"
DNS_SERVER0_PORT="${DNS_SERVER0_PORT:-110}"

# the maybe censored DNS server (usually a DNS of your provider);
# overridable from the environment in the same way
DNS_SERVER1="${DNS_SERVER1:-145.253.2.11}"
DNS_SERVER1_PORT="${DNS_SERVER1_PORT:-53}"

# Working directory; must be a simple subdirectory of the start directory.
# Whatever an earlier run left there is thrown away first.
WORKDIR='dns'
rm -rf "$WORKDIR"

# (Main part of the) pause between two DNS queries: 0.1 s is ok for a fast
# PC and server (and connection), 0.4 s for a slow PC or server.
SLEEP_TIME='0.4'

# Every DNS server gets its own subdirectory of the working directory,
# named after the server.
DNS0WORKDIR=$DNS_SERVER0
DNS1WORKDIR=$DNS_SERVER1
mkdir -p "$WORKDIR/$DNS0WORKDIR" "$WORKDIR/$DNS1WORKDIR"

# Unsort (shuffle) the domain list, see http://wooledge.org:8000/BashFAQ/026,
# but without a useless use of cat.
#
# shuffle_lines FILE
#   Writes the entries of FILE to stdout in random order: every entry gets a
#   random numeric key, the lines are sorted by that key and the key is cut
#   off again.
#   - A last line without a trailing newline is kept.
#   - Blank lines are skipped; an empty domain has no usable answer file name.
#   - Repeated entries are emitted once. The answer files are named after the
#     domain, so a repeated entry would find its files already deleted by the
#     first filter pass and be reported as a difference.
shuffle_lines() {
  local entry
  while read -r entry || [ -n "$entry" ]; do
    [ -n "$entry" ] || continue
    printf '0%s\t%s\n' "$RANDOM" "$entry"
  done < "$1" | sort -n -r | cut -f2- | awk '!seen[$0]++'
}
shuffle_lines "$1" > "$TMPFILE"

# DNS lookup for the maybe censored URLs to a provider DNS server and an uncensored DNS server.
# One small file for every lookup, named after the domain, in the directory of
# the queried server. One minute timeout per try (dig default: 5 s).
# Everything below uses paths relative to the working directory, so stop if it
# cannot be entered instead of writing files somewhere else.
cd "$WORKDIR" || {
  echo "Error: Cannot change into the working directory '$WORKDIR'; exiting!" >&2
  exit 1
}
# Plain "read" (without -r) is kept on purpose so that this loop parses the
# list exactly like the other loops over the same file.
while read line; do
  dig @"$DNS_SERVER0" -p "$DNS_SERVER0_PORT" +time=60 -4 +short +noidentify "$line" > "$DNS0WORKDIR"/"$line" &
  # Add a little random jitter: the digits of (RANDOM+1)/3 are appended to
  # "0.00", which gives a pause between 0 and just under 0.01 s.
  VAR=0.00$(((RANDOM + 1) / 3))
  sleep "$VAR"
  dig @"$DNS_SERVER1" -p "$DNS_SERVER1_PORT" +time=60 -4 +short +noidentify "$line" > "$DNS1WORKDIR"/"$line" &
  sleep "$SLEEP_TIME"
done < ../"$TMPFILE"

# Wait until every background lookup has exited. A fixed pause is not enough:
# dig retries a query that timed out, so a single lookup can run longer than
# one 60 s timeout, and its answer file would still be incomplete.
wait

# Scratch file (inside the working directory) shared by the steps below.
TMPFILE_BLACKLISTING='tmpfile_blacklist.txt'

# Normalise the dig outputs: for every domain, the answer file of each server
# is sorted into the scratch file and written back without repeated lines.
while read domain; do
  for answers in "$DNS0WORKDIR/$domain" "$DNS1WORKDIR/$domain"; do
    sort "$answers" > "$TMPFILE_BLACKLISTING"
    uniq "$TMPFILE_BLACKLISTING" > "$answers"
  done
done < "../$TMPFILE"

# Make a backup for rDNS via searching the files and other stuff.
# This (nearly) doubles the disk usage.
# Both directories are copied in parallel, but the script waits for the two
# copies before it goes on: the filter steps that follow delete and rewrite
# files inside these directories, so a copy still running in the background
# could miss answers or pick up already truncated ones.
mkdir backup
cp -a "$DNS0WORKDIR" backup/ &
backup_pid0=$!
cp -a "$DNS1WORKDIR" backup/ &
backup_pid1=$!
wait "$backup_pid0" "$backup_pid1"

# Collect all answers in one big logfile per DNS server, named after the
# server. The answers are appended domain by domain, in the order of the list
# in ../"$TMPFILE"; the logfiles are only ever appended to, never truncated.
while read domain; do
  cat "$DNS0WORKDIR/$domain" >> "${DNS_SERVER0}.txt"
  cat "$DNS1WORKDIR/$domain" >> "${DNS_SERVER1}.txt"
done < "../$TMPFILE"

# First filter stage: a domain whose two answer files are equal is dropped and
# its answer files are deleted (in the background); every other domain is
# collected in "$BLACKLIST0".
BLACKLIST0='minimal_filtered_blacklist.txt'
: > "$BLACKLIST0"
while read domain; do
  # diff: assume text files, ignore whitespace diffs, ignore empty lines.
  # Any non-zero status of diff puts the domain on the list.
  if diff -a -b -B "$DNS0WORKDIR/$domain" "$DNS1WORKDIR/$domain" > /dev/null 2>&1; then
    rm -- "$DNS0WORKDIR/$domain" &
    rm -- "$DNS1WORKDIR/$domain" &
  else
    echo "$domain" >> "$BLACKLIST0"
  fi
done < "../$TMPFILE"

# The next stage only has to look at what is left: the still unsorted
# blacklist replaces the shuffled domain list.
cp -f "$BLACKLIST0" "../$TMPFILE"

# Sort the blacklist and drop repeated entries, via the scratch file.
sort "$BLACKLIST0" > "$TMPFILE_BLACKLISTING"
uniq "$TMPFILE_BLACKLISTING" > "$BLACKLIST0"

# Filter similar IPs by comparing only the first two Bytes, write results to "$BLACKLIST1".
#
# keep_leading_two_bytes FILE
#   Rewrites FILE in place: every line loses everything from its
#   second-to-last dot on (1.2.3.4 becomes 1.2; lines with fewer than two dots
#   stay as they are). The result is then sorted with repeated lines removed.
#   Sorting again matters: after the cut, equal lines need not be neighbours
#   any more, and the later diff must see the same set in the same order for
#   both servers.
#   Uses "$TMPFILE_BLACKLISTING" as scratch file.
keep_leading_two_bytes() {
  sed 's/\.[^.]*\.[^.]*$//' "$1" > "$TMPFILE_BLACKLISTING"
  sort -u "$TMPFILE_BLACKLISTING" > "$1"
}
# Plain "read" (without -r) is kept on purpose so that this loop parses the
# list exactly like the other loops over the same file.
while read line; do
  keep_leading_two_bytes "$DNS0WORKDIR/$line"
  keep_leading_two_bytes "$DNS1WORKDIR/$line"
done < ../"$TMPFILE"

# Remove the no connection messages ";; connection timed out; no servers could be reached".
#
# blank_timeout_answers DIR...
#   Empties every regular file below the given directories that is 53 bytes
#   long (the message is 52 characters plus a newline) and contains the
#   message. The files are kept, only their content is removed.
#   - Nothing is run when no file matches, so there is no "cp" call without a
#     destination any more.
#   - File names are passed by find itself, so whitespace in them is harmless.
#   - Returns 1 without doing anything if an argument is "/": that is what
#     "$VAR"/ turns into when VAR is empty, and the file system root must
#     never be walked.
blank_timeout_answers() {
  local dir
  for dir in "$@"; do
    [ "$dir" != "/" ] || return 1
  done
  find "$@" -type f -size 53c -exec sh -c '
    for answer_file in "$@"; do
      if grep -qF -- ";; connection timed out; no servers could be reached" "$answer_file"; then
        : > "$answer_file"
      fi
    done' sh {} +
}
blank_timeout_answers "$DNS0WORKDIR"/ "$DNS1WORKDIR"/

# Second filter stage: a domain whose two answer files are equal is dropped
# and its answer files are deleted (in the background); every other domain is
# collected in "$BLACKLIST1".
BLACKLIST1='medium_filtered_blacklist.txt'
: > "$BLACKLIST1"
while read domain; do
  # diff: assume text files, ignore whitespace diffs, ignore empty lines.
  # Any non-zero status of diff puts the domain on the list.
  if diff -a -b -B "$DNS0WORKDIR/$domain" "$DNS1WORKDIR/$domain" > /dev/null 2>&1; then
    rm -- "$DNS0WORKDIR/$domain" &
    rm -- "$DNS1WORKDIR/$domain" &
  else
    echo "$domain" >> "$BLACKLIST1"
  fi
done < "../$TMPFILE"

# The next stage only has to look at what is left: the still unsorted
# blacklist replaces the domain list.
cp -f "$BLACKLIST1" "../$TMPFILE"

# Sort the blacklist and drop repeated entries, via the scratch file.
sort "$BLACKLIST1" > "$TMPFILE_BLACKLISTING"
uniq "$TMPFILE_BLACKLISTING" > "$BLACKLIST1"


# Filter otherwise similar DNS server answers, write results to "$BLACKLIST2"


# TODO: If canonical names exist, compare them instead of the IPs.
# If one of the canonical names is returned by both servers, remove the
# domain from the blacklist.


# TODO: Get the default IP answer from the uncensored DNS server and filter
# these entries. (A lookup for invalid domains like invalid.test does not work;
# a guaranteed valid but non-existent domain name is needed.)


# Compare only the first Byte of the IPs, write results to "$BLACKLIST2".
#
# keep_leading_byte FILE
#   Rewrites FILE in place: every line loses everything from its last dot on
#   (1.2 becomes 1; lines without a dot stay as they are). The result is then
#   sorted with repeated lines removed. Sorting again matters: after the cut,
#   equal lines need not be neighbours any more, and the later diff must see
#   the same set in the same order for both servers.
#   Uses "$TMPFILE_BLACKLISTING" as scratch file.
#   NOTE(review): a single byte is only what remains if the entries were
#   already reduced to two bytes before; confirm against the preceding step.
keep_leading_byte() {
  sed 's/\.[^.]*$//' "$1" > "$TMPFILE_BLACKLISTING"
  sort -u "$TMPFILE_BLACKLISTING" > "$1"
}
# Plain "read" (without -r) is kept on purpose so that this loop parses the
# list exactly like the other loops over the same file.
while read line; do
  keep_leading_byte "$DNS0WORKDIR/$line"
  keep_leading_byte "$DNS1WORKDIR/$line"
done < ../"$TMPFILE"

# Third filter stage: a domain whose two answer files are equal is dropped and
# its answer files are deleted (in the background); every other domain is
# collected in "$BLACKLIST2".
BLACKLIST2='good_filtered_blacklist.txt'
: > "$BLACKLIST2"
while read domain; do
  # diff: assume text files, ignore whitespace diffs, ignore empty lines.
  # Any non-zero status of diff puts the domain on the list.
  if diff -a -b -B "$DNS0WORKDIR/$domain" "$DNS1WORKDIR/$domain" > /dev/null 2>&1; then
    rm -- "$DNS0WORKDIR/$domain" &
    rm -- "$DNS1WORKDIR/$domain" &
  else
    echo "$domain" >> "$BLACKLIST2"
  fi
done < "../$TMPFILE"

# Sort the blacklist and drop repeated entries, via the scratch file.
sort "$BLACKLIST2" > "$TMPFILE_BLACKLISTING"
uniq "$TMPFILE_BLACKLISTING" > "$BLACKLIST2"

# Make clean: remove the two temporary files, flush and leave.
# The statement order matters because both names are relative paths.
# The scratch file is removed while still in the current directory ...
rm -- "$TMPFILE_BLACKLISTING" &
# ... and the domain list, which the loops above read as ../"$TMPFILE", is
# removed after stepping one directory up.
cd ..
rm -- "$TMPFILE" &

# All three commands are started in the background; the script does not wait
# for them before it exits.
sync &

# Always report success, whatever happened above.
exit 0

