#! /usr/bin/env bash
# Vim :set tabstop=2 shiftwidth=2 expandtab

# ABOUT THIS SCRIPT
# ==============================================================================
# This script lists a count for IP addresses accessing specific resources in
# Apache access logs. It optionally blocks IP addresses with too many hits.

script_name=blockbot
script_version=0.8.1

# TWEAKABLE VARIABLES
# ==============================================================================
# You can modify the behaviour of this script by changing the variables below.
# There shouldn't be anything else you need to change.

# [$directory]
# The script looks for access logs in $directory. Note that paths containing
# globs need to be in an array. The array is passed to a `find` command as
# "${directory[@]}", so every path the glob expanded to is kept as one word.
directory=(/usr/local/apache/domlogs/*/*)

# [$resources]
# An array containing the resource(s) the script should look for.
resources=(
  "wp-login.php"
  "xmlrpc.php"
)

# [$no_http_method]
# A true/false boolean that defines whether the HTTP method of a request is
# ignored. Unless this is set to true, only requests using one of the methods
# listed in $http_method are counted.
no_http_method=false

# [$http_method]
# An array containing HTTP request methods. By default the array contains
# only the value POST.
http_method=(
  "POST"
)

# [$since]
# A number of minutes. This variable is used twice: the command that finds
# files in $directory only returns files modified in the last $since minutes,
# and the log_muncher function only matches entries from the last $since
# minutes (the value is converted to seconds before it is compared).
since=15

# [$min_hits]
# The minimum number of hits that trigger an IP block.
min_hits=10

# [$min_hits_min]
# The minimum value for $min_hits. This prevents users from (accidentally)
# setting $min_hits to a very low value.
min_hits_min=5

# [$tailcount]
# An integer used to tail access logs before they are processed. This keeps
# us from processing huge logs.
tailcount=1000

# [$no_block]
# A true/false boolean that defines if the script should block IPs. If set to
# 'true' the script will only report on hits from IPs.
no_block=false

# [$outdir]
# The output directory for the script's output.
outdir=/tmp/$script_name

# [$tmpdir]
# The directory for temporary files.
tmpdir=/tmp/$script_name

# HOUSE-KEEPING
# ==============================================================================

# Get the current time as a Unix timestamp.
unixtime=$(date +%s)

# Set a variable used to add a timestamp to the output directory.
filetime=$(date -d @"$unixtime" +"%Y%m%d%H%M%S")

# It's very important to have a pretty time for the header.
prettytime=$(date -d @"$unixtime" +"%d/%m/%Y %H:%M:%S")

# Store the directory and prefix for temporary/output files. Both are derived
# from $tmpdir, so by default output and temporary files share one directory.
outlog="$tmpdir"/"$filetime"
tmplog="$tmpdir"/"$filetime"

# GENERAL FUNCTIONS
# ==============================================================================

# Join the elements of an array.
# Arguments: $1 - the separator (only its first character is used)
#            $2.. - the elements to join
# Outputs:   the joined string, followed by a newline, on stdout
join_array() {
  local IFS="$1"
  shift
  # printf rather than echo, so an element such as "-n" is printed verbatim.
  printf '%s\n' "$*"
}

# Check that the script is run by root. It should be run by root, as the
# script needs to be able to read logs and block IPs.
# Outputs:   an error message on stderr when the caller is not root
# Returns:   nothing for root; exits the script with status 1 otherwise
root_check() {
  if [ "$(id -u)" != 0 ]; then
    printf '%s\n' "Error: only root may run this script!" >&2
    exit 1
  fi
}

# This function does the bulk of the work:
# * It converts Apache timestamps to Unix timestamps.
# * It finds entries newer than $since_unixtime.
# * It matches custom HTTP methods (such as POST) and resources (such as
#   wp-login.php and xmlrpc.php).
#
# Matched entries are written to log_matches.tmp
#
# To remind my future self how this magic is performed, we're first
# splitting lines by square brackets.
# This gives us three parts:
# * $1 The IP address, identity and user fields
# * $2 The timestamp
# * $3 The rest of the line
#
# The parts are then chopped up further:
# * The IP address is the first space-separated field of $1 (stored in `ip`).
# * The timestamp needs to be converted to a format mktime can work with:
#   - Apache format: dd/MMM/YYYY:HH:MM:SS TZ
#   - mktime format: YYYY MM DD HH MM SS
#   The result is stored in `unix_time`. The TZ offset is deliberately NOT
#   passed to mktime: its optional seventh field is a daylight-saving flag,
#   not a time zone, so "+0100" would force DST on. The log time is
#   interpreted in the time zone the script runs in.
# * We need the first two fields of the request: the HTTP method and the
#   resource that was requested. $3 is first split by double quotes and the
#   quoted request is then split by spaces (`http_meth` and `resource`).
#
# An entry is printed when `unix_time` is greater than $since_unixtime and
# the resource (and, unless $no_http_method is not "false", the method)
# matches:
# * $method_regex matches HTTP methods, such as "GET|POST"
# * $resources_regex matches resources, such as "wp-login.php|xmlrpc.php"
#
# Both regexes are built beforehand and passed to Awk as variables.
#
# Globals:   since_unixtime, resources_regex, no_http_method, method_regex,
#            tmplog (all read)
# Inputs:    access log lines on stdin
# Outputs:   "$tmplog"/log_matches.tmp (overwritten) with one line per match:
#            ip <TAB> unix time <TAB> method <TAB> resource
# Requires an awk that provides mktime() (for instance gawk).
log_muncher() {
  awk \
    -v since_unixtime="$since_unixtime" \
    -v resources_regex="$resources_regex" \
    -v no_http_method="$no_http_method" \
    -v method_regex="$method_regex" \
    -F'[][]' '
    {
      # The IP address is the first field of the part before the timestamp.
      split($1, ip_fields, / /)
      ip = ip_fields[1]

      # Cut the date and time out of the timestamp.
      ts_date = substr($2, 1, 11)
      ts_time = substr($2, 13, 8)

      # Split the date; the month name (MMM) is converted to its number by
      # looking up its position in a string of all month names.
      split(ts_date, tsd, /\//)
      ts_mon = (index("JanFebMarAprMayJunJulAugSepOctNovDec", tsd[2]) + 2) / 3

      # Split the time in hours, minutes and seconds.
      split(ts_time, tst, /:/)

      # Build the mktime() datespec. No DST flag is given, so mktime works
      # out by itself whether daylight saving time applies.
      unix_time = mktime(sprintf("%04d %02d %02d %02d %02d %02d",
        tsd[3], ts_mon, tsd[1], tst[1], tst[2], tst[3]))

      # The request is the first double-quoted string after the timestamp;
      # its first two words are the method and the resource.
      split($3, quoted, /"/)
      split(quoted[2], request, / /)
      http_meth = request[1]
      resource = request[2]

      # Skip everything that is too old or does not match.
      if (unix_time <= since_unixtime) next
      if (resource !~ resources_regex) next
      if (no_http_method == "false" && http_meth !~ method_regex) next

      printf "%s\t%s\t%s\t%s\n", ip, unix_time, http_meth, resource
    }' > "$tmplog"/log_matches.tmp
}

# FUNCTIONS THAT PRINT STUFF
# ==============================================================================

# Print the help text and exit.
# Arguments: $1 - optional exit status (default 0). Callers that show the help
#                 because of an error pass a non-zero status.
# Outputs:   the help text on stdout
print_help() {
  local status="${1:-0}"

  cat << HELP
Help for $script_name $script_version
------------------------------------------------------------------------
$script_name counts IP addresses that have requested specific resources
in one or more Apache access logs. It returns the number of hits per IP,
and the script can block the IP addresses.

If run without any arguments the script will use sensible defaults but
there are a number of arguments you can change on the fly:

  --directory=[dir]
    A directory containing logs that should be checked.

  --resources=["file1, file2"]
    The resources that should be checked for hits.

  --http-method=["method, method"]
    Limit the search to specific request methods, such as GET and POST.

  --no-http-method
    Ignore any HTTP methods defined by --http-method.

  --min-hits=[n]
    Ignore IPs with fewer than [n] hits on the specified resources.
    This value has to be at least $min_hits_min.

  --since=[n]
    Only check entries since the last [n] minutes.

  --tail=[n]
    Tail access logs to limit the amount of processing.

  --no-block
    Don't block IP addresses (only report on the number of hits).

  --outdir=[directory]
    The output directory for the output file.

  --tmpdir=[directory]
    The directory for temporary files.

  --help
    Show this help text.
HELP

  exit "$status"
}

# FUNCTIONS THAT VALIDATE ARGUMENTS
# ----------------------------------------------------------------------

# Succeed when $1 is an integer of 1 or more, written in decimal digits only.
_is_positive_int() {
  # 10# forces base 10, so a value such as "08" is not read as octal.
  [[ "$1" =~ ^[0-9]+$ ]] && (( 10#$1 >= 1 ))
}

# Make sure $tmpdir and $tmplog exist or can be created; exits 1 otherwise.
validate_tmpdir() {
  if ! [ -d "$tmpdir" ]; then
    if ! mkdir -p "$tmpdir"; then
      printf '%s\n' "Error: unable to create temporary directory" >&2
      printf '%s\n' "Directory: $tmpdir" >&2
      exit 1
    fi
  fi

  if ! [ -d "$tmplog" ]; then
    if ! mkdir -p "$tmplog"; then
      printf '%s\n' "Error: unable to create the templog directory" >&2
      printf '%s\n' "Directory: $tmplog" >&2
      exit 1
    fi
  fi
}

# Make sure $outdir exists or can be created; shows the help and exits 1
# otherwise.
validate_outdir() {
  if ! [ -d "$outdir" ]; then
    if ! mkdir -p "$outdir"; then
      printf '%s\n' "Error: unable to create output directory ($outdir)" >&2
      print_help 1
    fi
  fi
}

# Make sure at least one log directory is defined in the $directory array.
validate_directory() {
  if [ "${#directory[@]}" -lt 1 ]; then
    printf '%s\n' "Error: no log directory specified." >&2
    print_help 1
  fi
}

# Make sure $resources is not empty and derive two strings from it:
# * $resources_string - the resources separated by spaces (for display)
# * $resources_regex  - the resources separated by pipes (for matching)
validate_resources() {
  if [ "${#resources[@]}" -eq 0 ]; then
    printf '%s\n' "Error: no log resources defined!" >&2
    print_help 1
  fi

  # "${array[*]}" joins the elements with the first character of IFS. IFS is
  # local here, so the change does not leak out of the function.
  local IFS=' '
  resources_string="${resources[*]}"
  IFS='|'
  resources_regex="${resources[*]}"
}

# Default $no_http_method to false when it is empty.
validate_no_http_method() {
  if [ -z "$no_http_method" ]; then
    no_http_method=false
  fi
}

# Make sure $http_method is not empty and derive $method_string (space
# separated, for display) and $method_regex (pipe separated, for matching).
# Nothing is checked when $no_http_method is not false.
validate_http_method() {
  if [ "$no_http_method" != false ]; then
    return 0
  fi

  if [ "${#http_method[@]}" -eq 0 ]; then
    printf '%s\n' "Error: no HTTP method defined!" >&2
    print_help 1
  fi

  local IFS=' '
  method_string="${http_method[*]}"
  IFS='|'
  method_regex="${http_method[*]}"
}

# We can only block IPs if we have a tool to do so. If we should be blocking
# IPs we use csf if possible, and iptables otherwise ($block_tool). If neither
# utility is available, or $no_block is anything but false, blocking is
# switched off by setting $no_block to true.
validate_no_block() {
  if [ "$no_block" != false ]; then
    no_block=true
    return 0
  fi

  if command -v csf > /dev/null 2>&1; then
    block_tool=csf
  elif command -v iptables > /dev/null 2>&1; then
    block_tool=iptables
  else
    printf '%s\n' "Warning: neither csf nor iptables found; not blocking IPs." >&2
    no_block=true
  fi
}

# Make sure $since is a positive integer and derive the start of the time
# window from it:
# * $since_unixtime - $unixtime minus $since minutes, as a Unix timestamp
# * $since_pretty   - the same moment in a human readable format
validate_since() {
  if ! _is_positive_int "$since"; then
    printf '%s\n' "Error: \$since has to be a positive integer." >&2
    print_help 1
  fi

  # Normalise to plain decimal so the arithmetic below cannot trip over
  # leading zeros.
  since=$(( 10#$since ))

  # Convert $since to seconds and subtract it from the current Unix time.
  # This is used in the Awk command that finds recent log entries.
  since_secs=$(( since * 60 ))
  since_unixtime=$(( unixtime - since_secs ))

  # Store a human readable start time.
  since_pretty=$(date -d @"$since_unixtime" +"%d/%m/%Y %H:%M:%S")
}

# $min_hits has to be a positive integer that is not lower than
# $min_hits_min.
validate_min_hits() {
  if ! _is_positive_int "$min_hits"; then
    printf '%s\n' "Error: \$min-hits has to be a positive integer." >&2
    print_help 1
  fi

  min_hits=$(( 10#$min_hits ))

  if [ "$min_hits" -lt "$min_hits_min" ]; then
    printf '%s\n' "Error: \$min-hits is set too low (minimum value is $min_hits_min)" >&2
    print_help 1
  fi
}

# $tailcount has to be a positive integer.
validate_tailcount() {
  if ! _is_positive_int "$tailcount"; then
    printf '%s\n' "Error: \$tailcount has to be a positive integer." >&2
    print_help 1
  fi

  tailcount=$(( 10#$tailcount ))
}

# Print the run header on stdout and record the run details in
# "$outlog"/meta. "$outlog" has to exist before this function is called.
print_header() {
  local pad padlength

  # Print a pretty header: name and version on the left, the start time on
  # the right and dashes in between.
  pad=$(printf '%0.1s' "-"{1..80})
  padlength=77
  printf '%s' "$script_name $script_version "
  printf '%*.*s' 0 $(( padlength - (${#script_name} + ${#script_version}) - ${#prettytime} )) "$pad"
  printf '%s\n' " $prettytime"

  # Store the date and script version in the meta file (this truncates it).
  {
    printf '%s\n' " Date: $prettytime"
    printf '%s\n' " Version: $script_version"
  } > "$outlog"/meta

  # Print the job details and append them to the meta file.
  {
    printf '\n%s\n' " From time: $since_pretty ($since_unixtime, $since minutes ago)"
    printf '%s\n' " Resource(s): $resources_string"
    printf '%s\n' " No. hits: $min_hits"

    if [ "$no_http_method" = false ]; then
      printf '%s\n' " Method(s): $method_string"
    else
      printf '%s\n' " Method(s): None (i.e. GET, POST etc.)"
    fi

    if [ "$no_block" = true ]; then
      printf '%s\n' " Blocking: No (report only)"
    else
      printf '%s\n' " Blocking: Yes (using $block_tool)"
    fi

    # $check_load is only set by the --check-load option.
    if [ "$check_load" = true ]; then
      printf '%s\n' " Check load: Yes (deprecated dummy function)"
    fi

    printf '%s\n\n' " Output: $outlog/"
  } | tee -a "$outlog"/meta
}

# FUNCTIONS TO PROCESS LOGS
# ----------------------------------------------------------------------

# Store the files that need to be processed further in an array
# ($logs_array): every regular file below the paths in $directory that was
# modified in the last $since minutes. Exits 1 when there are none.
get_logs() {
  printf '%s\n' "=> Looking for files that were modified in the last $since minutes"

  # Initialise the array and fill it with one element per line of output.
  logs_array=()
  mapfile -t logs_array < <(find "${directory[@]}" -type f -mmin -"$since")

  # Exit if $logs_array is empty.
  if [ "${#logs_array[@]}" -eq 0 ]; then
    printf '%s\n' "=> No files have been modified in the last $since minutes"
    exit 1
  fi
}

# Function that makes sure items in $logs_array are in fact access logs,
# judging by the first line of each file (which also rules out empty files).
# Logs that pass the test are added to the $files_array array.
validate_logs() {
  # Reads $logs_array, writes $files_array. Exits 0 when no file looks like
  # an access log.
  local f first_line regex

  # Initialise the array.
  files_array=()

  printf '%s\n' "=> Validating ${#logs_array[@]} files..."

  # A file looks like an access log when the second field of its first line
  # (delimited by square brackets) is an Apache timestamp. That is much
  # easier and more reliable than checking if lines start with an IPv4 or
  # IPv6 address.
  regex='^[0-3][0-9]/[ADFJMNOS][a-z]{2}/20[0-9]{2}:[0-2][0-9]:[0-5][0-9]:[0-5][0-9] [+-][0-1][0-4][0-9]{2}$'

  for f in "${logs_array[@]}"; do
    first_line=$(head -n 1 "$f" | awk -F'[][]' -v regex="$regex" '$2 ~ regex { print $0 }')
    if [ -n "$first_line" ]; then
      files_array+=("$f")
    fi
  done

  # Exit if $files_array is empty.
  if [ "${#files_array[@]}" -eq 0 ]; then
    printf '\n%s\n' "=> No access logs found"
    exit 0
  fi

  printf '%s\n' "=> Found ${#files_array[@]} access logs"
}

# Function to process the logs and store any matches in two kinds of files:
# * $outlog/all.log contains the matches for all files. We use this to
#   count hits for IPs across all files.
# * $outlog/log.$number is a record of the hits found in an individual file.
#   We use this to show a breakdown of the hits for an IP.
#
# We also keep track of the number of log files with hits, as we need to
# exit gracefully when there are none.
munch_logs() {
  local f
  local files_with_hits=0

  # Counter to name the $outlog/log.$number files.
  file_number=1

  for f in "${files_array[@]}"; do
    # Tail the log and let log_muncher write the matches to
    # $tmplog/log_matches.tmp.
    tail -n "$tailcount" "$f" | log_muncher

    # An empty (or missing) file is not an issue: in all likelihood there
    # were simply no matches in this log.
    if ! [ -s "$tmplog"/log_matches.tmp ]; then
      continue
    fi

    # Append the matches to $outlog/all.log.
    cat "$tmplog"/log_matches.tmp >> "$outlog"/all.log

    # Store the file name, followed by the matches, in
    # $outlog/log.$file_number.
    {
      printf '%s\n' "# $f"
      cat "$tmplog"/log_matches.tmp
    } > "$outlog"/log."$file_number"

    file_number=$(( file_number + 1 ))
    files_with_hits=$(( files_with_hits + 1 ))
  done

  # Remove the log_matches file so that we only keep files with relevant
  # data in $tmplog.
  rm -f "$tmplog"/log_matches.tmp

  if [ "$files_with_hits" -ge 1 ]; then
    printf '%s\n' "=> Found $files_with_hits files with one or more hits"
  else
    printf '%s\n' "=> No hits found in any of the following ${#files_array[@]} files:"
    for f in "${files_array[@]}"; do
      printf '%s\n' " $f"
    done
    exit 0
  fi
}

# Count the hits per IP in $outlog/all.log and report the IPs with at least
# $min_hits hits.
# * $outlog/all_summary.log lists "count ip" for every IP, busiest first.
# * $outlog/all_summary_matches.log lists "count <TAB> ip" for the IPs that
#   reached $min_hits.
# Sets $count_matches to the number of such IPs. When there are none the
# footer is printed, which ends the script.
process_output() {
  if ! [ -s "$outlog"/all.log ]; then
    return 0
  fi

  cut -s -f 1 "$outlog"/all.log \
    | sort \
    | uniq -c \
    | sort -nr \
    > "$outlog"/all_summary.log

  awk -v min_hits="$min_hits" '
    $1 >= min_hits { printf "%s\t%s\n", $1, $2 }
  ' < "$outlog"/all_summary.log > "$outlog"/all_summary_matches.log

  count_matches=$(wc -l < "$outlog"/all_summary_matches.log)

  if [ "$count_matches" -ge 1 ]; then
    printf '\n%s\n' "Found $count_matches IPs with $min_hits or more hits:"
    cat "$outlog"/all_summary_matches.log
  else
    printf '%s\n' "=> No IPs with $min_hits or more hits."
    print_footer
  fi
}

# Succeed when the first tab-separated field of any line in file $2 is
# exactly the IP address $1.
_log_has_ip() {
  awk -F'\t' -v ip="$1" '
    $1 == ip { found = 1; exit }
    END { exit !found }
  ' "$2"
}

# Succeed when the IP address $1 is already blocked by $block_tool.
_ip_is_blocked() {
  if [ "$block_tool" = csf ]; then
    # NOTE(review): this assumes plain csf.deny entries, which start with the
    # address; advanced filter entries are not recognised. A missing file
    # counts as "not blocked".
    awk -v ip="$1" '
      $1 == ip { found = 1; exit }
      END { exit !found }
    ' /etc/csf/csf.deny 2> /dev/null
  else
    # -C succeeds when exactly this rule is already in the INPUT chain.
    iptables -C INPUT -s "$1" -j DROP 2> /dev/null
  fi
}

# Block the IPs listed in $outlog/all_summary_matches.log, unless blocking is
# disabled (in which case the footer is printed, which ends the script).
#
# For every IP the hits across all files are stored in a "justification
# file", which can be used to check why the IP was blocked. The files are
# named $outlog/block.$number. IPs for which no hits can be found are not
# blocked and are listed in $outlog/failed.
#
# IPs are compared as exact strings, so 1.2.3.4 is never confused with
# 1.2.3.40 or 11.2.3.4. Sets $ips_blocked to the number of newly blocked IPs.
block_ips() {
  local ip log match
  local -a file_matches
  # As we need a separate justification file for each IP we start a counter.
  local file_number=1

  if [ "$no_block" = true ]; then
    printf '\n%s\n' "=> Not blocking IPs as \$no_block is set to 'true'."
    print_footer
  fi

  if ! [ -s "$outlog"/all_summary_matches.log ]; then
    printf '%s\n' "Error: Unable to block IPs as the summary file doesn't exist." >&2
    printf '%s\n' "File: $outlog/all_summary_matches.log" >&2
    exit 1
  fi

  # We're going to block IPs.
  printf '\n%s\n' "Blocking IPs using $block_tool..."

  # Counter to track the number of blocked IPs.
  ips_blocked=0

  # Read the summary on file descriptor 3, so that the commands in the loop
  # cannot swallow lines of it through their stdin.
  while read -r _ ip <&3; do
    # Print a header.
    printf '%s\n\n' "# $ip block justification" >> "$outlog"/block."$file_number"

    # Collect the per-file logs in which the IP appeared.
    file_matches=()
    for log in "$outlog"/log.*; do
      [ -f "$log" ] || continue
      if _log_has_ip "$ip" "$log"; then
        file_matches+=("$log")
      fi
    done

    if [ "${#file_matches[@]}" -ge 1 ]; then
      # Append the name of each original log and the hits of this IP in it,
      # with a readable timestamp, to the justification file.
      for match in "${file_matches[@]}"; do
        {
          head -n 1 "$match"
          awk -F'\t' -v ip="$ip" '
            $1 == ip {
              printf "%s\t%s\t%s\t%s\n", $1, strftime("%Y-%m-%d %H:%M:%S", $2), $3, $4
            }
          ' "$match"
          echo ""
        } >> "$outlog"/block."$file_number"
      done

      # Try to block the IP with the selected tool.
      if _ip_is_blocked "$ip"; then
        printf '%s\n' "$ip is already blocked (skipping)."
      elif [ "$block_tool" = csf ]; then
        # csf reports the result itself; we only count successful blocks.
        if csf -d "$ip" Blocked by blockbot; then
          ips_blocked=$(( ips_blocked + 1 ))
        else
          printf '%s\n' "Error while trying to block $ip" >&2
        fi
      else
        if iptables -A INPUT -s "$ip" -j DROP; then
          printf '%s\n' "Blocked $ip"
          ips_blocked=$(( ips_blocked + 1 ))
        else
          printf '%s\n' "Error while trying to block $ip" >&2
        fi
      fi
    else
      printf '%s\n' "=> Failed to justify block of $ip (skipping)"
      printf '%s\n' "$ip" >> "$outlog"/failed
    fi

    # Increase the counter.
    file_number=$(( file_number + 1 ))
  done 3< "$outlog"/all_summary_matches.log
}

# Print the footer, append the result to "$outlog"/meta and exit 0. This
# function always ends the script.
print_footer() {
  local blocked cur_unixtime run_time run_time_msg pad padlength

  printf '%s\n\n' "=> Log files stored in $outlog"

  # We're showing off the number of blocked IPs to the left and the runtime
  # to the right. The counters default to 0 in case the function is called
  # before they were set.
  if [ "$no_block" = true ]; then
    blocked="blocking disabled"
  elif [ "${count_matches:-0}" -eq 0 ]; then
    blocked="no matches found"
  else
    case "${ips_blocked:-0}" in
      0) blocked="no IPs blocked" ;;
      1) blocked="1 IP blocked" ;;
      *) blocked="$ips_blocked IPs blocked" ;;
    esac
  fi

  # Calculate the duration in seconds.
  cur_unixtime=$(date +%s)
  run_time=$(( cur_unixtime - unixtime ))
  run_time_msg="runtime: $run_time seconds"

  # Print the pretty footer.
  pad=$(printf '%0.1s' "-"{1..80})
  padlength=77
  printf '%s' "$blocked "
  printf '%*.*s' 0 $(( padlength - ${#blocked} - ${#run_time_msg} )) "$pad"
  printf '%s\n' " $run_time_msg"

  # Add the same information to the meta file.
  {
    printf '%s\n' " Result: $blocked"
    printf '%s\n' " Runtime: $run_time seconds"
  } >> "$outlog"/meta

  exit 0
}

# PARSE THE ARGUMENTS
# ----------------------------------------------------------------------
# The script may be run without arguments (in which case it will use
# the variables set at the top of the script).
while [ "$#" -gt 0 ]; do
  case "$1" in
    --directory=*)
      # Replace the default array with the comma and/or space separated
      # values that were given.
      custom_directory="${1#*=}"
      IFS=', ' read -r -a directory <<< "$custom_directory"
      ;;
    --resources=*)
      custom_resources="${1#*=}"
      IFS=', ' read -r -a resources <<< "$custom_resources"
      ;;
    --no-http-method)
      no_http_method=true
      ;;
    --http-method=*)
      custom_methods="${1#*=}"
      IFS=', ' read -r -a http_method <<< "$custom_methods"
      ;;
    --since=*)
      since="${1#*=}"
      ;;
    --min-hits=*)
      min_hits="${1#*=}"
      ;;
    --tail=*)
      tailcount="${1#*=}"
      ;;
    --no-block)
      no_block=true
      ;;
    --tmpdir=*)
      tmpdir="${1#*=}"
      # The run directory is derived from $tmpdir, so it has to follow.
      tmplog="$tmpdir"/"$filetime"
      ;;
    --outdir=*)
      outdir="${1#*=}"
      # Same for the directory the output files are written to.
      outlog="$outdir"/"$filetime"
      ;;
    --check-load)
      check_load=true
      ;;
    --help)
      print_help
      ;;
    *)
      printf '%s\n\n' "Error: invalid argument ($1)" >&2
      print_help 1
      ;;
  esac
  shift
done

# Check and validate
root_check
validate_directory
validate_tmpdir
validate_outdir

# $outlog may differ from $tmplog when --outdir or --tmpdir was used, so make
# sure it exists before anything is written to it.
if ! mkdir -p "$outlog"; then
  printf '%s\n' "Error: unable to create output directory ($outlog)" >&2
  exit 1
fi

validate_resources
validate_no_http_method
validate_http_method
validate_no_block
validate_since
validate_min_hits
validate_tailcount

# Find and process logs
print_header
get_logs
validate_logs
munch_logs
process_output
block_ips
print_footer