Anonymize web logs after rotating and disable explicit download logging in favor of grepping the regular web logs

This commit is contained in:
Alex Cabal 2022-03-16 13:04:52 -04:00
parent 1ea3b2f28b
commit 1e698f2389
4 changed files with 76 additions and 16 deletions

View file

@ -6,7 +6,7 @@ PHP 7+ is required.
```shell ```shell
# Install Apache, PHP, PHP-FPM, and various other dependencies. # Install Apache, PHP, PHP-FPM, and various other dependencies.
sudo apt install -y git composer php-fpm php-cli php-gd php-xml php-apcu php-mbstring php-intl apache2 apache2-utils libfcgi0ldbl task-spooler sudo apt install -y git composer php-fpm php-cli php-gd php-xml php-apcu php-mbstring php-intl apache2 apache2-utils libfcgi0ldbl task-spooler ipv6calc
# Create the site root and logs root and clone this repo into it. # Create the site root and logs root and clone this repo into it.
sudo mkdir /standardebooks.org/ sudo mkdir /standardebooks.org/

View file

@ -60,8 +60,9 @@ Define domain standardebooks.org
DocumentRoot /standardebooks.org/web/www DocumentRoot /standardebooks.org/web/www
ErrorDocument 404 /404 ErrorDocument 404 /404
ErrorLog /var/log/local/www-error.log ErrorLog /var/log/local/www-error.log
DirectorySlash Off
RewriteEngine on RewriteEngine on
CustomLog "|/usr/bin/rotatelogs -f -p /standardebooks.org/scripts/rotate-www-logs /var/log/local/apache/www-access.log 86400" combined CustomLog "|/usr/bin/rotatelogs -f -p /standardebooks.org/web/scripts/rotate-www-logs /var/log/local/apache/www-access.log 86400" combined
SSLEngine on SSLEngine on
SSLCertificateFile /etc/letsencrypt/live/${domain}/fullchain.pem SSLCertificateFile /etc/letsencrypt/live/${domain}/fullchain.pem
@ -69,13 +70,6 @@ Define domain standardebooks.org
Header always set Strict-Transport-Security "max-age=15768000" Header always set Strict-Transport-Security "max-age=15768000"
Header set Content-Security-Policy "default-src 'self';" Header set Content-Security-Policy "default-src 'self';"
# Log downloads
SetEnvIf Request_URI "\.epub$" logdownload
SetEnvIf Request_URI "\.kepub.epub$" logdownload
SetEnvIf Request_URI "\.azw3$" logdownload
CustomLog /var/log/local/downloads.log "%h [%{%Y-%m-%d %H:%M:%S %Z}t] \"%r\" %>s %b" env=logdownload
DirectorySlash Off
<Directory /standardebooks.org/web/www/> <Directory /standardebooks.org/web/www/>
# Disable .htaccess files # Disable .htaccess files
AllowOverride none AllowOverride none

View file

@ -60,6 +60,7 @@ Define domain standardebooks.test
DocumentRoot /standardebooks.org/web/www DocumentRoot /standardebooks.org/web/www
ErrorDocument 404 /404 ErrorDocument 404 /404
ErrorLog /var/log/local/www-error.log ErrorLog /var/log/local/www-error.log
DirectorySlash Off
RewriteEngine on RewriteEngine on
SSLEngine on SSLEngine on
@ -68,13 +69,6 @@ Define domain standardebooks.test
Header always set Strict-Transport-Security "max-age=15768000" Header always set Strict-Transport-Security "max-age=15768000"
Header set Content-Security-Policy "default-src 'self';" Header set Content-Security-Policy "default-src 'self';"
# Log downloads
SetEnvIf Request_URI "\.epub$" logdownload
SetEnvIf Request_URI "\.kepub.epub$" logdownload
SetEnvIf Request_URI "\.azw3$" logdownload
CustomLog /var/log/local/downloads.log "%h [%{%Y-%m-%d %H:%M:%S %Z}t] \"%r\" %>s %b" env=logdownload
DirectorySlash Off
<Directory /standardebooks.org/web/www/> <Directory /standardebooks.org/web/www/>
# Disable .htaccess files # Disable .htaccess files
AllowOverride none AllowOverride none

72
scripts/rotate-www-logs Executable file
View file

@ -0,0 +1,72 @@
#!/bin/bash
# Print the help text, re-flowed by `fmt`, then exit.
# Outputs: the formatted help text on stdout.
# Exits:   with the status of the `fmt` pipeline (the bare `exit` reuses the last status).
usage() {
	printf '%s\n' \
		'DESCRIPTION' \
		'Moves Apache access log files into a by-month subdirectory, and gzip them.' \
		'This script must be run as root, and is generally run by the Apache rotatelogs subprocess as such.' \
		'Log files are moved to <LOG-DIR>/apache/YYYY-MM/' \
		'USAGE' \
		'rotate-www-logs NEW-LOG-FILENAME' \
		| fmt
	exit
}
# Print an error message on stderr, prefixed with a red inverse-video "Error:" tag, then exit with status 1.
# Arguments: $1 - the message to print
die() {
	printf "\033[0;7;31mError:\033[0m %s\n" "${1}" >&2
	exit 1
}
if [ $# -eq 1 ]; then
	if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then
		usage
	fi
fi
# End boilerplate

if [ $# -eq 0 ]; then
	usage
fi

# An empty filename would turn the glob below into "./.*", which would process (and then delete) dotfiles in the current directory.
if [ -z "$1" ]; then
	die "NEW-LOG-FILENAME must not be empty."
fi

# Make a pipeline fail if any of its stages fails, so that a failing `grep` or a missing `ipv6loganon` is noticed before the raw log is deleted.
set -o pipefail

# Apache has a habit of starting this script twice, which can stomp on its own files
for pid in $(pidof -x rotate-www-logs); do
	if [ "${pid}" != "$$" ]; then
		# We echo and exit instead of die() because Apache prints stderr to the log, but not stdout. We don't need this logged.
		echo "rotate-www-logs is already running with PID ${pid}"
		exit 1
	fi
done

# Prevent the loop from entering if no matches are found for the pattern
shopt -s nullglob

filenameBase=$(basename "$1" | sed --regexp-extended "s/\.[0-9]+$//")
directory=$(dirname "$1")

# The new log's path, rebuilt the same way the glob below builds its matches, so that the comparison also works when $1 is a bare filename or contains redundant slashes.
newLogFilename="${directory}/$(basename "$1")"

for filename in "${directory}/${filenameBase}".*; do
	# When Apache calls this script, it passes the filename of the new log file it created.
	# Thus, we check here to make sure we don't process and then delete the brand-new log file!
	if [ "${filename}" = "${newLogFilename}" ]; then
		continue
	fi

	# Apache log files can have data for more than one day. Here we pull out entries for different days into different files.
	dates=$(grep --extended-regexp --only-matching "\[[0-9]{1,2}/[a-zA-Z]{3}/20[0-9]{2}" "${filename}" | sort -u)

	# Anonymized output is first staged in temporary files, and only appended to the archives once every day in this log was anonymized successfully.
	# That way a failure can neither lose the raw log nor leave half-written archives behind.
	stagedFiles=()
	archiveFiles=()
	succeeded="true"

	while read -r line; do
		# A log without any dates still yields one empty line from the here-string; `date -d ""` would resolve that to today.
		if [ -z "${line}" ]; then
			continue
		fi

		grepString=${line//\[/}
		logRawDate=${grepString//\// }

		# The pattern can also match date-like text that is not a real date (for example inside a request URL). Skip those; stderr is silenced on purpose so they don't spam Apache's error log.
		if ! logDate=$(date -d"${logRawDate}" "+%Y-%m-%d" 2> /dev/null); then
			continue
		fi

		logMonth=${logDate%-*}
		logFilename="www-access-${logDate}.log"
		monthDirectory="${directory}/${logMonth}"

		if ! mkdir -p "${monthDirectory}"; then
			succeeded="false"
			break
		fi

		if ! stagedFile=$(mktemp "${monthDirectory}/${logFilename}.XXXXXX"); then
			succeeded="false"
			break
		fi

		stagedFiles+=("${stagedFile}")
		archiveFiles+=("${monthDirectory}/${logFilename}")

		# ipv6loganon is provided by the `ipv6calc` package
		if ! grep --extended-regexp "\[${grepString}" "${filename}" | ipv6loganon --anonymize-paranoid > "${stagedFile}"; then
			succeeded="false"
			break
		fi
	done <<< "${dates}"

	if [ "${succeeded}" = "true" ]; then
		for i in "${!stagedFiles[@]}"; do
			archiveFile="${archiveFiles[$i]}"
			monthDirectory="${archiveFile%/*}"

			# Is the log file already existing and gzipped?
			if [ -f "${archiveFile}.gz" ]; then
				gunzip "${archiveFile}.gz"
			fi

			if ! cat "${stagedFiles[$i]}" >> "${archiveFile}"; then
				succeeded="false"
				break
			fi

			gzip --best "${archiveFile}"

			chown --preserve-root --recursive www-data:adm "${monthDirectory}"
			chmod --preserve-root --recursive g+w "${monthDirectory}"
		done
	fi

	rm -f -- "${stagedFiles[@]}"

	# Only delete the raw log once everything in it has been archived.
	if [ "${succeeded}" != "true" ]; then
		die "Couldn't anonymize and archive ${filename}; leaving it in place."
	fi

	rm "${filename}"
done