Skip to content

Commit

Permalink
Anonymize web logs after rotating and disable explicit download loggi…
Browse files Browse the repository at this point in the history
…ng in favor of grepping the regular web logs
  • Loading branch information
acabal committed Mar 16, 2022
1 parent 1ea3b2f commit 1e698f2
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 16 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ PHP 7+ is required.

```shell
# Install Apache, PHP, PHP-FPM, and various other dependencies.
sudo apt install -y git composer php-fpm php-cli php-gd php-xml php-apcu php-mbstring php-intl apache2 apache2-utils libfcgi0ldbl task-spooler
sudo apt install -y git composer php-fpm php-cli php-gd php-xml php-apcu php-mbstring php-intl apache2 apache2-utils libfcgi0ldbl task-spooler ipv6calc

# Create the site root and logs root and clone this repo into it.
sudo mkdir /standardebooks.org/
Expand Down
10 changes: 2 additions & 8 deletions config/apache/standardebooks.org.conf
Original file line number Diff line number Diff line change
Expand Up @@ -60,22 +60,16 @@ Define domain standardebooks.org
DocumentRoot /standardebooks.org/web/www
ErrorDocument 404 /404
ErrorLog /var/log/local/www-error.log
DirectorySlash Off
RewriteEngine on
CustomLog "|/usr/bin/rotatelogs -f -p /standardebooks.org/scripts/rotate-www-logs /var/log/local/apache/www-access.log 86400" combined
CustomLog "|/usr/bin/rotatelogs -f -p /standardebooks.org/web/scripts/rotate-www-logs /var/log/local/apache/www-access.log 86400" combined

SSLEngine on
SSLCertificateFile /etc/letsencrypt/live/${domain}/fullchain.pem
SSLCertificateKeyFile /etc/letsencrypt/live/${domain}/privkey.pem
Header always set Strict-Transport-Security "max-age=15768000"
Header set Content-Security-Policy "default-src 'self';"

# Log downloads
SetEnvIf Request_URI "\.epub$" logdownload
SetEnvIf Request_URI "\.kepub.epub$" logdownload
SetEnvIf Request_URI "\.azw3$" logdownload
CustomLog /var/log/local/downloads.log "%h [%{%Y-%m-%d %H:%M:%S %Z}t] \"%r\" %>s %b" env=logdownload
DirectorySlash Off

<Directory /standardebooks.org/web/www/>
# Disable .htaccess files
AllowOverride none
Expand Down
8 changes: 1 addition & 7 deletions config/apache/standardebooks.test.conf
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ Define domain standardebooks.test
DocumentRoot /standardebooks.org/web/www
ErrorDocument 404 /404
ErrorLog /var/log/local/www-error.log
DirectorySlash Off
RewriteEngine on

SSLEngine on
Expand All @@ -68,13 +69,6 @@ Define domain standardebooks.test
Header always set Strict-Transport-Security "max-age=15768000"
Header set Content-Security-Policy "default-src 'self';"

# Log downloads
SetEnvIf Request_URI "\.epub$" logdownload
SetEnvIf Request_URI "\.kepub.epub$" logdownload
SetEnvIf Request_URI "\.azw3$" logdownload
CustomLog /var/log/local/downloads.log "%h [%{%Y-%m-%d %H:%M:%S %Z}t] \"%r\" %>s %b" env=logdownload
DirectorySlash Off

<Directory /standardebooks.org/web/www/>
# Disable .htaccess files
AllowOverride none
Expand Down
72 changes: 72 additions & 0 deletions scripts/rotate-www-logs
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/bin/bash

# Prints the help text (reflowed by `fmt`) to stdout and terminates the script.
# The exit status is whatever `fmt` returned, because `exit` is called without an argument.
usage() {
	# The delimiter is quoted because the text contains nothing to expand.
	fmt <<'EOF'
DESCRIPTION
Moves Apache access log files into a by-month subdirectory, and gzip them.
This script must be run as root, and is generally run by the Apache rotatelogs subprocess as such.
Log files are moved to <LOG-DIR>/apache/YYYY-MM/
USAGE
rotate-www-logs NEW-LOG-FILENAME
EOF
	exit
}
# Prints "Error:" (red, reverse video) followed by the message in $1 to stderr,
# then terminates the script with exit status 1.
die() {
	printf "\033[0;7;31mError:\033[0m %s\n" "${1}" >&2
	exit 1
}
# Show the help text when the only argument is a help flag.
if [ $# -eq 1 ]; then
	case "$1" in
		--help | -h) usage ;;
	esac
fi
# End boilerplate

# The new log filename is required; without it, show the help text (usage exits the script).
if [ $# -eq 0 ]; then usage; fi

# Apache has a habit of starting this script twice, which can stomp on its own files.
# Bail out if any process named rotate-www-logs other than this one is running.
runningPids=$(pidof -x rotate-www-logs)
for otherPid in ${runningPids}; do
	# Our own PID is always in the list; ignore it.
	if [ "${otherPid}" = $$ ]; then
		continue
	fi

	# We echo and exit instead of die() because Apache prints stderr to the log, but not stdout. We don't need this logged.
	echo "rotate-www-logs is already running with PID ${otherPid}"
	exit 1
done

# Prevent the glob loop below from running on the literal pattern if nothing matches.
shopt -s nullglob

# Splits every rotated sibling of the given log file into per-day archives.
#
# For each file matching <dirname $1>/<basename $1 without its numeric suffix>.*
# (except $1 itself), the entries of every calendar day found in the file are
# anonymized with ipv6loganon and appended to
# <dirname $1>/YYYY-MM/www-access-YYYY-MM-DD.log.gz; the processed file is then deleted.
#
# Arguments: $1 - filename of the log file Apache has just created; it is never touched.
# Returns:   1 (before touching any file) if ipv6loganon is not available.
rotate_www_logs(){
	local newLog="$1"
	local filenameBase directory filename dates line rawDate logDate logMonth archive

	# ipv6loganon is provided by the `ipv6calc` package.
	# Without it the pipeline below would write empty archives and the raw log would still be deleted.
	if ! command -v ipv6loganon > /dev/null; then
		printf "ipv6loganon not found (install the ipv6calc package); no logs were rotated.\n" 1>&2
		return 1
	fi

	filenameBase=$(basename "${newLog}" | sed --regexp-extended "s/\.[0-9]+$//")
	directory=$(dirname "${newLog}")

	# Only the prefix is quoted so that the trailing `.*` still globs but paths with whitespace survive.
	for filename in "${directory}/${filenameBase}".*; do
		# When Apache calls this script, it passes the filename of the new log file it created.
		# Thus, we check here to make sure we don't process and then delete the brand-new log file!
		if [ "${filename}" = "${newLog}" ]; then
			continue
		fi

		# Apache log files can have data for more than one day. Here we pull out entries for different days into different files.
		# `/` needs no escaping in an ERE; newer GNU grep warns about stray backslashes.
		dates=$(grep --extended-regexp --only-matching "\[[0-9]{1,2}/[a-zA-Z]{3}/20[0-9]{2}" "${filename}" | sort -u)

		while IFS= read -r line; do
			# A here-string always yields at least one (possibly empty) line; an empty
			# token would otherwise be parsed by `date` as "today".
			if [ -z "${line}" ]; then
				continue
			fi

			# "[16/Mar/2022" -> "16 Mar 2022"
			rawDate=${line#\[}
			rawDate=${rawDate//\// }

			# The token may come from anywhere in the line (e.g. the request URI) and need not be a real date.
			# Skip it quietly: an empty month would make the chown/chmod below recurse over the whole log directory.
			if ! logDate=$(date -d"${rawDate}" "+%Y-%m-%d" 2> /dev/null); then
				continue
			fi

			logMonth=${logDate%-*}
			archive="${directory}/${logMonth}/www-access-${logDate}.log"

			mkdir -p "${directory}/${logMonth}"

			# Is the log file already existing and gzipped?
			if [ -f "${archive}.gz" ]; then
				gunzip "${archive}.gz"
			fi

			# The token contains no pattern to interpret, so match it as a fixed string.
			grep --fixed-strings -- "${line}" "${filename}" | ipv6loganon --anonymize-paranoid >> "${archive}"

			gzip --best "${archive}"

			chown --preserve-root --recursive www-data:adm "${directory}/${logMonth}"
			chmod --preserve-root --recursive g+w "${directory}/${logMonth}"
		done <<< "${dates}"

		rm "${filename}"
	done
}

if [ $# -ge 1 ]; then
	rotate_www_logs "$1" || exit 1
fi

0 comments on commit 1e698f2

Please sign in to comment.