Merge pull request #1 from bcgov/feature/DBC22-2113
DBC22-2113: Update logging process
wmuldergov authored May 10, 2024
2 parents a2eedf1 + e746007 commit 8b0f2d5
Showing 16 changed files with 621 additions and 26 deletions.
21 changes: 21 additions & 0 deletions compose/caching/openshiftjobs/DockerFile
@@ -0,0 +1,21 @@
FROM alpine:3
RUN apk update && apk upgrade
# Need goaccess 1.9.2 for a timezone fix. Once that version is in the regular branch, we can pull it from there.
RUN apk add goaccess --repository=https://dl-cdn.alpinelinux.org/alpine/edge/main

RUN apk add --no-cache \
    aws-cli \
    bash \
    coreutils \
    tzdata

COPY ./compose/openshiftjobs/entrypoint.sh /
COPY ./compose/openshiftjobs/scripts/analyzeexportlogs.sh /scripts/
COPY ./compose/openshiftjobs/scripts/ziplogs.sh /scripts/

RUN sed -i 's/\r$//g' /entrypoint.sh && chmod +x /entrypoint.sh
RUN sed -i 's/\r$//g' /scripts/analyzeexportlogs.sh && chmod +x /scripts/analyzeexportlogs.sh
RUN sed -i 's/\r$//g' /scripts/ziplogs.sh && chmod +x /scripts/ziplogs.sh


ENTRYPOINT ["/entrypoint.sh"]
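
A hypothetical local smoke test of this image (the tag and mounted directory are placeholders, and the build context may need adjusting so the COPY paths above resolve):

```bash
# Build from the repo and run the ziplogs job against a local logs directory
docker build -t openshiftjobs -f compose/caching/openshiftjobs/DockerFile .
docker run --rm -v "$(pwd)/logs:/logs" openshiftjobs ziplogs
```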
24 changes: 24 additions & 0 deletions compose/caching/openshiftjobs/entrypoint.sh
@@ -0,0 +1,24 @@
#!/bin/bash

# Check the number of arguments
if [ "$#" -lt 1 ]; then
    echo "Usage: $0 <script_name> [args...]"
    exit 1
fi

# Determine which script to run based on the first argument
case "$1" in
    ziplogs)
        # Run ziplogs.sh, which zips all files created in the previous hour or older in the nginx log storage PVC
        /scripts/ziplogs.sh
        ;;
    analyzeexportlogs)
        # Run analyzeexportlogs.sh with additional arguments; it sends the specified day's logs through goaccess and then uploads them to S3
        shift # Remove the first argument (script name)
        /scripts/analyzeexportlogs.sh "$@"
        ;;
    *)
        echo "Invalid script"
        exit 1
        ;;
esac
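
For reference, this is how the cron jobs below invoke the dispatcher; the container args become `"$@"` here (the last line is just a hypothetical backfill run):

```bash
/entrypoint.sh ziplogs                # gzip logs from the previous hour or older
/entrypoint.sh analyzeexportlogs 1    # analyze and export yesterday's logs (the default)
/entrypoint.sh analyzeexportlogs 3    # hypothetical backfill for logs from 3 days ago
```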
107 changes: 107 additions & 0 deletions compose/caching/openshiftjobs/scripts/analyzeexportlogs.sh
@@ -0,0 +1,107 @@
#!/bin/bash

# Directory where the logs are stored
log_dir="/logs"
cd $log_dir || { echo "Failed to change directory to $log_dir. Exiting..."; exit 1; }

# Initialize an empty array
zipped_files=()

# Get the number of days ago as a positional parameter
days_ago="$1"

# If no argument is provided, default to 1 day ago
if [ -z "$days_ago" ]; then
days_ago=1
fi

# Get the start and end time for $days_ago days ago in the America/Vancouver timezone
start_time=$(TZ=America/Vancouver date -d "$days_ago days ago 00:00")
end_time=$(TZ=America/Vancouver date -d "$days_ago days ago 23:59")

# Convert the start and end time to UTC
start_time_utc=$(date -u -d "$start_time" +%Y%m%d%H)
end_time_utc=$(date -u -d "$end_time" +%Y%m%d%H)

echo "Will analyze and archive logs between $start_time_utc and $end_time_utc"

# Loop through all files in the current directory ending with ".gz"
for file in *.gz; do
    # Check if the file exists and is a regular file
    if [[ -f $file ]]; then
        # Extract the date and time part from the filename (assuming UTC timezone)
        file_datetime_utc=$(echo "$file" | grep -oE '[0-9]{10}')
        # Check if the file date and time in UTC falls within days_ago's start and end time in UTC
        if [[ $file_datetime_utc -ge $start_time_utc && $file_datetime_utc -le $end_time_utc ]]; then
            zipped_files+=("$file")
        fi
    fi
done

# Print the elements of the array
echo "Log files that will be processed:"
printf '%s\n' "${zipped_files[@]}"

# Get the start date formatted for the goaccess report name
start_time_formatted=$(date -d "$start_time" +%Y%m%d)

# Run the following commands only if the zipped_files array is not empty
if [ ${#zipped_files[@]} -gt 0 ]; then
    # Define the file URL and destination directory to pull down the latest GeoIP data
    FILE_URL="https://download.db-ip.com/free/dbip-city-lite-$(date -u +'%Y-%m').mmdb.gz"
    OLD_FILE="dbip-city-lite-$(date -d 'last month' +'%Y-%m').mmdb"

    # Check if the file exists
    if [ ! -f "dbip-city-lite-$(date -u +'%Y-%m').mmdb" ]; then
        # If the file doesn't exist, download it
        echo "Downloading file..."
        if wget -q --spider "$FILE_URL"; then
            wget "$FILE_URL" -P "$log_dir"
            echo "Download complete."
            gzip -d "dbip-city-lite-$(date -u +'%Y-%m').mmdb.gz"
            # Delete the old file if it exists
            if [ -f "$OLD_FILE" ]; then
                echo "Deleting old file ($OLD_FILE)..."
                rm "$OLD_FILE"
                echo "Old file deleted."
            fi
        else
            echo "Failed to download file. URL is unreachable."
        fi
    else
        echo "MMDB file already exists."
    fi
    mmdb_file=$(find . -maxdepth 1 -type f -name "*.mmdb")

    # Run goaccess on all the log files from the date entered
    goaccess_report_name=$start_time_formatted-goaccess_report.html
    zcat "${zipped_files[@]}" | goaccess - -o "$goaccess_report_name" --log-format='%h %e %^[%x] "%r" %s %b "%R" "%u" %C "%M" %T' --datetime-format='%d/%b/%Y:%H:%M:%S %z' --ignore-panel=REMOTE_USER --ignore-panel=ASN --tz=America/Vancouver --jobs=2 --geoip-database="$mmdb_file"
    echo "GoAccess report generated successfully at $goaccess_report_name"

    # Get the start date formatted in YYYY/MM/DD format
    start_time_formatted_s3=$(date -d "$start_time" +"%Y/%m/%d")

    # Create the folder structure in the S3 bucket
    s3_path="s3://$AWS_BUCKET/$ENVIRONMENT/logs/$start_time_formatted_s3/"

    # Upload zipped files to S3
    for file in "${zipped_files[@]}"; do
        aws --endpoint-url "$AWS_ENDPOINT" s3 cp "$file" "$s3_path" || { echo "Failed to upload $file to S3. Exiting..."; exit 1; }
        echo "File $file copied to S3 bucket under $s3_path"
    done

    echo "All files copied to S3 bucket under $s3_path"

    # Upload the HTML report to S3
    aws --endpoint-url "$AWS_ENDPOINT" s3 cp "$goaccess_report_name" "$s3_path" || { echo "Failed to upload HTML report to S3. Exiting..."; exit 1; }
    echo "HTML report copied to S3 bucket under $s3_path"

    # Delete the zipped files and HTML report
    rm "${zipped_files[@]}" "$goaccess_report_name"

    echo "Zipped files and HTML report deleted from PVC successfully"

else
    echo "No files to process for $start_time_formatted"
fi
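
As a worked example of the selection window above (assuming the job runs during PDT, UTC-7, and that log filenames embed a YYYYMMDDHH hour stamp; the filename below is a placeholder):

```bash
# days_ago=1 run on 2024-05-10:
#   start_time_utc=2024050907   # 2024-05-09 00:00 America/Vancouver
#   end_time_utc=2024051006     # 2024-05-09 23:59 America/Vancouver
start_time_utc=2024050907; end_time_utc=2024051006
file="access-2024050918.log.gz"                          # hypothetical log file
file_datetime_utc=$(echo "$file" | grep -oE '[0-9]{10}') # -> 2024050918
[[ $file_datetime_utc -ge $start_time_utc && $file_datetime_utc -le $end_time_utc ]] \
  && echo "selected for analysis and upload"
```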
21 changes: 21 additions & 0 deletions compose/caching/openshiftjobs/scripts/ziplogs.sh
@@ -0,0 +1,21 @@
#!/bin/bash

# Calculate the hour one hour ago in UTC
previous_hour=$(TZ=UTC date -d '1 hour ago' +"%Y%m%d%H")
echo "Previous Hour in UTC: $previous_hour"

# Directory where the logs are stored
log_dir="./logs"

# Iterate over log files
find "$log_dir" -type f -name '*.log' | while read -r file; do
# Extract timestamp from filename
timestamp=$(echo "$file" | grep -oE '[0-9]{10}')

# Check if timestamp is less than or equal to previous hour
if [[ $timestamp -le $previous_hour ]]; then
# gzip the file
gzip "$file"
echo "File $file with timestamp $timestamp gzipped."
fi
done
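
A quick sketch of the comparison the loop relies on (the filename is a placeholder; the script only assumes a YYYYMMDDHH stamp somewhere in the name):

```bash
previous_hour=$(TZ=UTC date -d '1 hour ago' +"%Y%m%d%H")   # e.g. 2024051017
file="./logs/access-2024051016.log"                        # hypothetical log file
timestamp=$(echo "$file" | grep -oE '[0-9]{10}')           # -> 2024051016
[[ $timestamp -le $previous_hour ]] && echo "would gzip $file"
```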
80 changes: 60 additions & 20 deletions helm/README.md
@@ -1,30 +1,67 @@
# Image Caching Chart

A chart to provision an nginx image caching instance
A chart to provision an instance of an nginx proxy cache that caches content from images.drivebc.ca to reduce load on those servers. It also includes components to zip and process the logs before uploading them to S3 storage.

## Configuration

### Image Caching Options

| Parameter | Description | Default |
| ----------------------- | --------------------------------------- | -------------------------------------- |
| `fullnameOverride` | Instance Name if other than default | `image-caching` |
| `nameOverride` | Instance Name if other than default | `image-caching` |
| `replicaCount` | Amount of replicas to run | `1` |
| `repository` | Image Source | `ghcr.io/bcgov/drivebc-image-caching` |
| `tag` | Image Tag | `latest` |
| `CPU Request` | CPU Request Amount | `50` |
| `CPU Limit` | CPU Limit Amount | `250` |
| `Memory Request` | Memory Requests Amount | `50` |
| `Memory Limit` | Memory Limit Amount | `100` |
| `Autoscaling` | Autoscaling enabled? | `true` |
| `min replicas` | Minimum amount of replicas | `1` |
| `max replicas`          | Maximum amount of replicas               | `2`                                     |
| `networkPolicyRequired` | Do you require default network policies  | `false`                                 |
| `route enabled`         | Do you want it to create a route         | `true`                                  |
| `route host`            | What hostname do you want                | `drivebc.apps.silver.devops.gov.bc.ca`  |
| `iprestricted`          | Should it be IP Restricted?              | `false`                                 |
| `ipallowlist`           | What IPs are allowed to connect?         |                                         |
| Parameter | Description | Default |
| -------------------------------- | ----------------------------------------------------------------- | ------------------------ |
| fullnameOverride: | The full name override for the deployment | `drivebc-cache` |
| nameOverride: | The name override for the deployment | `drivebc-cache` |
| replicaCount: | The number of replicas for the deployment | `1` |
| image: | | |
| repository: | The repository containing the Docker image for the deployment | `ghcr.io/bcgov/drivebc.ca-caching` |
| tag: | The tag of the Docker image used for the deployment | `latest` |
| deployment: | | |
| resources: | | |
| requests: | The resource requests (CPU and Memory) for the deployment | CPU: `50m`, Memory: `50Mi` |
| limits: | The resource limits (CPU and Memory) for the deployment | CPU: `250m`, Memory: `100Mi`|
| env: | | |
| DRIVEBC_IMAGE_BASE_URL: | The base URL for images used by the deployment | `https://tst-images.drivebc.ca/` |
| autoscaling: | | |
| enabled: | Specifies whether autoscaling is enabled for the deployment | `true` |
| minReplicas: | The minimum number of replicas when autoscaling is enabled | `1` |
| maxReplicas: | The maximum number of replicas when autoscaling is enabled | `2` |
| targetCPUUtilizationPercentage: | The target CPU utilization percentage for autoscaling | `80` |
| networkPolicyRequired: | Set to true if you need to allow traffic between pods and internet ingress setup | `true` |
| route: | | |
| enabled: | Specifies whether the route is enabled | `true` |
| host: | The host for the route | `drivebc-cache.apps.silver.devops.gov.bc.ca` |
| iprestricted: | Set to true if you want to limit IPs in the ipallowlist | `false` |
| ipallowlist: | The list of allowed IP addresses | `142.34.53.0/24 142.22.0.0/15 142.24.0.0/13 142.32.0.0/13 208.181.128.46/32` |
| logpvc: | | |
| storage: | The storage size for logs | `1Gi` |
| cronjobs: | | |
| analyzeuploadlogs: | | |
| name: | The name of the cronjob | `analyzeuploadlogs` |
| schedule: | The cron schedule for the job (in UTC) | `0 9 * * *` |
| deployment: | | |
| resources: | | |
| requests: | The resource requests (CPU and Memory) for the job | CPU: `50m`, Memory: `1Gi` |
| limits: | The resource limits (CPU and Memory) for the job | CPU: `2000m`, Memory: `2Gi` |
| env: | | |
| s3Secret: | The secret for accessing the S3 bucket | `drivebc-cronjob-s3bucket` |
| environment: | The environment for the job | `dev` |
| volumes: | | |
| logs: | The volume mount for logs | `static-log-storage` |
| s3secret: | | |
| name: | The name of the S3 secret | `drivebc-cronjob-s3bucket` |
| access_key_id: | The access key ID for the S3 bucket (Do not commit to GitHub) | `""` |
| bucket: | The bucket name for the S3 bucket (Do not commit to GitHub) | `""` |
| endpoint: | The endpoint for the S3 bucket (Do not commit to GitHub) | `""` |
| secret_access_key: | The secret access key for the S3 bucket (Do not commit to GitHub) | `""` |
| ziplogs: | | |
| name: | The name of the cronjob | `ziplogs` |
| schedule: | The cron schedule for the job | `30 * * * *` |
| deployment: | | |
| resources: | | |
| requests: | The resource requests (CPU and Memory) for the job | CPU: `50m`, Memory: `100Mi` |
| limits: | The resource limits (CPU and Memory) for the job | CPU: `150m`, Memory: `200Mi`|
| volumes: | | |
| logs: | The volume mount for logs | `static-log-storage` |




## Components
@@ -34,3 +71,6 @@ A chart to provision an nginx image caching instance
- Deployment
- HPA
- Network Policy
- Cronjob
- Analyze Upload Logs
- Zip Logs
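
For completeness, a hypothetical install of the chart using the values documented above (release name, namespace, and secret values are placeholders; real credentials should come from a secure source rather than the command line):

```bash
helm upgrade --install drivebc-cache ./helm -n <namespace> \
  --set cronjobs.analyzeuploadlogs.s3secret.access_key_id='<key-id>' \
  --set cronjobs.analyzeuploadlogs.s3secret.secret_access_key='<secret>' \
  --set cronjobs.analyzeuploadlogs.s3secret.bucket='<bucket>' \
  --set cronjobs.analyzeuploadlogs.s3secret.endpoint='<endpoint>'
```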
57 changes: 57 additions & 0 deletions helm/templates/analyzeuploadlogs-cronjob.yaml
@@ -0,0 +1,57 @@
{{- $deploymentTag := .Values.image.tag | default .Chart.AppVersion -}}
{{- $deploymentTime := now | date "2006-01-02 15:04:05.000000" -}}

apiVersion: batch/v1
kind: CronJob
metadata:
  name: {{ template "app.fullname" . }}-{{ .Values.cronjobs.analyzeuploadlogs.name }}
  labels: {{ include "app.labels" . | nindent 4 }}

spec:
  schedule: {{ .Values.cronjobs.analyzeuploadlogs.schedule }}
  concurrencyPolicy: Replace
  suspend: false
  jobTemplate:
    spec:
      template:
        spec:
          volumes:
            - name: log-storage
              persistentVolumeClaim:
                claimName: {{ .Values.cronjobs.analyzeuploadlogs.deployment.volumes.logs }}
          containers:
            - resources:
                limits:
                  cpu: {{ .Values.cronjobs.analyzeuploadlogs.deployment.resources.limits.cpu }}
                  memory: {{ .Values.cronjobs.analyzeuploadlogs.deployment.resources.limits.memory }}
                requests:
                  cpu: {{ .Values.cronjobs.analyzeuploadlogs.deployment.resources.requests.cpu }}
                  memory: {{ .Values.cronjobs.analyzeuploadlogs.deployment.resources.requests.memory }}
              name: {{ include "app.fullname" . }}-{{ .Values.cronjobs.analyzeuploadlogs.name }}
              args:
                - "analyzeexportlogs"
                - "1"
              volumeMounts:
                - name: log-storage
                  mountPath: /logs
              env:
                - name: DEPLOYMENT_TAG
                  value: {{ $deploymentTag | quote }}
                - name: DEPLOYMENT_TIME
                  value: {{ $deploymentTime | quote }}
                - name: ENVIRONMENT
                  value: {{ .Values.cronjobs.analyzeuploadlogs.deployment.env.environment }}
              envFrom:
                - secretRef:
                    name: {{ .Values.cronjobs.analyzeuploadlogs.deployment.env.s3Secret }}
              imagePullPolicy: IfNotPresent
              image: {{ .Values.image.repository }}:{{ .Values.image.tag }}
              securityContext:
                seccompProfile:
                  type: 'RuntimeDefault'
                capabilities:
                  drop:
                    - all
                  add:
                    - NET_BIND_SERVICE
          restartPolicy: Never
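
To exercise this job outside its schedule, one option (assuming the OpenShift CLI and the chart's default names; the job name and namespace are placeholders) is to spawn a one-off Job from the CronJob:

```bash
oc create job analyze-manual --from=cronjob/drivebc-cache-analyzeuploadlogs -n <namespace>
oc logs -f job/analyze-manual -n <namespace>
```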
14 changes: 14 additions & 0 deletions helm/templates/analyzeuploadlogs-secret.yaml
@@ -0,0 +1,14 @@
{{- if not (lookup "v1" "Secret" .Release.Namespace .Values.cronjobs.analyzeuploadlogs.s3secret.name) }}
apiVersion: v1
kind: Secret
metadata:
  name: {{ .Values.cronjobs.analyzeuploadlogs.s3secret.name }}
  annotations:
    "helm.sh/resource-policy": "keep"
type: Opaque
data:
  AWS_ACCESS_KEY_ID: {{ .Values.cronjobs.analyzeuploadlogs.s3secret.access_key_id | b64enc }}
  AWS_BUCKET: {{ .Values.cronjobs.analyzeuploadlogs.s3secret.bucket | b64enc }}
  AWS_ENDPOINT: {{ .Values.cronjobs.analyzeuploadlogs.s3secret.endpoint | b64enc }}
  AWS_SECRET_ACCESS_KEY: {{ .Values.cronjobs.analyzeuploadlogs.s3secret.secret_access_key | b64enc }}
{{- end }}
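
Because of the lookup guard above, the Secret is only templated when it does not already exist, so it can equally be pre-created out of band (the name follows the chart default; values are placeholders):

```bash
oc create secret generic drivebc-cronjob-s3bucket -n <namespace> \
  --from-literal=AWS_ACCESS_KEY_ID='<key-id>' \
  --from-literal=AWS_BUCKET='<bucket>' \
  --from-literal=AWS_ENDPOINT='<endpoint>' \
  --from-literal=AWS_SECRET_ACCESS_KEY='<secret>'
```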