Dockerfile and script for running crawl jobs in Google Cloud Run with captcha support #208

Merged (6 commits, Aug 30, 2022)
4 changes: 2 additions & 2 deletions .gitignore
@@ -100,8 +100,8 @@ config.yaml
# Vscode
.vscode

# Chromedriver
chromedriver/
# Chromedriver / Webdriver Manager
.wdm/

# Pip requirements - should not be committed
requirements.txt
25 changes: 25 additions & 0 deletions Dockerfile.gcloud.job
@@ -0,0 +1,25 @@
FROM python:3.7

ENV PYTHONDONTWRITEBYTECODE 1
ENV PYTHONUNBUFFERED 1
ARG PIP_NO_CACHE_DIR=1

# Install Google Chrome
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -
RUN sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list'
RUN apt-get -y update
RUN apt-get install -y google-chrome-stable

# Copy files
WORKDIR /usr/src/app
COPY . .

# Upgrade pip
RUN pip install --upgrade pip

# Install dependencies from requirements.txt (generated beforehand via pip freeze, see README)
RUN pip install -r requirements.txt

RUN python chrome_driver_install.py

CMD python cloud_job.py
1 change: 1 addition & 0 deletions Pipfile
@@ -27,6 +27,7 @@ beautifulsoup4 = "*"
pylint-runner = "*"
webdriver-manager = "*"
apprise = "*"
python-dotenv = "*"

[dev-packages]

249 changes: 64 additions & 185 deletions Pipfile.lock

Large diffs are not rendered by default.

60 changes: 56 additions & 4 deletions README.md
@@ -161,15 +161,35 @@ First build the image inside the project's root directory:
$ docker build -t flathunter .
```

**When running a container using the image, a config file needs to be mounted on the container at ```/config.yaml```.** The example below provides the file ```config.yaml``` off the current working directory:
**When running a container using the image, a config file needs to be mounted on the container at ```/config.yaml```, or the configuration has to be supplied using environment variables.** The example below mounts the file ```config.yaml``` from the current working directory:

> **Review comment:** We should think about separating the README into separate .md-files and only use the main file as an index, because it is so large already.
>
> **codders (Author):** Yeah. That would make sense. If you have a proposal for how you would like to see that split, I would welcome that. At least the basic getting-started instructions should maybe be at the top of the main README, and people who want to know more can dig a little.


```sh
$ docker run --mount type=bind,source=$PWD/config.yaml,target=/config.yaml flathunter
```

#### Environment Configuration

To make deployment with docker easier, most of the important configuration options can be set with environment variables. The current list of recognised variables includes:

- `FLATHUNTER_TARGET_URLS` - a semicolon-separated list of URLs to crawl
- `FLATHUNTER_DATABASE_LOCATION` - the location on disk of the sqlite database, if required
- `FLATHUNTER_GOOGLE_CLOUD_PROJECT_ID` - the Google Cloud Project ID, for Google Cloud deployments
- `FLATHUNTER_VERBOSE_LOG` - set to any value to enable verbose logging
- `FLATHUNTER_LOOP_PERIOD_SECONDS` - the crawling interval, as a number of seconds
- `FLATHUNTER_MESSAGE_FORMAT` - a format string for the notification messages, where `#CR#` will be replaced by a newline
- `FLATHUNTER_NOTIFIERS` - a comma-separated list of notifiers to enable (e.g. `telegram,mattermost`)
- `FLATHUNTER_TELEGRAM_BOT_TOKEN` - the token for the Telegram notifier
- `FLATHUNTER_TELEGRAM_RECEIVER_IDS` - a comma-separated list of receiver IDs for Telegram notifications
- `FLATHUNTER_MATTERMOST_WEBHOOK_URL` - the webhook URL for Mattermost notifications
- `FLATHUNTER_WEBSITE_SESSION_KEY` - the secret session key used to secure sessions for the flathunter website deployment
- `FLATHUNTER_WEBSITE_DOMAIN` - the public domain of the flathunter website deployment
- `FLATHUNTER_2CAPTCHA_KEY` - the API key for 2captcha
- `FLATHUNTER_IMAGETYPERZ_TOKEN` - the API token for ImageTyperz
- `FLATHUNTER_HEADLESS_BROWSER` - set to any value to launch Google Chrome in headless mode (necessary for Docker installations)
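Configuration via environment variables pairs naturally with `docker run`. A minimal sketch of a Telegram-notifying deployment follows; the URL, token and receiver ID below are placeholders, not working values:

```shell
# Run the flathunter image configured entirely through environment variables.
# All values here are placeholders -- substitute your own search URL and credentials.
docker run \
  -e FLATHUNTER_TARGET_URLS="https://www.immobilienscout24.de/Suche/..." \
  -e FLATHUNTER_NOTIFIERS="telegram" \
  -e FLATHUNTER_TELEGRAM_BOT_TOKEN="123456:replace-me" \
  -e FLATHUNTER_TELEGRAM_RECEIVER_IDS="12345" \
  -e FLATHUNTER_HEADLESS_BROWSER=1 \
  flathunter
```

Alternatively, the same variables can be collected in a file and passed with docker's `--env-file` flag, which keeps secrets out of your shell history.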

### Google Cloud Deployment

You can run `Flathunter` on Google's App Engine, in the free tier, at no cost. To get started, first install the [Google Cloud SDK](https://cloud.google.com/sdk/docs) on your machine, and run:
You can run `Flathunter` on Google's App Engine, in the free tier, at no cost, as long as you don't need captcha solving. If you do need to solve captchas, you can use Google Cloud Run as described below. To get started, first install the [Google Cloud SDK](https://cloud.google.com/sdk/docs) on your machine, and run:

```
$ gcloud init
@@ -181,13 +201,15 @@ to setup the SDK. You will need to create a new cloud project (or connect to an
$ gcloud config set project flathunters
```

You will need to provide the project ID to the configuration file `config.yaml` as value to the key `google_cloud_project_id`.
You will need to provide the project ID to the configuration file `config.yaml` as value to the key `google_cloud_project_id` or in the `FLATHUNTER_GOOGLE_CLOUD_PROJECT_ID` environment variable.

Google Cloud [doesn't currently support Pipfiles](https://stackoverflow.com/questions/58546089/does-google-app-engine-flex-support-pipfile). To work around this restriction, the `Pipfile` and `Pipfile.lock` have been added to `.gcloudignore`, and a `requirements.txt` file has been generated using `pip freeze`.

If the Pipfile has been updated, you will need to remove the line `pkg-resources==0.0.0` from `requirements.txt` for a successful deploy.

To deploy the app, run:
#### Google App Engine Deployment

To deploy the app to Google App Engine, run:

```
$ gcloud app deploy
@@ -201,6 +223,36 @@ Instead of running with a timer, the web interface depends on periodic calls to
$ gcloud app deploy cron.yaml
```

#### Google Cloud Run Deployment

If you need captcha support (for example to scrape Immoscout), you will need to deploy using [Google Cloud Run](https://cloud.google.com/run/), so that you can embed the Chrome browser and Selenium Webdriver in the docker image. A separate `Dockerfile.gcloud.job` exists for this purpose.

First, ensure that `requirements.txt` has been created (per [Google Cloud Deployment](#google-cloud-deployment)), then either run:

```
docker build -t flathunter-job -f Dockerfile.gcloud.job .
```

to build the docker image locally, or edit the `cloudbuild.yaml` file to point to the container registry for your own Google Cloud Project, and run:

```
gcloud builds submit --region=europe-west1
```

to have [Google Cloud Build](https://cloud.google.com/build) build and tag the image for you.

You will need to create a new [Google Cloud Run Job](https://console.cloud.google.com/run/jobs) to execute the crawl/notify run. The job should be configured with 1GB of memory and 1 CPU, and the environment variables should be set appropriately.
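As a sketch, the job could also be created from the command line instead of the console. The project, image name and search URL below are placeholders, and depending on your SDK version the `run jobs` commands may still live under `gcloud beta`:

```shell
# Create a Cloud Run job with the resources described above (1GB memory, 1 CPU).
# "my-project" and the env var values are placeholders -- substitute your own.
gcloud run jobs create flathunter-job \
  --image gcr.io/my-project/flathunter-job \
  --region europe-west1 \
  --memory 1Gi \
  --cpu 1 \
  --set-env-vars "FLATHUNTER_TARGET_URLS=https://www.example.com/search,FLATHUNTER_HEADLESS_BROWSER=1"
```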

You can trigger the job with [Google Cloud Scheduler](https://console.cloud.google.com/cloudscheduler), by sending an HTTP POST to:

```
https://[REGION]-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/[PROJECT_ID]/jobs/[JOB_NAME]:run
```

For more information, check out [the Cloud Scheduler documentation](https://cloud.google.com/run/docs/execute/jobs-on-schedule).
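As an illustration, the trigger URL is assembled from your project values like so; everything below is a placeholder:

```shell
REGION="europe-west1"                # placeholder region
PROJECT_ID="my-flathunter-project"   # placeholder project ID
JOB_NAME="flathunter-job"            # placeholder job name

# Compose the Cloud Run job execution endpoint that Cloud Scheduler should POST to
TRIGGER_URL="https://${REGION}-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/${PROJECT_ID}/jobs/${JOB_NAME}:run"
echo "${TRIGGER_URL}"
```

Cloud Scheduler will additionally need an OAuth service account that has permission to invoke the job.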

Because the image uses Firestore to read user notification preferences and to store crawled exposés, the job can run without any additional configuration. If you are hosting the web interface somewhere on Google Cloud (either App Engine or Google Cloud Run), the job here will find the appropriate Firestore database.

## Usage

### Command-line Interface
11 changes: 11 additions & 0 deletions chrome_driver_install.py
@@ -0,0 +1,11 @@
import logging
import os

from flathunter.logging import wdm_logger
from webdriver_manager.chrome import ChromeDriverManager

# Cache the driver manager to local folder so that gunicorn can find it
os.environ['WDM_LOCAL'] = '1'
wdm_logger.setLevel(logging.INFO)

ChromeDriverManager().install()
26 changes: 26 additions & 0 deletions cloud_job.py
@@ -0,0 +1,26 @@
""" Startup file for Google Cloud deployment or local webserver"""
import logging
import os

from flathunter.googlecloud_idmaintainer import GoogleCloudIdMaintainer
from flathunter.web_hunter import WebHunter
from flathunter.config import Config
from flathunter.logging import logger, wdm_logger, configure_logging

from flathunter.web import app

config = Config()

# Load the driver manager from local cache (if chrome_driver_install.py has been run)
os.environ['WDM_LOCAL'] = '1'
# Use Google Cloud DB if we run on the cloud
id_watch = GoogleCloudIdMaintainer()

configure_logging(config)

# initialize search plugins for config
config.init_searchers()

hunter = WebHunter(config, id_watch)

hunter.hunt_flats()
10 changes: 10 additions & 0 deletions cloudbuild.yaml
@@ -0,0 +1,10 @@
steps:
# Build the container image
- name: 'gcr.io/cloud-builders/docker'
args: ['build', '-t', 'gcr.io/flathunters/flathunter-job', '-f', 'Dockerfile.gcloud.job', '.']
# Push the container image to Container Registry
- name: 'gcr.io/cloud-builders/docker'
args: ['push', 'gcr.io/flathunters/flathunter-job']
images:
- gcr.io/flathunters/flathunter-job

39 changes: 20 additions & 19 deletions flathunt.py
@@ -8,12 +8,11 @@
import os
import logging
import time
from pprint import pformat

from flathunter.logging import logger, wdm_logger
from flathunter.logging import logger, wdm_logger, configure_logging
from flathunter.idmaintainer import IdMaintainer
from flathunter.hunter import Hunter
from flathunter.config import Config
from flathunter.config import Config, Env
from flathunter.heartbeat import Heartbeat

__author__ = "Jan Harrie"
@@ -31,10 +30,10 @@ def launch_flat_hunt(config, heartbeat=None):
hunter.hunt_flats()
counter = 0

while config.get('loop', {}).get('active', False):
while config.loop_is_active():
counter += 1
counter = heartbeat.send_heartbeat(counter)
time.sleep(config.get('loop', {}).get('sleeping_time', 60 * 10))
time.sleep(config.loop_period_seconds())
hunter.hunt_flats()


@@ -45,7 +44,10 @@ def main():
" and sends results to Telegram User"),
epilog="Designed by Nody"
)
default_config_path = f"{os.path.dirname(os.path.abspath(__file__))}/config.yaml"
if Env.FLATHUNTER_TARGET_URLS is not None:
default_config_path = None
else:
default_config_path = f"{os.path.dirname(os.path.abspath(__file__))}/config.yaml"
parser.add_argument('--config', '-c',
type=argparse.FileType('r', encoding='UTF-8'),
default=default_config_path,
@@ -62,38 +64,37 @@

# load config
config_handle = args.config
config = Config(config_handle.name)
if config_handle is not None:
config = Config(config_handle.name)
else:
config = Config()

# adjust log level, if required
if config.get('verbose'):
logger.setLevel(logging.DEBUG)
# Allow logging of "webdriver-manager" module on verbose mode
wdm_logger.setLevel(logging.INFO)

logger.debug("Settings from config: %s", pformat(config))
# setup logging
configure_logging(config)

# initialize search plugins for config
config.init_searchers()

# check config
notifiers = config.get('notifiers', [])
notifiers = config.notifiers()
if 'mattermost' in notifiers \
and not config.get('mattermost', {}).get('webhook_url'):
and not config.mattermost_webhook_url():
logger.error("No Mattermost webhook configured. Starting like this would be pointless...")
return
if 'telegram' in notifiers:
if not config.get('telegram', {}).get('bot_token'):
if not config.telegram_bot_token():
logger.error(
"No Telegram bot token configured. Starting like this would be pointless..."
)
return
if not config.get('telegram', {}).get('receiver_ids'):
if len(config.telegram_receiver_ids()) == 0:
logger.warning("No Telegram receivers configured - nobody will get notifications.")
if 'apprise' in notifiers \
and not config.get('apprise', {}):
logger.error("No apprise url configured. Starting like this would be pointless...")
return
if not config.get('urls'):

if len(config.target_urls()) == 0:
logger.error("No URLs configured. Starting like this would be pointless...")
return
