Skip to content

Scrape

Scrape #5673

Workflow file for this run

name: "Scrape"
on:
workflow_dispatch:
schedule:
- cron: "0 3,6,9,12,15,18,21 * * *"
- cron: "0 0 * * *"
jobs:
scrape:
name: Scrape
runs-on: ubuntu-latest
env:
FETCH_NUM_MONTHS: "${{ github.event.schedule == '0 0 * * *' && '12' || '3' }}"
steps:
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: '3.9'
cache: 'pip'
- name: Install Python dependencies
run: pip install -r requirements.txt
- name: Install VPN
run: wget --quiet --content-disposition https://github.com/mullvad/mullvadvpn-app/releases/download/2025.2/MullvadVPN-2025.2_amd64.deb && sudo apt install -y ./MullvadVPN-2025.2_amd64.deb
- name: Connect to VPN
run: |-
mullvad account login ${{ secrets.MULLVAD_ACCOUNT_NUMBER }}
mullvad relay set location us
mullvad connect --wait
- name: Fetch
run: python scripts/00-fetch.py --overwrite --expand --num-months $FETCH_NUM_MONTHS
- name: Log out of VPN
if: always()
run: |-
mullvad disconnect
mullvad account logout
- name: Config git
run: git config --global user.email "[email protected]" && git config --global user.name "Automated"
- name: Commit changes
run: git add data/fetched && (git diff --cached --quiet || git commit -m "Refresh last $FETCH_NUM_MONTHS months of data")
- name: Get discovery dates
run: python scripts/01-get-discovery-dates.py --num-months $FETCH_NUM_MONTHS
- name: Create filtered datasets
run: python scripts/02-filter.py
- name: Write RSS
run: python scripts/03-generate-rss.py
- name: Commit changes
run: git add data/processed && (git diff --cached --quiet || git commit -m "Generate discovery dates and RSS")
- name: Push changes
run: git push