Merge pull request #3 from populationgenomics/first-test-workflow

First test workflow
populationgenomics · Jan 8, 2025 · 1cc6848 · 1cc6848
2 parents c64a271 + a2487ee
commit 1cc6848
Show file tree

Hide file tree

Showing 20 changed files with 2,164 additions and 333 deletions.
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -0,0 +1 @@
+* @populationgenomics/software-team
diff --git a/.gitignore b/.gitignore
@@ -174,3 +174,5 @@ poetry.toml
 pyrightconfig.json
 
 # End of https://www.toptal.com/developers/gitignore/api/python
+
+.vscode/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -16,14 +16,14 @@ repos:
       - id: check-added-large-files
 
   - repo: https://github.com/igorshubovych/markdownlint-cli
-    rev: v0.42.0
+    rev: v0.43.0
     hooks:
       - id: markdownlint
         args: [-s, .markdownlint.json]
 
   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
-    rev: v0.7.3
+    rev: v0.8.3
     hooks:
       - id: ruff
         name: ruff (format)
@@ -42,7 +42,7 @@ repos:
             --install-types,
             --non-interactive,
           ]
-        additional_dependencies: [types-PyYAML==6.0.4, types-toml]
+        additional_dependencies: [mypy, types-PyYAML==6.0.4, types-toml, types-requests]
 
   - repo: https://github.com/opensource-nepal/commitlint
     rev: v1.3.0

diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Centre for Population Genomics
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,84 @@
+
+<h1 align="center">
+  <br>
+  <a href="https://github.com/populationgenomics/test_workflows_shared"><img src="./assets/tws.jpg" alt="Test Workflows Shared" width="200"></a>
+  <br>
+  Test Workflows Shared
+  <br>
+</h1>
+
+<h4 align="center">A template test workflows repository that works with <a href="https://github.com/populationgenomics/cpg-flow" target="_blank">CPG Flow</a></h4>
+
+<p align="center">
+  <a href="https://img.shields.io/github/actions/workflow/status/populationgenomics/test_workflows_shared/security.yaml?style=for-the-badge&label=pip-audit">
+    <img alt="GitHub Actions Workflow Status" src="https://img.shields.io/github/actions/workflow/status/populationgenomics/test_workflows_shared/security.yaml?style=for-the-badge&label=pip-audit">
+  </a>
+  <a href="https://img.shields.io/github/license/populationgenomics/test_workflows_shared?style=for-the-badge">
+    <img alt="GitHub License" src="https://img.shields.io/github/license/populationgenomics/test_workflows_shared?style=for-the-badge">
+  </a>
+  <a href="https://img.shields.io/python/required-version-toml?tomlFilePath=https%3A%2F%2Fraw.githubusercontent.com%2Fpopulationgenomics%2Ftest_workflows_shared%2Fmain%2Fpyproject.toml&style=for-the-badge">
+    <img alt="Python Version from PEP 621 TOML" src="https://img.shields.io/python/required-version-toml?tomlFilePath=https%3A%2F%2Fraw.githubusercontent.com%2Fpopulationgenomics%2Ftest_workflows_shared%2Fmain%2Fpyproject.toml&style=for-the-badge">
+  </a>
+</p>
+
+<p align="center">
+  <a href="#key-features">Key Features</a> •
+  <a href="#how-to-use">How To Use</a> •
+  <a href="#editing-in-an-ide">Editing in an IDE</a> •
+  <a href="#related">Related</a> •
+  <a href="#license">License</a>
+</p>
+
+
+## Key Features
+
+* Uses `uv` to manage dependencies
+* Uses `analysis-runner` to run the test workflow
+* The `jobs` and `stages` are defined in separate files:
+  * The `cpg_flow_test/jobs/` directory contains the job definitions that can be reused across stages.
+  * The `cpg_flow_test/stages.py` file contains the stage definitions, which call the jobs.
+* The `cpg_flow_test/workflow.py` file contains the test workflow definition.
+
+## How To Use
+
+From your command line:
+
+```bash
+# Clone this repository
+$ git clone https://github.com/populationgenomics/test_workflows_shared
+
+# Go into the repository
+$ cd test_workflows_shared
+
+# Go to the test folder
+$ cd cpg_flow_test
+
+# Run the test with the bash script
+$ chmod +x run-test-workflow.sh
+$ ./run-test-workflow.sh
+```
+
+> **Note**
+> You will need to have `analysis-runner` installed in your environment. See the [analysis-runner](https://github.com/populationgenomics/analysis-runner) repository for more information, or install it with `pipx install analysis-runner`.
+
+## Editing in an IDE
+
+To enable syntax highlighting in your IDE, you will need to install dependencies.
+
+```bash
+# Install dependencies
+# `uv` documentation: https://docs.astral.sh/uv/
+$ uv sync
+
+# Activate the virtual environment
+$ source .venv/bin/activate
+```
+
+## Related
+
+[cpg-flow](https://github.com/populationgenomics/cpg-flow) - supports various stages of genomic data processing, from raw data ingestion to final analysis outputs, making it easier for researchers to manage and scale their population genomics workflows.
+
+## License
+
+[MIT](LICENSE)
diff --git a/assets/tws.jpg b/assets/tws.jpg
diff --git a/cpg_flow_test/__init__.py b/cpg_flow_test/__init__.py
diff --git a/config.toml → cpg_flow_test/config.toml b/config.toml → cpg_flow_test/config.toml
@@ -1,16 +1,25 @@
 [workflow]
 dataset = 'fewgenomes'
+
+# Note: for the fewgenomes and sandbox datasets, mentioning datasets by name is not a security risk.
+# DO NOT DO THIS FOR OTHER DATASETS
+# fewgenomes only cohort = COH2142
+# sandbox only cohort = COH2209
+# fewgenomes + sandbox cohort = COH2217
+
 input_cohorts = ['COH2142']
 access_level = 'test'
 
 # Force stage rerun
 force_stages = [
     'GeneratePrimes',
     'CumulativeCalc',
-    # 'FilterEvens',
-    # 'BuildAPrimePyramid',
+    'FilterEvens',
+    'BuildAPrimePyramid',
 ]
 
+status_reporter = 'metamist'
+
 # Cohorts to use as inputs.
 #input_cohorts = []
 

diff --git a/cpg_flow_test/jobs/__init__.py b/cpg_flow_test/jobs/__init__.py
@@ -0,0 +1,5 @@
+from jobs.build_pyramid import build_pyramid
+from jobs.cumulative_calc import cumulative_calc
+from jobs.filter_evens import filter_evens
+from jobs.first_n_primes import first_n_primes
+from jobs.iterative_digit_sum import iterative_digit_sum
diff --git a/cpg_flow_test/jobs/build_pyramid.py b/cpg_flow_test/jobs/build_pyramid.py
@@ -0,0 +1,63 @@
+from typing import Any
+
+from cpg_flow.targets.sequencing_group import SequencingGroup
+from hailtop.batch import Batch
+from hailtop.batch.job import Job
+
+
+def build_pyramid(
+    b: Batch,
+    sequencing_groups: list[SequencingGroup],
+    input_files: dict[str, Any],
+    output_file_path: str,
+) -> list[Job]:
+    title = 'Build A Pyramid'
+    # Compute the no evens list for each sequencing group
+    sg_jobs = []
+    sg_output_files = []
+    for sg in sequencing_groups:  # type: ignore
+        job = b.new_job(name=title + ': ' + sg.id)
+        no_evens_input_file_path = input_files[sg.id]['no_evens']
+        no_evens_input_file = b.read_input(no_evens_input_file_path)
+
+        id_sum_input_file_path = input_files[sg.id]['id_sum']
+        id_sum_input_file = b.read_input(id_sum_input_file_path)
+
+        pyramid_output_file_path = str(sg.dataset.prefix() / f'{sg.id}_pyramid.txt')
+        sg_output_files.append(pyramid_output_file_path)
+        cmd = f"""
+            pyramid=()
+            max_row_size=$(cat {no_evens_input_file} | rev | cut -d' ' -f1 | rev)
+            rows=($(cat {no_evens_input_file} | cut -d' ' -f2-))
+            # Add header
+            pyramid+=("Prime Pyramid for {sg.id}")
+            pyramid+=("Generated N: $(cat {id_sum_input_file})")
+
+            for row in "${{rows[@]}}"; do
+                total_spaces=$((max_row_size - row))
+                left_spaces=$((total_spaces / 2))
+                right_spaces=$((total_spaces - left_spaces))
+                pyramid+=("$(printf '%*s' $left_spaces)$(printf '%*s' $row | tr ' ' '*')$(printf '%*s' $right_spaces)")
+                pyramid+=("$(printf '%*s' $left_spaces)$(printf '%*s' $row | tr ' ' '*')$(printf '%*s' $right_spaces)")
+            done
+
+            printf "%s\\n" "${{pyramid[@]}}" > {job.pyramid_file}
+        """
+
+        job.command(cmd)
+        b.write_output(job.pyramid_file, pyramid_output_file_path)
+        sg_jobs.append(job)
+
+    # Merge the no evens lists for all sequencing groups into a single file
+    job = b.new_job(name=title)
+    job.depends_on(*sg_jobs)
+    inputs = ' '.join([b.read_input(f) for f in sg_output_files])
+    job.command(f'cat {inputs} >> {job.pyramid}')
+    b.write_output(job.pyramid, output_file_path)
+
+    print('-----PRINT PYRAMID-----')
+    print(output_file_path)
+
+    all_jobs = [job, *sg_jobs]
+
+    return all_jobs
diff --git a/cpg_flow_test/jobs/cumulative_calc.py b/cpg_flow_test/jobs/cumulative_calc.py
@@ -0,0 +1,33 @@
+from cpg_flow.targets.sequencing_group import SequencingGroup
+from hailtop.batch import Batch
+from hailtop.batch.job import Job
+
+
+def cumulative_calc(
+    b: Batch,
+    sequencing_group: SequencingGroup,
+    input_file_path: str,
+    output_file_path: str,
+) -> list[Job]:
+    title = f'Cumulative Calc: {sequencing_group.id}'
+    job = b.new_job(name=title)
+    primes_path = b.read_input(input_file_path)
+
+    cmd = f"""
+    primes=($(cat {primes_path}))
+    csum=0
+    cumulative=()
+    for prime in "${{primes[@]}}"; do
+        ((csum += prime))
+        cumulative+=("$csum")
+    done
+    echo "${{cumulative[@]}}" > {job.cumulative}
+    """
+
+    job.command(cmd)
+
+    print('-----PRINT CUMULATIVE-----')
+    print(output_file_path)
+    b.write_output(job.cumulative, output_file_path)
+
+    return job
diff --git a/cpg_flow_test/jobs/filter_evens.py b/cpg_flow_test/jobs/filter_evens.py
@@ -0,0 +1,58 @@
+from typing import Any
+
+from cpg_flow.stage import Stage, StageInput
+from cpg_flow.targets.sequencing_group import SequencingGroup
+from hailtop.batch import Batch
+from hailtop.batch.job import Job
+
+
+def filter_evens(
+    b: Batch,
+    inputs: StageInput,
+    previous_stage: Stage,
+    sequencing_groups: list[SequencingGroup],
+    input_files: dict[str, dict[str, Any]],
+    sg_outputs: dict[str, dict[str, Any]],
+    output_file_path: str,
+) -> list[Job]:
+    title = 'Filter Evens'
+
+    # Compute the no evens list for each sequencing group
+    sg_jobs = []
+    sg_output_files = []
+    for sg in sequencing_groups:  # type: ignore
+        job = b.new_job(name=title + ': ' + sg.id)
+        input_file_path = input_files[sg.id]['cumulative']
+        input_file_path = inputs.as_path(sg, previous_stage, 'cumulative')
+        no_evens_input_file = b.read_input(input_file_path)
+        no_evens_output_file_path = str(sg_outputs[sg.id])
+        sg_output_files.append(no_evens_output_file_path)
+
+        cmd = f"""
+        numbers=($(cat {no_evens_input_file}))
+        result=()
+        for num in "${{numbers[@]}}"; do
+            if (( num % 2 != 0 )); then
+                result+=("$num")
+            fi
+        done
+        echo "{sg.id}: ${{result[@]}}" > {job.sg_no_evens_file}
+        """
+
+        job.command(cmd)
+        b.write_output(job.sg_no_evens_file, no_evens_output_file_path)
+        sg_jobs.append(job)
+
+    # Merge the no evens lists for all sequencing groups into a single file
+    job = b.new_job(name=title)
+    job.depends_on(*sg_jobs)
+    inputs = ' '.join([b.read_input(f) for f in sg_output_files])
+    job.command(f'cat {inputs} >> {job.no_evens_file}')
+    b.write_output(job.no_evens_file, output_file_path)
+
+    print('-----PRINT NO EVENS-----')
+    print(output_file_path)
+
+    all_jobs = [job, *sg_jobs]
+
+    return all_jobs
diff --git a/cpg_flow_test/jobs/first_n_primes.py b/cpg_flow_test/jobs/first_n_primes.py
@@ -0,0 +1,51 @@
+from cpg_flow.targets.sequencing_group import SequencingGroup
+from hailtop.batch import Batch
+from hailtop.batch.job import Job
+
+
+def first_n_primes(
+    b: Batch, sequencing_group: SequencingGroup, input_file_path: str, output_file_path: str, depends_on: Job,
+) -> list[Job]:
+    title = f'First N Primes: {sequencing_group.id}'
+    job = b.new_job(name=title)
+    id_sum_path = b.read_input(input_file_path)
+
+    if depends_on:
+        job.depends_on(depends_on)
+
+    cmd = f"""
+    is_prime() {{
+        local num=$1
+        if [ $num -lt 2 ]; then
+            echo 0
+            return
+        fi
+        for ((i=2; i*i<=$num; i++)); do
+            if [ $(($num % $i)) -eq 0 ]; then
+                echo 0
+                return
+            fi
+        done
+        echo 1
+    }}
+
+    n=$(cat {id_sum_path})  # Replace with the desired number of primes
+    primes=()
+    candidate=2
+    while [ ${{#primes[@]}} -lt $n ]; do
+        if [ $(is_prime $candidate) -eq 1 ]; then
+            primes+=($candidate)
+        fi
+        candidate=$((candidate + 1))
+    done
+
+    echo ${{primes[@]}} > {job.primes}
+    """
+
+    job.command(cmd)
+
+    print('-----PRINT PRIMES-----')
+    print(output_file_path)
+    b.write_output(job.primes, output_file_path)
+
+    return job
Original file line number	Diff line number	Diff line change
Expand Up		@@ -174,3 +174,5 @@ poetry.toml
		pyrightconfig.json

		# End of https://www.toptal.com/developers/gitignore/api/python

		.vscode/