
Update mpox tree builds to current snakemake workflow #1706

Merged · 13 commits · Sep 5, 2024
12 changes: 7 additions & 5 deletions src/backend/Dockerfile.nextstrain
@@ -56,13 +56,14 @@ RUN mkdir /ncov && \

# Add support for our custom mpox workflow
# TODO [Vincent & Dan; Aug 2024]: Update the `mpox` workflow content.
# TODO Convert to a specific commit rather than the overall subsample_by_distance branch
RUN mkdir /mpox && \
cd /mpox && \
git init && \
git remote add origin https://github.com/chanzuckerberg/monkeypox.git && \
git fetch origin subsampling && \
git reset --hard fd74f4b5f219035c9cbb7909b6f84f8a06fda76d
RUN chown nextstrain:nextstrain /mpox/config/exclude_accessions_mpxv.txt /mpox/config/clades.tsv
git remote add origin https://github.com/chanzuckerberg/mpox.git && \
git fetch origin subsample_by_distance && \
git reset --hard origin/subsample_by_distance
RUN chown nextstrain:nextstrain /mpox/phylogenetic/defaults/exclude_accessions.txt /mpox/phylogenetic/defaults/clades.tsv

RUN mkdir -p /ncov/auspice
RUN mkdir -p /ncov/logs
@@ -83,7 +84,8 @@ COPY . .
# Install the aspen package
RUN poetry install
RUN chmod a+w /ncov/auspice /ncov/logs
RUN chmod a+w /mpox/ /mpox/config
# [Vince] I'm not totally sure we need all three of these, but let's start with them.
RUN chmod a+w /mpox/ /mpox/phylogenetic /mpox/phylogenetic/defaults

# TODO - Mismatch between poetry and augur deps forces us to manually install jsonschema v3 here
RUN pip install jsonschema==3.*
@@ -14,14 +14,9 @@ def update_config(self, config):

class MPXPlugin(PathogenPlugin):
def update_config(self, config):
build_config = {}
try:
build_config = config["builds"]["aspen"]
config["subsampling_scheme"] = build_config["subsampling_scheme"]
del config["builds"]
except KeyError:
pass
subsampling_scheme = config["subsampling_scheme"]
build_config = config["builds"]["aspen"]
subsampling_scheme = build_config["subsampling_scheme"]
# Create escaped single quotes for interpolation into `--query` sections.
escaped_config = {}
for k, v in build_config.items():
if type(v) == str:
@@ -31,7 +26,3 @@ def update_config(self, config):
for _, sample in config["subsampling"][subsampling_scheme].items():
if sample.get("query"):
sample["query"] = sample["query"].format(**escaped_config)
if sample.get("max_sequences"):
sample["subsample-max-sequences"] = sample["max_sequences"]
del sample["max_sequences"]
config["subsampling"] = config["subsampling"][subsampling_scheme]
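The escaping-and-interpolation step in `MPXPlugin.update_config` above can be sketched as a standalone function. This is an illustrative reconstruction, not the project's exact code: the escape rule (backslash-escaping single quotes so a value can't break out of the quoted sections of a `--query` expression) is an assumption based on the comment in the diff.

```python
def interpolate_queries(config):
    """Sketch of how build args get interpolated into subsampling queries.

    Assumes the config shape shown in the PR's yaml template: a
    `builds.aspen` block naming a `subsampling_scheme`, and a
    `subsampling` block keyed by scheme name.
    """
    build_config = config["builds"]["aspen"]
    scheme = build_config["subsampling_scheme"]
    # Escape single quotes so interpolated values can't terminate the
    # quoted literals inside the pandas-style --query expressions.
    escaped = {
        k: v.replace("'", "\\'") if isinstance(v, str) else v
        for k, v in build_config.items()
    }
    for sample in config["subsampling"][scheme].values():
        if sample.get("query"):
            sample["query"] = sample["query"].format(**escaped)
    return config
```

Note the design choice the diff reflects: after this change the query strings already carry the `--query` prefix in the yaml, so the plugin only fills in placeholders rather than rewriting keys like `max_sequences`.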
@@ -1,21 +1,33 @@
subsampling_scheme: "OVERVIEW"

exclude: "config/exclude_accessions_mpxv.txt"
reference: "config/reference.fasta"
genemap: "config/genemap.gff"
genbank_reference: "config/reference.gb"
colors: "config/colors_mpxv.tsv"
lat_longs: "config/lat_longs.tsv"
auspice_config: "config/auspice_config_mpxv.json"
description: "config/description.md"
clades: "config/clades.tsv"
tree_mask: "config/tree_mask.tsv"
builds:
aspen:
region: global
country: {country}
division: {division}
location: {location}
subsampling_scheme: {tree_type}
title: CZ Gen Epi MPX Tree

# make sure build_name matches the sub-key of `builds` above.
# Not sure how to consolidate it, but not going to try to figure it out now.
build_name: "aspen"
auspice_name: "monkeypox_mpxv"

reference: "defaults/reference.fasta"
genome_annotation: "defaults/genome_annotation.gff3"
genbank_reference: "defaults/reference.gb"
include: "data/include.txt" # add this to our template
clades: "defaults/clades.tsv"
lat_longs: "defaults/lat_longs.tsv"
auspice_config: "defaults/legacy_auspice_config_mpxv.json"
description: "defaults/description.md"
tree_mask: "defaults/tree_mask.tsv"

strain_id_field: "accession"
display_strain_field: "strain_original"
display_strain_field: "strain"

build_name: "mpxv"
auspice_name: "monkeypox_mpxv"
filter:
min_date: 1950
min_length: 100000

## align
max_indel: 10000
@@ -29,110 +41,106 @@ timetree: false
root: "min_dev"
clock_rate: 3e-6
clock_std_dev: 6e-6
divergence_units: "mutations"

traits:
columns: ""
sampling_bias_correction: 3

## recency
recency: true

mask:
from_beginning: 1350
from_end: 6422
maskfile: "config/mask_overview.bed"
maskfile: "defaults/mask_overview.bed"

priorities:
crowding_penalty: 0 # Gets set by treetype during `export.py`

## Subsampling schemas
subsampling:

OVERVIEW:
group:
subsample-max-sequences: 500
query: "(location == '{location}') & (division == '{division}')"
min-length: 100000

max_sequences: 500
query: --query "(location == '{location}') & (division == '{division}')"

state:
subsample-max-sequences: 300
query: "(location != '{location}') & (division == '{division}')" # exclude add'l samples from {location}
max_sequences: 300
query: --query "(location != '{location}') & (division == '{division}')" # exclude add'l samples from {location}
priorities:
type: "proximity"
focus: "group"
min-length: 100000

country:
subsample-max-sequences: 300
query: "(division != '{division}') & (country == '{country}')" # exclude add'l samples from CA
max_sequences: 300
query: --query "(division != '{division}') & (country == '{country}')" # exclude add'l samples from CA
priorities:
type: "proximity"
focus: "group"
min-length: 100000


international:
subsample-max-sequences: 300
query: "(country != '{country}')" # this should capture samples that have no division or location info
max_sequences: 300
query: --query "(country != '{country}')" # exclude add'l samples from USA
priorities:
type: "proximity"
focus: "group"
min-length: 100000


international_serial_sampling:
group-by: ["region", "year"] # lots of samples have no "month" so in order to include them, we'll only go by "year"
sequences-per-group: 2
query: "(country != '{country}')"
min-length: 100000
group_by: "region year" # lots of samples have no "month" so in order to include them, we'll only go by "year"
seq_per_group: 2
query: --query "(country != '{country}')"

########################

TARGETED:
focal:
exclude-all: true
exclude: "--exclude-all"

closest:
subsample-max-sequences: 100 # this changes with number of samples in include.txt and that's good
max_sequences: 100
priorities:
type: "proximity"
focus: "focal"
min-length: 100000

group:
subsample-max-sequences: 25
query: "(location == '{location}') & (division == '{division}')"
max_sequences: 25
query: --query "(location == '{location}') & (division == '{division}')"
priorities:
type: "proximity"
focus: "focal"
min-length: 100000


state:
subsample-max-sequences: 25
query: "(location != '{location}') & (division == '{division}')" # exclude add'l samples from {location}
max_sequences: 25
query: --query "(location != '{location}') & (division == '{division}')" # exclude add'l samples from {location}
priorities:
type: "proximity"
focus: "focal"
min-length: 100000

country:
subsample-max-sequences: 25
query: "(division != '{division}') & (country == '{country}')" # exclude add'l samples from CA
max_sequences: 25
query: --query "(division != '{division}') & (country == '{country}')" # exclude add'l samples from CA
priorities:
type: "proximity"
focus: "focal"
min-length: 100000


international:
subsample-max-sequences: 25
query: "(country != '{country}')" # this should capture samples that have no division or location info
max_sequences: 25
query: --query "(country != '{country}')" # [Vince] huh? Original comment: "exclude add'l samples from USA"
priorities:
type: "proximity"
focus: "focal"
min-length: 100000

type: "proximity"
focus: "focal"

international_serial_sampling:
group-by: ["region", "year"] # lots of samples have no "month" so in order to include them, we'll only go by "year"
sequences-per-group: 2
query: "(country != '{country}')"
min-length: 100000


group_by: "region year" # lots of samples have no "month" so in order to include them, we'll only go by "year"
seq_per_group: 2
query: --query "(country != '{country}')"

########################

NON_CONTEXTUALIZED:
group:
group-by:
- "year"
subsample-max-sequences: 1000
query: "(location == '{location}') & (division == '{division}')"
min-length: 100000
group_by: "year"
max_sequences: 1000
query: --query "(location == '{location}') & (division == '{division}')"
9 changes: 8 additions & 1 deletion src/backend/aspen/workflows/nextstrain_run/export.py
@@ -153,6 +153,9 @@ def cli(


# For local debugging of our yaml building process.
# Would be better to re-structure the main `export_run_config` process so yaml
# output happens earlier and we just exit early if --builds-file-only flag is
# on rather than having a separate code path for that flag being on.
def dump_yaml_template(
phylo_run_id: int,
builds_file_fh: io.TextIOWrapper,
@@ -176,7 +179,11 @@ def dump_yaml_template(
session, phylo_run.pathogen, phylo_run.template_args, group
)
builder: TemplateBuilder = TemplateBuilder(
phylo_run.tree_type, phylo_run.group, resolved_template_args, **context
phylo_run.tree_type,
phylo_run.pathogen,
Contributor Author:

This is a straight bugfix. The TemplateBuilder init args expect: tree_type, pathogen, group, template_args, **kwargs. Somehow we've been missing passing the pathogen to the TemplateBuilder. I'd think that would have pretty major ramifications though, so I'm surprised this didn't get noticed sooner.
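The mismatch described in the review comment can be sketched with a hypothetical stand-in for TemplateBuilder. The class below is illustrative only — the real signature (per the comment) is `(tree_type, pathogen, group, template_args, **kwargs)`, and its fields here are placeholders.

```python
from dataclasses import dataclass, field


@dataclass
class Builder:
    # Hypothetical stand-in for TemplateBuilder's __init__ parameters.
    tree_type: str
    pathogen: str
    group: str
    template_args: dict = field(default_factory=dict)


# Buggy call shape from dump_yaml_template: `pathogen` omitted, so the
# group lands in the pathogen slot and template_args in the group slot.
shifted = Builder("TARGETED", "my-group", {"location": "CA"})

# Fixed call shape: every argument in its intended slot.
fixed = Builder("TARGETED", "mpox", "my-group", {"location": "CA"})
```

Because `template_args` has a default here, the buggy call doesn't even raise — the arguments just shift silently, which is consistent with the bug going unnoticed for a while.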

Contributor Author:

Whoops, I got myself a bit confused. This is still a straight bugfix, but it's fixing a seldom-used development function, dump_yaml_template, that we can use to generate only the build config yaml. We hadn't used it for a while, and it never got updated while the TemplateBuilder usage where it mattered (later in this same file) did get updated. I forgot that context when I wrote the above comment. But that's why there were no major ramifications and it didn't get noticed: it almost never gets used, and I only wound up using it for this work since a lot of my work was around fixing up the yaml config.

Collaborator:

dump_yaml_template was intended for use during development. Nice catch!

phylo_run.group,
resolved_template_args,
**context,
)
builder.write_file(builds_file_fh)

44 changes: 22 additions & 22 deletions src/backend/aspen/workflows/nextstrain_run/run_nextstrain_mpx.sh
@@ -33,10 +33,10 @@ set -x

# Download the latest mpox exclusions and clades list. This happens at RUN time, not BUILD time so that
# we are always building trees with the latest upstream filters.
wget https://raw.githubusercontent.com/nextstrain/mpox/master/phylogenetic/defaults/exclude_accessions.txt -O /mpox/config/exclude_accessions_mpxv.txt
wget https://raw.githubusercontent.com/nextstrain/mpox/master/phylogenetic/defaults/clades.tsv -O /mpox/config/clades.tsv
wget https://raw.githubusercontent.com/nextstrain/mpox/master/phylogenetic/defaults/exclude_accessions.txt -O /mpox/phylogenetic/defaults/exclude_accessions.txt
wget https://raw.githubusercontent.com/nextstrain/mpox/master/phylogenetic/defaults/clades.tsv -O /mpox/phylogenetic/defaults/clades.tsv

mkdir -p /mpox/data
mkdir -p /mpox/phylogenetic/data
key_prefix="phylo_run/${S3_FILESTEM}/${WORKFLOW_ID}"
s3_prefix="s3://${aspen_s3_db_bucket}/${key_prefix}"

@@ -52,12 +52,12 @@ mpox_git_rev=$(cd /mpox && git rev-parse HEAD)
aligned_upstream_location=$(
python3 /usr/src/app/aspen/workflows/nextstrain_run/export.py \
--phylo-run-id "${WORKFLOW_ID}" \
--sequences /mpox/data/sequences_czge.fasta \
--metadata /mpox/data/metadata_czge.tsv \
--selected /mpox/data/include.txt \
--sequences /mpox/phylogenetic/data/sequences_czge.fasta \
--metadata /mpox/phylogenetic/data/metadata_czge.tsv \
--selected /mpox/phylogenetic/data/include.txt \
--sequence-type aligned \
--resolved-template-args "${RESOLVED_TEMPLATE_ARGS_SAVEFILE}" \
--builds-file /mpox/config/build_czge.yaml \
--builds-file /mpox/phylogenetic/build_czge.yaml \
--reset-status
)

Expand All @@ -66,33 +66,33 @@ aligned_upstream_sequences_s3_key=$(echo "${aligned_upstream_location}" | jq -r
aligned_upstream_metadata_s3_key=$(echo "${aligned_upstream_location}" | jq -r .metadata_key)

# fetch the upstream dataset
if [ ! -e /mpox/data/upstream_sequences.fasta ]; then
$aws s3 cp --no-progress "s3://${aligned_upstream_s3_bucket}/${aligned_upstream_sequences_s3_key}" /mpox/data/upstream_sequences.fasta.xz
unxz /mpox/data/*.xz
if [ ! -e /mpox/phylogenetic/data/upstream_sequences.fasta ]; then
$aws s3 cp --no-progress "s3://${aligned_upstream_s3_bucket}/${aligned_upstream_sequences_s3_key}" /mpox/phylogenetic/data/upstream_sequences.fasta.xz
unxz /mpox/phylogenetic/data/*.xz
fi
if [ ! -e /mpox/data/upstream_metadata.tsv ]; then
$aws s3 cp --no-progress "s3://${aligned_upstream_s3_bucket}/${aligned_upstream_metadata_s3_key}" /mpox/data/upstream_metadata.tsv.xz
unxz /mpox/data/*.xz
if [ ! -e /mpox/phylogenetic/data/upstream_metadata.tsv ]; then
$aws s3 cp --no-progress "s3://${aligned_upstream_s3_bucket}/${aligned_upstream_metadata_s3_key}" /mpox/phylogenetic/data/upstream_metadata.tsv.xz
unxz /mpox/phylogenetic/data/*.xz
fi

# If we've written out any samples, add them to the upstream metadata/fasta files
if [ -e /mpox/data/sequences_czge.fasta ]; then
python3 /usr/src/app/aspen/workflows/nextstrain_run/merge_mpx.py --required-metadata /mpox/data/metadata_czge.tsv --required-sequences /mpox/data/sequences_czge.fasta --upstream-metadata /mpox/data/upstream_metadata.tsv --upstream-sequences /mpox/data/upstream_sequences.fasta --destination-metadata /mpox/data/metadata.tsv --destination-sequences /mpox/data/sequences.fasta --required-match-column strain --upstream-match-column accession
if [ -e /mpox/phylogenetic/data/sequences_czge.fasta ]; then
python3 /usr/src/app/aspen/workflows/nextstrain_run/merge_mpx.py --required-metadata /mpox/phylogenetic/data/metadata_czge.tsv --required-sequences /mpox/phylogenetic/data/sequences_czge.fasta --upstream-metadata /mpox/phylogenetic/data/upstream_metadata.tsv --upstream-sequences /mpox/phylogenetic/data/upstream_sequences.fasta --destination-metadata /mpox/phylogenetic/data/metadata.tsv --destination-sequences /mpox/phylogenetic/data/sequences.fasta --required-match-column strain --upstream-match-column accession
else
cp /mpox/data/upstream_metadata.tsv /mpox/data/metadata.tsv
cp /mpox/data/upstream_sequences.fasta /mpox/data/sequences.fasta
cp /mpox/phylogenetic/data/upstream_metadata.tsv /mpox/phylogenetic/data/metadata.tsv
cp /mpox/phylogenetic/data/upstream_sequences.fasta /mpox/phylogenetic/data/sequences.fasta
fi;

# Persist the build config we generated.
$aws s3 cp /mpox/config/build_czge.yaml "${s3_prefix}/build_czge.yaml"
$aws s3 cp /mpox/data/include.txt "${s3_prefix}/include.txt"
$aws s3 cp /mpox/phylogenetic/build_czge.yaml "${s3_prefix}/build_czge.yaml"
$aws s3 cp /mpox/phylogenetic/data/include.txt "${s3_prefix}/include.txt"

# run snakemake, if run fails export the logs from snakemake to s3
(cd /mpox && snakemake --printshellcmds --configfile config/build_czge.yaml --resources=mem_mb=312320) || { $aws s3 cp /mpox/.snakemake/log/ "${s3_prefix}/logs/snakemake/" --recursive ; $aws s3 cp /mpox/results/mpxv/filter.log "${s3_prefix}/logs/mpox/" --recursive ; }
(cd /mpox/phylogenetic && snakemake --printshellcmds --configfile build_czge.yaml --resources=mem_mb=312320) || { $aws s3 cp /mpox/phylogenetic/.snakemake/log/ "${s3_prefix}/logs/snakemake/" --recursive ; $aws s3 cp /mpox/phylogenetic/results/aspen/logs/ "${s3_prefix}/logs/mpox/" --recursive ; }
Collaborator (danrlu, Sep 5, 2024):

The log location changed with the subsampling update, so I changed it accordingly.


# upload the tree to S3. The variable key is created to use later
key="${key_prefix}/mpx_czge.json"
$aws s3 cp /mpox/auspice/monkeypox_mpxv.json "s3://${aspen_s3_db_bucket}/${key}"
$aws s3 cp /mpox/phylogenetic/auspice/monkeypox_mpxv.json "s3://${aspen_s3_db_bucket}/${key}"

# update aspen
aspen_workflow_rev=WHATEVER
@@ -111,4 +111,4 @@ python3 /usr/src/app/aspen/workflows/nextstrain_run/save.py \
--bucket "${aspen_s3_db_bucket}" \
--key "${key}" \
--resolved-template-args "${RESOLVED_TEMPLATE_ARGS_SAVEFILE}" \
--tree-path /mpox/auspice/monkeypox_mpxv.json \
--tree-path /mpox/phylogenetic/auspice/monkeypox_mpxv.json \