From 222b6b5baf30c6deca5504630eda0ec539fdf16a Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Fri, 30 Aug 2024 11:41:49 -0700 Subject: [PATCH 01/13] Fix yaml dump only to have expected args --- src/backend/aspen/workflows/nextstrain_run/export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/aspen/workflows/nextstrain_run/export.py b/src/backend/aspen/workflows/nextstrain_run/export.py index a6872c60f..1e00c5456 100644 --- a/src/backend/aspen/workflows/nextstrain_run/export.py +++ b/src/backend/aspen/workflows/nextstrain_run/export.py @@ -176,7 +176,7 @@ def dump_yaml_template( session, phylo_run.pathogen, phylo_run.template_args, group ) builder: TemplateBuilder = TemplateBuilder( - phylo_run.tree_type, phylo_run.group, resolved_template_args, **context + phylo_run.tree_type, phylo_run.pathogen, phylo_run.group, resolved_template_args, **context ) builder.write_file(builds_file_fh) From ffff0b889d0706694ec03d52e8d506170abffb76 Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Fri, 30 Aug 2024 14:11:56 -0700 Subject: [PATCH 02/13] WIP: Alter base MPX template for new mpox flow --- .../builds_templates/MPX_template.yaml | 152 ++++++++++-------- .../aspen/workflows/nextstrain_run/export.py | 3 + 2 files changed, 87 insertions(+), 68 deletions(-) diff --git a/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml b/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml index 97a3bda20..cbf0fe02b 100644 --- a/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml +++ b/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml @@ -1,21 +1,33 @@ -subsampling_scheme: "OVERVIEW" - -exclude: "config/exclude_accessions_mpxv.txt" -reference: "config/reference.fasta" -genemap: "config/genemap.gff" -genbank_reference: "config/reference.gb" -colors: "config/colors_mpxv.tsv" -lat_longs: "config/lat_longs.tsv" -auspice_config: "config/auspice_config_mpxv.json" -description: "config/description.md" -clades: "config/clades.tsv" -tree_mask: "config/tree_mask.tsv" +builds: + aspen_mpx: + region: global + country: {country} + division: {division} + location: {location} + subsampling_scheme: {tree_type} + title: CZ Gen Epi MPX Tree # VOODOO verify gets replaced with a more specific title in builder_base.py + +# make sure build_name matches the sub-key of `builds` above. +# Not sure how to consolidate it, but not going to try to figure it out now. 
+build_name: "aspen_mpx" +auspice_name: "monkeypox_mpxv" # VOODOO Unclear if this gets used or will be overwritten + +reference: "defaults/reference.fasta" +genome_annotation: "defaults/genome_annotation.gff3" +genbank_reference: "defaults/reference.gb" +include: "data/include.txt" # add this to our template +clades: "defaults/clades.tsv" +lat_longs: "defaults/lat_longs.tsv" +auspice_config: "defaults/hmpxv1/auspice_config.json" # replace this with our own +description: "defaults/description.md" # replace this with our own +tree_mask: "defaults/tree_mask.tsv" strain_id_field: "accession" -display_strain_field: "strain_original" +display_strain_field: "strain" -build_name: "mpxv" -auspice_name: "monkeypox_mpxv" +filter: + min_date: 1950 + min_length: 100000 ## align max_indel: 10000 @@ -25,10 +37,15 @@ seed_spacing: 1000 fix_tree: true ## refine -timetree: false +timetree: true root: "min_dev" clock_rate: 3e-6 clock_std_dev: 6e-6 +divergence_units: "mutations" + +traits: + columns: "" + sampling_bias_correction: 3 ## recency recency: true @@ -36,103 +53,102 @@ recency: true mask: from_beginning: 1350 from_end: 6422 - maskfile: "config/mask_overview.bed" + maskfile: "defaults/mask_overview.bed" + +priorities: + crowding_penalty: 0 ## Subsampling schemas subsampling: OVERVIEW: group: - subsample-max-sequences: 500 - query: "(location == '{location}') & (division == '{division}')" - min-length: 100000 - + group_by: "year month" + max_sequences: 2000 + query: --query "(location == '{location}') & (division == '{division}')" + state: - subsample-max-sequences: 300 - query: "(location != '{location}') & (division == '{division}')" # exclude add'l samples from {location} + group_by: "location year month" + max_sequences: 500 + query: --query "(location != '{location}') & (division == '{division}')" # exclude add'l samples from {location} priorities: type: "proximity" focus: "group" - min-length: 100000 country: - subsample-max-sequences: 300 - query: "(division != '{division}') & (country == '{country}')" # exclude add'l samples from CA + group_by: "division year month" + max_sequences: 400 + query: --query "(division != '{division}') & (country == '{country}')" # exclude add'l samples from CA priorities: type: "proximity" focus: "group" - min-length: 100000 - + international: - subsample-max-sequences: 300 - query: "(country != '{country}')" # this should capture samples that have no division or location info + group_by: "region year" # VOODOO [copied Jess approach and comment] lots of samples have no "month" so in order to include them, we'll only go by "year" + max_sequences: 100 + query: --query "(country != '{country}')" # exclude add'l samples from USA priorities: type: "proximity" focus: "group" - min-length: 100000 - + international_serial_sampling: - group-by: ["region", "year"] # lots of samples have no "month" so in order to include them, we'll only go by "year" - sequences-per-group: 2 - query: "(country != '{country}')" - min-length: 100000 + group_by: "region year" # VOODOO [copied Jess approach and comment] lots of samples have no "month" so in order to include them, we'll only go by "year" + seq_per_group: 2 + query: --query "(country != '{country}')" + ######################## TARGETED: focal: - exclude-all: true - + exclude: "--exclude-all" + closest: - subsample-max-sequences: 100 # this changes with number of samples in include.txt and that's good + max_sequences: 100 priorities: type: "proximity" focus: "focal" - min-length: 100000 group: - subsample-max-sequences: 25 - query: 
"(location == '{location}') & (division == '{division}')" + group_by: "year month" + max_sequences: 25 + query: --query "(location == '{location}') & (division == '{division}')" priorities: type: "proximity" focus: "focal" - min-length: 100000 - + state: - subsample-max-sequences: 25 - query: "(location != '{location}') & (division == '{division}')" # exclude add'l samples from {location} + group_by: "location year month" + max_sequences: 25 + query: --query "(location != '{location}') & (division == '{division}')" # exclude add'l samples from {location} priorities: type: "proximity" focus: "focal" - min-length: 100000 country: - subsample-max-sequences: 25 - query: "(division != '{division}') & (country == '{country}')" # exclude add'l samples from CA + group_by: "division year month" + max_sequences: 25 + query: --query "(division != '{division}') & (country == '{country}')" # exclude add'l samples from CA priorities: type: "proximity" focus: "focal" - min-length: 100000 - + international: - subsample-max-sequences: 25 - query: "(country != '{country}')" # this should capture samples that have no division or location info + group_by: "region year" # VOODOO [copied Jess approach and comment] lots of samples have no "month" so in order to include them, we'll only go by "year" + max_sequences: 25 + query: --query "(country != '{country}')" # VOODOO: huh? Original comment: "exclude add'l samples from USA" priorities: - type: "proximity" - focus: "focal" - min-length: 100000 - + type: "proximity" + focus: "focal" + international_serial_sampling: - group-by: ["region", "year"] # lots of samples have no "month" so in order to include them, we'll only go by "year" - sequences-per-group: 2 - query: "(country != '{country}')" - min-length: 100000 - - + group_by: "region year" # VOODOO [copied Jess approach and comment] lots of samples have no "month" so in order to include them, we'll only go by "year" + seq_per_group: 2 + query: --query "(country != '{country}')" + + ######################## NON_CONTEXTUALIZED: group: - group-by: - - "year" - subsample-max-sequences: 1000 - query: "(location == '{location}') & (division == '{division}')" - min-length: 100000 + group_by: "year month" # VOODOO: old MPX did just year, not year+month + max_sequences: 2000 + query: --query "(location == '{location}') & (division == '{division}')" diff --git a/src/backend/aspen/workflows/nextstrain_run/export.py b/src/backend/aspen/workflows/nextstrain_run/export.py index 1e00c5456..ffc522d2c 100644 --- a/src/backend/aspen/workflows/nextstrain_run/export.py +++ b/src/backend/aspen/workflows/nextstrain_run/export.py @@ -153,6 +153,9 @@ def cli( # For local debugging of our yaml building process. +# Would be better to re-structure the main `export_run_config` process so yaml +# output happens earlier and we just exit early if --builds-file-only flag is +# on rather than having a separate code path for that flag being on. 
def dump_yaml_template( phylo_run_id: int, builds_file_fh: io.TextIOWrapper, From b536f2d1182b4783ec70c760ac5acf52a9828424 Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Fri, 30 Aug 2024 16:54:16 -0700 Subject: [PATCH 03/13] WIP: Address notes from talking with Dan --- .../builds_templates/MPX_template.yaml | 26 +++++++------------ 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml b/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml index cbf0fe02b..cdcbcc914 100644 --- a/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml +++ b/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml @@ -1,5 +1,5 @@ builds: - aspen_mpx: + aspen: region: global country: {country} division: {division} @@ -9,7 +9,7 @@ builds: # make sure build_name matches the sub-key of `builds` above. # Not sure how to consolidate it, but not going to try to figure it out now. -build_name: "aspen_mpx" +build_name: "aspen" auspice_name: "monkeypox_mpxv" # VOODOO Unclear if this gets used or will be overwritten reference: "defaults/reference.fasta" @@ -18,8 +18,8 @@ genbank_reference: "defaults/reference.gb" include: "data/include.txt" # add this to our template clades: "defaults/clades.tsv" lat_longs: "defaults/lat_longs.tsv" -auspice_config: "defaults/hmpxv1/auspice_config.json" # replace this with our own -description: "defaults/description.md" # replace this with our own +auspice_config: "defaults/hmpxv1/auspice_config.json" # VOODOO port over Jess' version of auspice_config.json +description: "defaults/description.md" # VOODOO Port over Jess' current description, Dan will follow up with more edits tree_mask: "defaults/tree_mask.tsv" strain_id_field: "accession" @@ -37,7 +37,7 @@ seed_spacing: 1000 fix_tree: true ## refine -timetree: true +timetree: false root: "min_dev" clock_rate: 3e-6 clock_std_dev: 6e-6 @@ -56,19 +56,17 @@ mask: maskfile: "defaults/mask_overview.bed" priorities: - crowding_penalty: 0 + crowding_penalty: 0 # Gets set by treetype during `export.py` ## Subsampling schemas subsampling: OVERVIEW: group: - group_by: "year month" max_sequences: 2000 query: --query "(location == '{location}') & (division == '{division}')" state: - group_by: "location year month" max_sequences: 500 query: --query "(location != '{location}') & (division == '{division}')" # exclude add'l samples from {location} priorities: @@ -76,7 +74,6 @@ subsampling: focus: "group" country: - group_by: "division year month" max_sequences: 400 query: --query "(division != '{division}') & (country == '{country}')" # exclude add'l samples from CA priorities: @@ -84,7 +81,6 @@ subsampling: focus: "group" international: - group_by: "region year" # VOODOO [copied Jess approach and comment] lots of samples have no "month" so in order to include them, we'll only go by "year" max_sequences: 100 query: --query "(country != '{country}')" # exclude add'l samples from USA priorities: @@ -92,7 +88,7 @@ subsampling: focus: "group" international_serial_sampling: - group_by: "region year" # VOODOO [copied Jess approach and comment] lots of samples have no "month" so in order to include them, we'll only go by "year" + group_by: "region year" # lots of samples have no "month" so in order to include them, we'll only go by "year" seq_per_group: 2 query: --query "(country != '{country}')" @@ -109,7 +105,6 @@ subsampling: focus: "focal" group: - group_by: "year month" max_sequences: 25 
query: --query "(location == '{location}') & (division == '{division}')" priorities: @@ -117,7 +112,6 @@ subsampling: focus: "focal" state: - group_by: "location year month" max_sequences: 25 query: --query "(location != '{location}') & (division == '{division}')" # exclude add'l samples from {location} priorities: @@ -125,7 +119,6 @@ subsampling: focus: "focal" country: - group_by: "division year month" max_sequences: 25 query: --query "(division != '{division}') & (country == '{country}')" # exclude add'l samples from CA priorities: @@ -133,7 +126,6 @@ subsampling: focus: "focal" international: - group_by: "region year" # VOODOO [copied Jess approach and comment] lots of samples have no "month" so in order to include them, we'll only go by "year" max_sequences: 25 query: --query "(country != '{country}')" # VOODOO: huh? Original comment: "exclude add'l samples from USA" priorities: @@ -141,7 +133,7 @@ subsampling: focus: "focal" international_serial_sampling: - group_by: "region year" # VOODOO [copied Jess approach and comment] lots of samples have no "month" so in order to include them, we'll only go by "year" + group_by: "region year" # lots of samples have no "month" so in order to include them, we'll only go by "year" seq_per_group: 2 query: --query "(country != '{country}')" @@ -149,6 +141,6 @@ subsampling: NON_CONTEXTUALIZED: group: - group_by: "year month" # VOODOO: old MPX did just year, not year+month + group_by: "year" max_sequences: 2000 query: --query "(location == '{location}') & (division == '{division}')" From 23c499484d8f7ac5c48cc48f3e3fcc33bd3dd151 Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Fri, 30 Aug 2024 16:55:40 -0700 Subject: [PATCH 04/13] WIP: Adjust `max_sequences` to match old MPX template --- .../nextstrain_run/builds_templates/MPX_template.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml b/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml index cdcbcc914..8da8d7cfe 100644 --- a/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml +++ b/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml @@ -63,25 +63,25 @@ subsampling: OVERVIEW: group: - max_sequences: 2000 + max_sequences: 500 query: --query "(location == '{location}') & (division == '{division}')" state: - max_sequences: 500 + max_sequences: 300 query: --query "(location != '{location}') & (division == '{division}')" # exclude add'l samples from {location} priorities: type: "proximity" focus: "group" country: - max_sequences: 400 + max_sequences: 300 query: --query "(division != '{division}') & (country == '{country}')" # exclude add'l samples from CA priorities: type: "proximity" focus: "group" international: - max_sequences: 100 + max_sequences: 300 query: --query "(country != '{country}')" # exclude add'l samples from USA priorities: type: "proximity" @@ -142,5 +142,5 @@ subsampling: NON_CONTEXTUALIZED: group: group_by: "year" - max_sequences: 2000 + max_sequences: 1000 query: --query "(location == '{location}') & (division == '{division}')" From 82cdca1bae178779afe7119bb5f2f26c162c6245 Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Tue, 3 Sep 2024 16:03:54 -0700 Subject: [PATCH 05/13] WIP: Keep `builds` in MPX output template --- .../build_plugins/pathogen_plugins.py | 15 +++------------ .../builds_templates/MPX_template.yaml | 4 ++-- 2 files changed, 5 insertions(+), 14 deletions(-) diff 
--git a/src/backend/aspen/workflows/nextstrain_run/build_plugins/pathogen_plugins.py b/src/backend/aspen/workflows/nextstrain_run/build_plugins/pathogen_plugins.py index 23f8b0fd2..f42e9b6b7 100644 --- a/src/backend/aspen/workflows/nextstrain_run/build_plugins/pathogen_plugins.py +++ b/src/backend/aspen/workflows/nextstrain_run/build_plugins/pathogen_plugins.py @@ -14,14 +14,9 @@ def update_config(self, config): class MPXPlugin(PathogenPlugin): def update_config(self, config): - build_config = {} - try: - build_config = config["builds"]["aspen"] - config["subsampling_scheme"] = build_config["subsampling_scheme"] - del config["builds"] - except KeyError: - pass - subsampling_scheme = config["subsampling_scheme"] + build_config = config["builds"]["aspen"] + subsampling_scheme = build_config["subsampling_scheme"] + # Create escaped single quotes for interpolation into `--query` sections. escaped_config = {} for k, v in build_config.items(): if type(v) == str: @@ -31,7 +26,3 @@ def update_config(self, config): for _, sample in config["subsampling"][subsampling_scheme].items(): if sample.get("query"): sample["query"] = sample["query"].format(**escaped_config) - if sample.get("max_sequences"): - sample["subsample-max-sequences"] = sample["max_sequences"] - del sample["max_sequences"] - config["subsampling"] = config["subsampling"][subsampling_scheme] diff --git a/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml b/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml index 8da8d7cfe..57fab73bc 100644 --- a/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml +++ b/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml @@ -5,12 +5,12 @@ builds: division: {division} location: {location} subsampling_scheme: {tree_type} - title: CZ Gen Epi MPX Tree # VOODOO verify gets replaced with a more specific title in builder_base.py + title: CZ Gen Epi MPX Tree # make sure build_name matches the sub-key of `builds` above. # Not sure how to consolidate it, but not going to try to figure it out now. 
build_name: "aspen" -auspice_name: "monkeypox_mpxv" # VOODOO Unclear if this gets used or will be overwritten +auspice_name: "monkeypox_mpxv" reference: "defaults/reference.fasta" genome_annotation: "defaults/genome_annotation.gff3" From b2c5b9ef102db04f478712bc84979a5179454189 Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Wed, 4 Sep 2024 15:12:28 -0700 Subject: [PATCH 06/13] Remove no longer needed comments --- .../nextstrain_run/builds_templates/MPX_template.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml b/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml index 57fab73bc..9f134dc25 100644 --- a/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml +++ b/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml @@ -18,8 +18,8 @@ genbank_reference: "defaults/reference.gb" include: "data/include.txt" # add this to our template clades: "defaults/clades.tsv" lat_longs: "defaults/lat_longs.tsv" -auspice_config: "defaults/hmpxv1/auspice_config.json" # VOODOO port over Jess' version of auspice_config.json -description: "defaults/description.md" # VOODOO Port over Jess' current description, Dan will follow up with more edits +auspice_config: "defaults/legacy_auspice_config_mpxv.json" +description: "defaults/description.md" tree_mask: "defaults/tree_mask.tsv" strain_id_field: "accession" @@ -127,7 +127,7 @@ subsampling: international: max_sequences: 25 - query: --query "(country != '{country}')" # VOODOO: huh? Original comment: "exclude add'l samples from USA" + query: --query "(country != '{country}')" # [Vince] huh? Original comment: "exclude add'l samples from USA" priorities: type: "proximity" focus: "focal" From 7ca2fafc8217c998d0bbd19e9b4ab21d85292f9e Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Wed, 4 Sep 2024 15:27:43 -0700 Subject: [PATCH 07/13] Convert to being compatible with new mpox build --- src/backend/Dockerfile.nextstrain | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/backend/Dockerfile.nextstrain b/src/backend/Dockerfile.nextstrain index d0f0e136b..cc4750f1b 100644 --- a/src/backend/Dockerfile.nextstrain +++ b/src/backend/Dockerfile.nextstrain @@ -56,13 +56,14 @@ RUN mkdir /ncov && \ # Add support for our custom mpox workflow # TODO [Vincent & Dan; Aug 2024]: Update the `mpox` workflow content. +# TODO Convert to a specifc commit rather than overall subsample_by_distance branch RUN mkdir /mpox && \ cd /mpox && \ git init && \ - git remote add origin https://github.com/chanzuckerberg/monkeypox.git && \ - git fetch origin subsampling && \ - git reset --hard fd74f4b5f219035c9cbb7909b6f84f8a06fda76d -RUN chown nextstrain:nextstrain /mpox/config/exclude_accessions_mpxv.txt /mpox/config/clades.tsv + git remote add origin https://github.com/chanzuckerberg/mpox.git && \ + git fetch origin subsample_by_distance && \ + git reset --hard origin/subsample_by_distance +RUN chown nextstrain:nextstrain /mpox/phylogenetic/defaults/exclude_accessions.txt /mpox/phylogenetic/defaults/clades.tsv RUN mkdir -p /ncov/auspice RUN mkdir -p /ncov/logs @@ -83,7 +84,8 @@ COPY . . # Install the aspen package RUN poetry install RUN chmod a+w /ncov/auspice /ncov/logs -RUN chmod a+w /mpox/ /mpox/config +# [Vince] I'm not totally sure we need all three of these, but let's start with them. 
+RUN chmod a+w /mpox/ /mpox/phylogenetic /mpox/phylogenetic/defaults # TODO - Mismatch between poetry and augur deps forces us to manually install jsonschema v3 here RUN pip install jsonschema==3.* From 5ddc542df652e91edefc56237d1d83c5af6506f9 Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Wed, 4 Sep 2024 16:31:04 -0700 Subject: [PATCH 08/13] Modify paths to use latest mpox workflow format --- .../nextstrain_run/run_nextstrain_mpx.sh | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_mpx.sh b/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_mpx.sh index 5b1a0e0e3..e77ff685f 100755 --- a/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_mpx.sh +++ b/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_mpx.sh @@ -33,10 +33,10 @@ set -x # Download the latest mpox exclusions and clades list. This happens at RUN time, not BUILD time so that # we are always building trees with the latest upstream filters. -wget https://raw.githubusercontent.com/nextstrain/mpox/master/phylogenetic/defaults/exclude_accessions.txt -O /mpox/config/exclude_accessions_mpxv.txt -wget https://raw.githubusercontent.com/nextstrain/mpox/master/phylogenetic/defaults/clades.tsv -O /mpox/config/clades.tsv +wget https://raw.githubusercontent.com/nextstrain/mpox/master/phylogenetic/defaults/exclude_accessions.txt -O /mpox/phylogenetic/defaults/exclude_accessions.txt +wget https://raw.githubusercontent.com/nextstrain/mpox/master/phylogenetic/defaults/clades.tsv -O /mpox/phylogenetic/defaults/clades.tsv -mkdir -p /mpox/data +mkdir -p /mpox/phylogenetic/data key_prefix="phylo_run/${S3_FILESTEM}/${WORKFLOW_ID}" s3_prefix="s3://${aspen_s3_db_bucket}/${key_prefix}" @@ -52,12 +52,12 @@ mpox_git_rev=$(cd /mpox && git rev-parse HEAD) aligned_upstream_location=$( python3 /usr/src/app/aspen/workflows/nextstrain_run/export.py \ --phylo-run-id "${WORKFLOW_ID}" \ - --sequences /mpox/data/sequences_czge.fasta \ - --metadata /mpox/data/metadata_czge.tsv \ - --selected /mpox/data/include.txt \ + --sequences /mpox/phylogenetic/data/sequences_czge.fasta \ + --metadata /mpox/phylogenetic/data/metadata_czge.tsv \ + --selected /mpox/phylogenetic/data/include.txt \ --sequence-type aligned \ --resolved-template-args "${RESOLVED_TEMPLATE_ARGS_SAVEFILE}" \ - --builds-file /mpox/config/build_czge.yaml \ + --builds-file /mpox/phylogenetic/build_czge.yaml \ --reset-status ) @@ -66,33 +66,33 @@ aligned_upstream_sequences_s3_key=$(echo "${aligned_upstream_location}" | jq -r aligned_upstream_metadata_s3_key=$(echo "${aligned_upstream_location}" | jq -r .metadata_key) # fetch the upstream dataset -if [ ! -e /mpox/data/upstream_sequences.fasta ]; then - $aws s3 cp --no-progress "s3://${aligned_upstream_s3_bucket}/${aligned_upstream_sequences_s3_key}" /mpox/data/upstream_sequences.fasta.xz - unxz /mpox/data/*.xz +if [ ! -e /mpox/phylogenetic/data/upstream_sequences.fasta ]; then + $aws s3 cp --no-progress "s3://${aligned_upstream_s3_bucket}/${aligned_upstream_sequences_s3_key}" /mpox/phylogenetic/data/upstream_sequences.fasta.xz + unxz /mpox/phylogenetic/data/*.xz fi -if [ ! -e /mpox/data/upstream_metadata.tsv ]; then - $aws s3 cp --no-progress "s3://${aligned_upstream_s3_bucket}/${aligned_upstream_metadata_s3_key}" /mpox/data/upstream_metadata.tsv.xz - unxz /mpox/data/*.xz +if [ ! 
-e /mpox/phylogenetic/data/upstream_metadata.tsv ]; then + $aws s3 cp --no-progress "s3://${aligned_upstream_s3_bucket}/${aligned_upstream_metadata_s3_key}" /mpox/phylogenetic/data/upstream_metadata.tsv.xz + unxz /mpox/phylogenetic/data/*.xz fi # If we've written out any samples, add them to the upstream metadata/fasta files -if [ -e /mpox/data/sequences_czge.fasta ]; then - python3 /usr/src/app/aspen/workflows/nextstrain_run/merge_mpx.py --required-metadata /mpox/data/metadata_czge.tsv --required-sequences /mpox/data/sequences_czge.fasta --upstream-metadata /mpox/data/upstream_metadata.tsv --upstream-sequences /mpox/data/upstream_sequences.fasta --destination-metadata /mpox/data/metadata.tsv --destination-sequences /mpox/data/sequences.fasta --required-match-column strain --upstream-match-column accession +if [ -e /mpox/phylogenetic/data/sequences_czge.fasta ]; then + python3 /usr/src/app/aspen/workflows/nextstrain_run/merge_mpx.py --required-metadata /mpox/phylogenetic/data/metadata_czge.tsv --required-sequences /mpox/phylogenetic/data/sequences_czge.fasta --upstream-metadata /mpox/phylogenetic/data/upstream_metadata.tsv --upstream-sequences /mpox/phylogenetic/data/upstream_sequences.fasta --destination-metadata /mpox/phylogenetic/data/metadata.tsv --destination-sequences /mpox/phylogenetic/data/sequences.fasta --required-match-column strain --upstream-match-column accession else - cp /mpox/data/upstream_metadata.tsv /mpox/data/metadata.tsv - cp /mpox/data/upstream_sequences.fasta /mpox/data/sequences.fasta + cp /mpox/phylogenetic/data/upstream_metadata.tsv /mpox/phylogenetic/data/metadata.tsv + cp /mpox/phylogenetic/data/upstream_sequences.fasta /mpox/phylogenetic/data/sequences.fasta fi; # Persist the build config we generated. -$aws s3 cp /mpox/config/build_czge.yaml "${s3_prefix}/build_czge.yaml" -$aws s3 cp /mpox/data/include.txt "${s3_prefix}/include.txt" +$aws s3 cp /mpox/phylogenetic/build_czge.yaml "${s3_prefix}/build_czge.yaml" +$aws s3 cp /mpox/phylogenetic/data/include.txt "${s3_prefix}/include.txt" # run snakemake, if run fails export the logs from snakemake to s3 -(cd /mpox && snakemake --printshellcmds --configfile config/build_czge.yaml --resources=mem_mb=312320) || { $aws s3 cp /mpox/.snakemake/log/ "${s3_prefix}/logs/snakemake/" --recursive ; $aws s3 cp /mpox/results/mpxv/filter.log "${s3_prefix}/logs/mpox/" --recursive ; } +(cd /mpox/phylogenetic && snakemake --printshellcmds --configfile build_czge.yaml --resources=mem_mb=312320) || { $aws s3 cp /mpox/phylogenetic/.snakemake/log/ "${s3_prefix}/logs/snakemake/" --recursive ; $aws s3 cp /mpox/phylogenetic/results/mpxv/filter.log "${s3_prefix}/logs/mpox/" --recursive ; } # upload the tree to S3. 
The variable key is created to use later key="${key_prefix}/mpx_czge.json" -$aws s3 cp /mpox/auspice/monkeypox_mpxv.json "s3://${aspen_s3_db_bucket}/${key}" +$aws s3 cp /mpox/phylogenetic/auspice/monkeypox_mpxv.json "s3://${aspen_s3_db_bucket}/${key}" # update aspen aspen_workflow_rev=WHATEVER @@ -111,4 +111,4 @@ python3 /usr/src/app/aspen/workflows/nextstrain_run/save.py \ --bucket "${aspen_s3_db_bucket}" \ --key "${key}" \ --resolved-template-args "${RESOLVED_TEMPLATE_ARGS_SAVEFILE}" \ - --tree-path /mpox/auspice/monkeypox_mpxv.json \ + --tree-path /mpox/phylogenetic/auspice/monkeypox_mpxv.json \ From ac6bab2e2f01e984d7266fd34b33e04744714ddd Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Wed, 4 Sep 2024 17:16:40 -0700 Subject: [PATCH 09/13] Lint roller --- src/backend/aspen/workflows/nextstrain_run/export.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/backend/aspen/workflows/nextstrain_run/export.py b/src/backend/aspen/workflows/nextstrain_run/export.py index ffc522d2c..0140ce655 100644 --- a/src/backend/aspen/workflows/nextstrain_run/export.py +++ b/src/backend/aspen/workflows/nextstrain_run/export.py @@ -179,7 +179,11 @@ def dump_yaml_template( session, phylo_run.pathogen, phylo_run.template_args, group ) builder: TemplateBuilder = TemplateBuilder( - phylo_run.tree_type, phylo_run.pathogen, phylo_run.group, resolved_template_args, **context + phylo_run.tree_type, + phylo_run.pathogen, + phylo_run.group, + resolved_template_args, + **context, ) builder.write_file(builds_file_fh) From 5e9eb058d6684d372eebb33abd16fa903797d09d Mon Sep 17 00:00:00 2001 From: Dan Lu <20667188+danrlu@users.noreply.github.com> Date: Thu, 5 Sep 2024 10:36:14 -0500 Subject: [PATCH 10/13] Update run_nextstrain_mpx.sh --- .../aspen/workflows/nextstrain_run/run_nextstrain_mpx.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_mpx.sh b/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_mpx.sh index e77ff685f..abe63ca37 100755 --- a/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_mpx.sh +++ b/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_mpx.sh @@ -88,7 +88,7 @@ $aws s3 cp /mpox/phylogenetic/build_czge.yaml "${s3_prefix}/build_czge.yaml" $aws s3 cp /mpox/phylogenetic/data/include.txt "${s3_prefix}/include.txt" # run snakemake, if run fails export the logs from snakemake to s3 -(cd /mpox/phylogenetic && snakemake --printshellcmds --configfile build_czge.yaml --resources=mem_mb=312320) || { $aws s3 cp /mpox/phylogenetic/.snakemake/log/ "${s3_prefix}/logs/snakemake/" --recursive ; $aws s3 cp /mpox/phylogenetic/results/mpxv/filter.log "${s3_prefix}/logs/mpox/" --recursive ; } +(cd /mpox/phylogenetic && snakemake --printshellcmds --configfile build_czge.yaml --resources=mem_mb=312320) || { $aws s3 cp /mpox/phylogenetic/.snakemake/log/ "${s3_prefix}/logs/snakemake/" --recursive ; $aws s3 cp /mpox/phylogenetic/results/aspen/logs/ "${s3_prefix}/logs/mpox/" --recursive ; } # upload the tree to S3. 
The variable key is created to use later key="${key_prefix}/mpx_czge.json" From 3c09812bd576932a55744a3e7c9a082ac577254d Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Thu, 5 Sep 2024 12:13:57 -0700 Subject: [PATCH 11/13] Fix test after mpox config format changes --- .../nextstrain_run/tests/test_mpx_export.py | 98 +++++++++---------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/src/backend/aspen/workflows/nextstrain_run/tests/test_mpx_export.py b/src/backend/aspen/workflows/nextstrain_run/tests/test_mpx_export.py index 6932178e2..3be3caebf 100644 --- a/src/backend/aspen/workflows/nextstrain_run/tests/test_mpx_export.py +++ b/src/backend/aspen/workflows/nextstrain_run/tests/test_mpx_export.py @@ -143,13 +143,13 @@ def test_overview_config_no_filters(mocker, session, postgres_database, split_cl phylo_run, location = create_test_data(session, split_client, tree_type, 10, 0, 0) sequences, selected, metadata, nextstrain_config = generate_run(phylo_run.id) - subsampling_scheme = nextstrain_config["subsampling"] + subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] # Just some placeholder sanity-checks - assert subsampling_scheme["group"]["subsample-max-sequences"] == 500 + assert subsampling_scheme["group"]["max_sequences"] == 500 assert ( subsampling_scheme["group"]["query"] - == f"(location == '{location.location}') & (division == '{location.division}')" + == f'''--query "(location == '{location.location}') & (division == '{location.division}')"''' ) assert "min-date" not in subsampling_scheme["group"] assert "max-date" not in subsampling_scheme["group"] @@ -168,27 +168,25 @@ def test_overview_config_ondemand(mocker, session, postgres_database, split_clie query = { "filter_start_date": "2021-04-30", "filter_end_date": "10 days ago", - "filter_pango_lineages": ["AY", "B.1.116"], } phylo_run, location = create_test_data( session, split_client, tree_type, 10, 5, 5, template_args=query ) sequences, selected, metadata, nextstrain_config = generate_run(phylo_run.id) - subsampling_scheme = nextstrain_config["subsampling"] + subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] max_date = dateparser.parse("10 days ago").strftime("%Y-%m-%d") # Order does not matter for lineages, just verify matched sets. 
- assert subsampling_scheme["group"]["min-date"] == "2021-04-30" - assert subsampling_scheme["group"]["max-date"] == f"{max_date}" + assert subsampling_scheme["group"]["min_date"] == "--min-date 2021-04-30" + assert subsampling_scheme["group"]["max_date"] == f"--max-date {max_date}" assert ( - subsampling_scheme["international_serial_sampling"]["max-date"] == f"{max_date}" + subsampling_scheme["international_serial_sampling"]["max_date"] == f"--max-date {max_date}" ) - assert subsampling_scheme["group"]["subsample-max-sequences"] == 500 - filter_pango_lineages = "['" + "', '".join(query["filter_pango_lineages"]) + "']" + assert subsampling_scheme["group"]["max_sequences"] == 500 assert ( subsampling_scheme["group"]["query"] - == f"(location == '{location.location}') & (division == '{location.division}') & (lineage in {filter_pango_lineages})" + == f'''--query "(location == '{location.location}') & (division == '{location.division}')"''' ) assert len(selected.splitlines()) == 10 # 5 gisaid samples + 5 selected samples assert len(metadata.splitlines()) == 11 # 10 samples + 1 header line @@ -219,14 +217,14 @@ def test_overview_config_chicago(mocker, session, postgres_database, split_clien ) sequences, selected, metadata, nextstrain_config = generate_run(phylo_run.id) - subsampling_scheme = nextstrain_config["subsampling"] + subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] # Make sure our query got updated properly assert ( subsampling_scheme["group"]["query"] - == f"((location == '{location.location}') & (division == '{location.division}')) | submitting_lab == 'RIPHL at Rush University Medical Center'" + == f'''--query "((location == '{location.location}') & (division == '{location.division}')) | submitting_lab == 'RIPHL at Rush University Medical Center'"''' ) - assert subsampling_scheme["group"]["subsample-max-sequences"] == 500 + assert subsampling_scheme["group"]["max_sequences"] == 500 assert len(selected.splitlines()) == 0 # No selected sequences assert len(metadata.splitlines()) == 11 # 10 samples + 1 header line assert len(sequences.splitlines()) == 20 # 10 county samples, @2 lines each @@ -240,10 +238,10 @@ def test_non_contextualized_config(mocker, session, postgres_database, split_cli phylo_run, location = create_test_data(session, split_client, tree_type, 10, 5, 5) sequences, selected, metadata, nextstrain_config = generate_run(phylo_run.id) - subsampling_scheme = nextstrain_config["subsampling"] + subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] # Just some placeholder sanity-checks - assert subsampling_scheme["group"]["subsample-max-sequences"] == 1000 + assert subsampling_scheme["group"]["max_sequences"] == 1000 assert len(selected.splitlines()) == 10 # 5 gisaid samples + 5 selected samples assert len(metadata.splitlines()) == 11 # 10 samples + 1 header line assert len(sequences.splitlines()) == 20 # 10 county samples, @2 lines each @@ -281,21 +279,21 @@ def test_non_contextualized_regions(mocker, session, postgres_database, split_cl }.items(): sequences, selected, metadata, nextstrain_config = generate_run(run.id) - subsampling_scheme = nextstrain_config["subsampling"] + subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] if run_type == "state": assert ( subsampling_scheme["group"]["query"] - == f"(division == '{state_location.division}') & (country == '{state_location.country}')" + == f'''--query "(division == '{state_location.division}') & (country == '{state_location.country}')"''' ) else: assert ( 
subsampling_scheme["group"]["query"] - == f"(country == '{country_location.country}')" + == f'''--query "(country == '{country_location.country}')"''' ) # Just some placeholder sanity-checks - assert subsampling_scheme["group"]["subsample-max-sequences"] == 1000 + assert subsampling_scheme["group"]["max_sequences"] == 1000 assert len(selected.splitlines()) == 10 # 5 gisaid samples + 5 selected samples assert len(metadata.splitlines()) == 11 # 10 samples + 1 header line assert len(sequences.splitlines()) == 20 # 10 county samples, @2 lines each @@ -309,14 +307,14 @@ def test_targeted_config_simple(mocker, session, postgres_database, split_client phylo_run, location = create_test_data(session, split_client, tree_type, 10, 5, 5) sequences, selected, metadata, nextstrain_config = generate_run(phylo_run.id) - subsampling_scheme = nextstrain_config["subsampling"] + subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] # Just some placeholder sanity-checks - assert subsampling_scheme["closest"]["subsample-max-sequences"] == 100 - assert subsampling_scheme["group"]["subsample-max-sequences"] == 50 - assert subsampling_scheme["state"]["subsample-max-sequences"] == 50 - assert subsampling_scheme["country"]["subsample-max-sequences"] == 25 - assert subsampling_scheme["international"]["subsample-max-sequences"] == 25 + assert subsampling_scheme["closest"]["max_sequences"] == 100 + assert subsampling_scheme["group"]["max_sequences"] == 50 + assert subsampling_scheme["state"]["max_sequences"] == 50 + assert subsampling_scheme["country"]["max_sequences"] == 25 + assert subsampling_scheme["international"]["max_sequences"] == 25 assert len(selected.splitlines()) == 10 # 5 gisaid samples + 5 selected samples assert len(metadata.splitlines()) == 11 # 10 samples + 1 header line assert len(sequences.splitlines()) == 20 # 10 county samples, @2 lines each @@ -354,26 +352,26 @@ def test_targeted_config_regions(mocker, session, postgres_database, split_clien }.items(): sequences, selected, metadata, nextstrain_config = generate_run(run.id) - subsampling_scheme = nextstrain_config["subsampling"] + subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] if run_type == "state": assert "state" not in subsampling_scheme.keys() assert ( subsampling_scheme["group"]["query"] - == f"(division == '{state_location.division}') & (country == '{state_location.country}')" + == f'''--query "(division == '{state_location.division}') & (country == '{state_location.country}')"''' ) else: assert "state" not in subsampling_scheme.keys() assert "country" not in subsampling_scheme.keys() assert ( subsampling_scheme["group"]["query"] - == f"(country == '{country_location.country}')" + == f'''--query "(country == '{country_location.country}')"''' ) # Just some placeholder sanity-checks - assert subsampling_scheme["closest"]["subsample-max-sequences"] == 100 - assert subsampling_scheme["group"]["subsample-max-sequences"] == 50 - assert subsampling_scheme["international"]["subsample-max-sequences"] == 100 + assert subsampling_scheme["closest"]["max_sequences"] == 100 + assert subsampling_scheme["group"]["max_sequences"] == 50 + assert subsampling_scheme["international"]["max_sequences"] == 100 assert len(selected.splitlines()) == 10 # 5 gisaid samples + 5 selected samples assert len(metadata.splitlines()) == 11 # 10 samples + 1 header line assert len(sequences.splitlines()) == 20 # 10 county samples, @2 lines each @@ -416,14 +414,14 @@ def test_targeted_config_large(mocker, session, postgres_database, 
split_client) ) sequences, selected, metadata, nextstrain_config = generate_run(phylo_run.id) - subsampling_scheme = nextstrain_config["subsampling"] + subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] # Just some placeholder sanity-checks - assert subsampling_scheme["closest"]["subsample-max-sequences"] == 120 - assert subsampling_scheme["group"]["subsample-max-sequences"] == 60 - assert subsampling_scheme["state"]["subsample-max-sequences"] == 60 - assert subsampling_scheme["country"]["subsample-max-sequences"] == 30 - assert subsampling_scheme["international"]["subsample-max-sequences"] == 30 + assert subsampling_scheme["closest"]["max_sequences"] == 120 + assert subsampling_scheme["group"]["max_sequences"] == 60 + assert subsampling_scheme["state"]["max_sequences"] == 60 + assert subsampling_scheme["country"]["max_sequences"] == 30 + assert subsampling_scheme["international"]["max_sequences"] == 30 assert len(selected.splitlines()) == 120 # 10 gisaid samples + 110 selected samples assert len(metadata.splitlines()) == 201 # 200 samples + 1 header line assert len(sequences.splitlines()) == 400 # 200 county samples, @2 lines each @@ -469,15 +467,15 @@ def test_overview_config_division(mocker, session, postgres_database, split_clie group_location="", ) sequences, selected, metadata, nextstrain_config = generate_run(phylo_run.id) - subsampling_scheme = nextstrain_config["subsampling"] + subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] # Make sure our query got updated properly - assert subsampling_scheme["country"]["subsample-max-sequences"] == 800 - assert subsampling_scheme["international"]["subsample-max-sequences"] == 200 + assert subsampling_scheme["country"]["max_sequences"] == 800 + assert subsampling_scheme["international"]["max_sequences"] == 200 assert "state" not in subsampling_scheme.keys() assert ( subsampling_scheme["group"]["query"] - == f"(division == '{location.division}') & (country == '{location.country}')" + == f'''--query "(division == '{location.division}') & (country == '{location.country}')"''' ) @@ -498,14 +496,16 @@ def test_overview_config_country(mocker, session, postgres_database, split_clien group_division="", ) sequences, selected, metadata, nextstrain_config = generate_run(phylo_run.id) - subsampling_scheme = nextstrain_config["subsampling"] + subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] # Make sure our query got updated properly assert "state" not in subsampling_scheme.keys() assert "country" not in subsampling_scheme.keys() - assert subsampling_scheme["international"]["subsample-max-sequences"] == 1000 - assert subsampling_scheme["group"]["query"] == f"(country == '{location.country}')" - + assert subsampling_scheme["international"]["max_sequences"] == 1000 + assert ( + subsampling_scheme["group"]["query"] + == f'''--query "(country == '{location.country}')"''' + ) # make sure we handle quotes sanely!!! 
def test_string_escapes(mocker, session, postgres_database, split_client): @@ -524,14 +524,14 @@ def test_string_escapes(mocker, session, postgres_database, split_client): group_division="A'Zaz", ) sequences, selected, metadata, nextstrain_config = generate_run(run.id) - subsampling_scheme = nextstrain_config["subsampling"] + subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] assert ( subsampling_scheme["group"]["query"] - == "(location == 'Cote d\\'Ivoire') & (division == 'A\\'Zaz')" + == '''--query "(location == 'Cote d\\'Ivoire') & (division == 'A\\'Zaz')"''' ) # Just some placeholder sanity-checks - assert subsampling_scheme["group"]["subsample-max-sequences"] == 1000 + assert subsampling_scheme["group"]["max_sequences"] == 1000 assert len(selected.splitlines()) == 10 # 5 gisaid samples + 5 selected samples assert len(metadata.splitlines()) == 11 # 10 samples + 1 header line assert len(sequences.splitlines()) == 20 # 10 county samples, @2 lines each From 466515a0af6b46f40661d67b05a02725b1f0e5c6 Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Thu, 5 Sep 2024 13:15:57 -0700 Subject: [PATCH 12/13] Remove arguments in filter differentiation --- .../build_plugins/type_plugins.py | 51 +++++++++---------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/src/backend/aspen/workflows/nextstrain_run/build_plugins/type_plugins.py b/src/backend/aspen/workflows/nextstrain_run/build_plugins/type_plugins.py index 636bfab0b..df8fe2005 100644 --- a/src/backend/aspen/workflows/nextstrain_run/build_plugins/type_plugins.py +++ b/src/backend/aspen/workflows/nextstrain_run/build_plugins/type_plugins.py @@ -271,43 +271,40 @@ def update_subsampling_for_division(subsampling): def apply_filters(config, subsampling, template_args): - # MPX format - include_arguments_in_filters = False - lineage_field = "lineage" - if "--query" in subsampling["group"]["query"]: - # SC2 format - include_arguments_in_filters = True - lineage_field = "pango_lineage" - + """NOTE: The filters do NOT currently support filtering MPX by lineage. + + It probably would not be a big lift to change the config builder to support + lineage filtering on mpox, but it's unclear if that should happen here in the + tree type plugins where we handle the rest of the filters, or somehow get + shoehorned into the pathogen plugins instead. + Regardless, the filters currently only support "pango_lineage" for SC2 lineage + filtering. There is currently no FE or BE code to provide lineage filter support + for mpox, and that's where most of the eng effort will be if we want to release + mpox lineage filtering in the future. Once the user has a way to pass those params + to filter mpox trees using lineage though, it will still be necessary to change + the config building process here so lineage filter is correctly handled for mpox + and integrates with the downstream snakemake workflow that builds the tree.""" min_date = template_args.get("filter_start_date") if min_date: # Support date expressions like "5 days ago" in our cron schedule. 
min_date = dateparser.parse(min_date).strftime("%Y-%m-%d") - if include_arguments_in_filters: - subsampling["group"][ - "min_date" - ] = f"--min-date {min_date}" # ex: --max-date 2020-01-01 - else: - subsampling["group"]["min-date"] = str(min_date) # ex: max-date: 2020-01-01 + subsampling["group"][ + "min_date" + ] = f"--min-date {min_date}" # ex: --max-date 2020-01-01 max_date = template_args.get("filter_end_date") if max_date: # Support date expressions like "5 days ago" in our cron schedule. max_date = dateparser.parse(max_date).strftime("%Y-%m-%d") - if include_arguments_in_filters: - subsampling["group"][ + subsampling["group"][ + "max_date" + ] = f"--max-date {max_date}" # ex: --max-date 2020-01-01 + if "international_serial_sampling" in subsampling: + subsampling["international_serial_sampling"][ "max_date" ] = f"--max-date {max_date}" # ex: --max-date 2020-01-01 - if "international_serial_sampling" in subsampling: - subsampling["international_serial_sampling"][ - "max_date" - ] = f"--max-date {max_date}" # ex: --max-date 2020-01-01 - else: - subsampling["group"]["max-date"] = str(max_date) # ex: max-date: 2020-01-01 - if "international_serial_sampling" in subsampling: - subsampling["international_serial_sampling"]["max-date"] = str( - max_date - ) # ex: max-date: 2020-01-01 + # Only SC2 supports lineage filtering right now. See above note for details. + LINEAGE_FIELD = "pango_lineage" pango_lineages = template_args.get("filter_pango_lineages") if pango_lineages: # Nextstrain is rather particular about the acceptable syntax for @@ -322,5 +319,5 @@ def apply_filters(config, subsampling, template_args): if old_query.endswith('"'): end_string = '"' old_query = old_query[:-1] - pango_query = " & (" + lineage_field + " in {pango_lineage})" + pango_query = " & (" + LINEAGE_FIELD + " in {pango_lineage})" subsampling["group"]["query"] = old_query + pango_query + end_string From 33e7e912bb8c8956535c211b1d1ba2640bda7d5c Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Thu, 5 Sep 2024 13:39:21 -0700 Subject: [PATCH 13/13] Lint roller. Again. --- .../aspen/workflows/nextstrain_run/tests/test_mpx_export.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/backend/aspen/workflows/nextstrain_run/tests/test_mpx_export.py b/src/backend/aspen/workflows/nextstrain_run/tests/test_mpx_export.py index 3be3caebf..24700123b 100644 --- a/src/backend/aspen/workflows/nextstrain_run/tests/test_mpx_export.py +++ b/src/backend/aspen/workflows/nextstrain_run/tests/test_mpx_export.py @@ -181,7 +181,8 @@ def test_overview_config_ondemand(mocker, session, postgres_database, split_clie assert subsampling_scheme["group"]["min_date"] == "--min-date 2021-04-30" assert subsampling_scheme["group"]["max_date"] == f"--max-date {max_date}" assert ( - subsampling_scheme["international_serial_sampling"]["max_date"] == f"--max-date {max_date}" + subsampling_scheme["international_serial_sampling"]["max_date"] + == f"--max-date {max_date}" ) assert subsampling_scheme["group"]["max_sequences"] == 500 assert ( @@ -507,6 +508,7 @@ def test_overview_config_country(mocker, session, postgres_database, split_clien == f'''--query "(country == '{location.country}')"''' ) + # make sure we handle quotes sanely!!! def test_string_escapes(mocker, session, postgres_database, split_client): mock_remote_db_uri(mocker, postgres_database.as_uri())
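---

For reference, a minimal sketch (not part of the patch series) of how the `--query` placeholders in the new MPX_template.yaml get interpolated by `MPXPlugin.update_config` after patch 05. The escaping helper itself is context the hunks don't show, so the backslash-escape below is an assumption inferred from the expected strings in `test_string_escapes`; the sample location/division values are hypothetical.

    # Sketch of the query interpolation that MPXPlugin.update_config performs
    # on the MPX build config. Assumes single quotes are backslash-escaped so
    # values like "Cote d'Ivoire" survive the pandas-style expression that
    # `augur filter --query` evaluates.

    def interpolate_queries(config: dict) -> dict:
        build_config = config["builds"]["aspen"]
        scheme = build_config["subsampling_scheme"]
        # Escape single quotes before substituting into the --query strings.
        escaped = {
            k: (v.replace("'", "\\'") if isinstance(v, str) else v)
            for k, v in build_config.items()
        }
        for sample in config["subsampling"][scheme].values():
            if sample.get("query"):
                sample["query"] = sample["query"].format(**escaped)
        return config

    # Hypothetical example values, mirroring the shape of the template above.
    config = {
        "builds": {
            "aspen": {
                "location": "Cote d'Ivoire",
                "division": "A'Zaz",
                "country": "Cote d'Ivoire",
                "subsampling_scheme": "NON_CONTEXTUALIZED",
            }
        },
        "subsampling": {
            "NON_CONTEXTUALIZED": {
                "group": {
                    "max_sequences": 1000,
                    "query": "--query \"(location == '{location}') & (division == '{division}')\"",
                }
            }
        },
    }

    print(interpolate_queries(config)["subsampling"]["NON_CONTEXTUALIZED"]["group"]["query"])
    # --query "(location == 'Cote d\'Ivoire') & (division == 'A\'Zaz')"

Unlike the pre-existing SC2 path, the whole `subsampling` mapping stays keyed by scheme after interpolation, which is why the updated tests index `nextstrain_config["subsampling"][tree_type.value]` rather than a flattened `subsampling` block.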