From 222b6b5baf30c6deca5504630eda0ec539fdf16a Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Fri, 30 Aug 2024 11:41:49 -0700 Subject: [PATCH 01/13] Fix yaml dump only to have expected args --- src/backend/aspen/workflows/nextstrain_run/export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/aspen/workflows/nextstrain_run/export.py b/src/backend/aspen/workflows/nextstrain_run/export.py index a6872c60f..1e00c5456 100644 --- a/src/backend/aspen/workflows/nextstrain_run/export.py +++ b/src/backend/aspen/workflows/nextstrain_run/export.py @@ -176,7 +176,7 @@ def dump_yaml_template( session, phylo_run.pathogen, phylo_run.template_args, group ) builder: TemplateBuilder = TemplateBuilder( - phylo_run.tree_type, phylo_run.group, resolved_template_args, **context + phylo_run.tree_type, phylo_run.pathogen, phylo_run.group, resolved_template_args, **context ) builder.write_file(builds_file_fh) From ffff0b889d0706694ec03d52e8d506170abffb76 Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Fri, 30 Aug 2024 14:11:56 -0700 Subject: [PATCH 02/13] WIP: Alter base MPX template for new mpox flow --- .../builds_templates/MPX_template.yaml | 152 ++++++++++-------- .../aspen/workflows/nextstrain_run/export.py | 3 + 2 files changed, 87 insertions(+), 68 deletions(-) diff --git a/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml b/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml index 97a3bda20..cbf0fe02b 100644 --- a/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml +++ b/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml @@ -1,21 +1,33 @@ -subsampling_scheme: "OVERVIEW" - -exclude: "config/exclude_accessions_mpxv.txt" -reference: "config/reference.fasta" -genemap: "config/genemap.gff" -genbank_reference: "config/reference.gb" -colors: "config/colors_mpxv.tsv" -lat_longs: "config/lat_longs.tsv" -auspice_config: "config/auspice_config_mpxv.json" -description: "config/description.md" -clades: "config/clades.tsv" -tree_mask: "config/tree_mask.tsv" +builds: + aspen_mpx: + region: global + country: {country} + division: {division} + location: {location} + subsampling_scheme: {tree_type} + title: CZ Gen Epi MPX Tree # VOODOO verify gets replaced with a more specific title in builder_base.py + +# make sure build_name matches the sub-key of `builds` above. +# Not sure how to consolidate it, but not going to try to figure it out now. 
+build_name: "aspen_mpx" +auspice_name: "monkeypox_mpxv" # VOODOO Unclear if this gets used or will be overwritten + +reference: "defaults/reference.fasta" +genome_annotation: "defaults/genome_annotation.gff3" +genbank_reference: "defaults/reference.gb" +include: "data/include.txt" # add this to our template +clades: "defaults/clades.tsv" +lat_longs: "defaults/lat_longs.tsv" +auspice_config: "defaults/hmpxv1/auspice_config.json" # replace this with our own +description: "defaults/description.md" # replace this with our own +tree_mask: "defaults/tree_mask.tsv" strain_id_field: "accession" -display_strain_field: "strain_original" +display_strain_field: "strain" -build_name: "mpxv" -auspice_name: "monkeypox_mpxv" +filter: + min_date: 1950 + min_length: 100000 ## align max_indel: 10000 @@ -25,10 +37,15 @@ seed_spacing: 1000 fix_tree: true ## refine -timetree: false +timetree: true root: "min_dev" clock_rate: 3e-6 clock_std_dev: 6e-6 +divergence_units: "mutations" + +traits: + columns: "" + sampling_bias_correction: 3 ## recency recency: true @@ -36,103 +53,102 @@ recency: true mask: from_beginning: 1350 from_end: 6422 - maskfile: "config/mask_overview.bed" + maskfile: "defaults/mask_overview.bed" + +priorities: + crowding_penalty: 0 ## Subsampling schemas subsampling: OVERVIEW: group: - subsample-max-sequences: 500 - query: "(location == '{location}') & (division == '{division}')" - min-length: 100000 - + group_by: "year month" + max_sequences: 2000 + query: --query "(location == '{location}') & (division == '{division}')" + state: - subsample-max-sequences: 300 - query: "(location != '{location}') & (division == '{division}')" # exclude add'l samples from {location} + group_by: "location year month" + max_sequences: 500 + query: --query "(location != '{location}') & (division == '{division}')" # exclude add'l samples from {location} priorities: type: "proximity" focus: "group" - min-length: 100000 country: - subsample-max-sequences: 300 - query: "(division != '{division}') & (country == '{country}')" # exclude add'l samples from CA + group_by: "division year month" + max_sequences: 400 + query: --query "(division != '{division}') & (country == '{country}')" # exclude add'l samples from CA priorities: type: "proximity" focus: "group" - min-length: 100000 - + international: - subsample-max-sequences: 300 - query: "(country != '{country}')" # this should capture samples that have no division or location info + group_by: "region year" # VOODOO [copied Jess approach and comment] lots of samples have no "month" so in order to include them, we'll only go by "year" + max_sequences: 100 + query: --query "(country != '{country}')" # exclude add'l samples from USA priorities: type: "proximity" focus: "group" - min-length: 100000 - + international_serial_sampling: - group-by: ["region", "year"] # lots of samples have no "month" so in order to include them, we'll only go by "year" - sequences-per-group: 2 - query: "(country != '{country}')" - min-length: 100000 + group_by: "region year" # VOODOO [copied Jess approach and comment] lots of samples have no "month" so in order to include them, we'll only go by "year" + seq_per_group: 2 + query: --query "(country != '{country}')" + ######################## TARGETED: focal: - exclude-all: true - + exclude: "--exclude-all" + closest: - subsample-max-sequences: 100 # this changes with number of samples in include.txt and that's good + max_sequences: 100 priorities: type: "proximity" focus: "focal" - min-length: 100000 group: - subsample-max-sequences: 25 - query: 
"(location == '{location}') & (division == '{division}')" + group_by: "year month" + max_sequences: 25 + query: --query "(location == '{location}') & (division == '{division}')" priorities: type: "proximity" focus: "focal" - min-length: 100000 - + state: - subsample-max-sequences: 25 - query: "(location != '{location}') & (division == '{division}')" # exclude add'l samples from {location} + group_by: "location year month" + max_sequences: 25 + query: --query "(location != '{location}') & (division == '{division}')" # exclude add'l samples from {location} priorities: type: "proximity" focus: "focal" - min-length: 100000 country: - subsample-max-sequences: 25 - query: "(division != '{division}') & (country == '{country}')" # exclude add'l samples from CA + group_by: "division year month" + max_sequences: 25 + query: --query "(division != '{division}') & (country == '{country}')" # exclude add'l samples from CA priorities: type: "proximity" focus: "focal" - min-length: 100000 - + international: - subsample-max-sequences: 25 - query: "(country != '{country}')" # this should capture samples that have no division or location info + group_by: "region year" # VOODOO [copied Jess approach and comment] lots of samples have no "month" so in order to include them, we'll only go by "year" + max_sequences: 25 + query: --query "(country != '{country}')" # VOODOO: huh? Original comment: "exclude add'l samples from USA" priorities: - type: "proximity" - focus: "focal" - min-length: 100000 - + type: "proximity" + focus: "focal" + international_serial_sampling: - group-by: ["region", "year"] # lots of samples have no "month" so in order to include them, we'll only go by "year" - sequences-per-group: 2 - query: "(country != '{country}')" - min-length: 100000 - - + group_by: "region year" # VOODOO [copied Jess approach and comment] lots of samples have no "month" so in order to include them, we'll only go by "year" + seq_per_group: 2 + query: --query "(country != '{country}')" + + ######################## NON_CONTEXTUALIZED: group: - group-by: - - "year" - subsample-max-sequences: 1000 - query: "(location == '{location}') & (division == '{division}')" - min-length: 100000 + group_by: "year month" # VOODOO: old MPX did just year, not year+month + max_sequences: 2000 + query: --query "(location == '{location}') & (division == '{division}')" diff --git a/src/backend/aspen/workflows/nextstrain_run/export.py b/src/backend/aspen/workflows/nextstrain_run/export.py index 1e00c5456..ffc522d2c 100644 --- a/src/backend/aspen/workflows/nextstrain_run/export.py +++ b/src/backend/aspen/workflows/nextstrain_run/export.py @@ -153,6 +153,9 @@ def cli( # For local debugging of our yaml building process. +# Would be better to re-structure the main `export_run_config` process so yaml +# output happens earlier and we just exit early if --builds-file-only flag is +# on rather than having a separate code path for that flag being on. 
def dump_yaml_template( phylo_run_id: int, builds_file_fh: io.TextIOWrapper, From b536f2d1182b4783ec70c760ac5acf52a9828424 Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Fri, 30 Aug 2024 16:54:16 -0700 Subject: [PATCH 03/13] WIP: Address notes from talking with Dan --- .../builds_templates/MPX_template.yaml | 26 +++++++------------ 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml b/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml index cbf0fe02b..cdcbcc914 100644 --- a/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml +++ b/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml @@ -1,5 +1,5 @@ builds: - aspen_mpx: + aspen: region: global country: {country} division: {division} @@ -9,7 +9,7 @@ builds: # make sure build_name matches the sub-key of `builds` above. # Not sure how to consolidate it, but not going to try to figure it out now. -build_name: "aspen_mpx" +build_name: "aspen" auspice_name: "monkeypox_mpxv" # VOODOO Unclear if this gets used or will be overwritten reference: "defaults/reference.fasta" @@ -18,8 +18,8 @@ genbank_reference: "defaults/reference.gb" include: "data/include.txt" # add this to our template clades: "defaults/clades.tsv" lat_longs: "defaults/lat_longs.tsv" -auspice_config: "defaults/hmpxv1/auspice_config.json" # replace this with our own -description: "defaults/description.md" # replace this with our own +auspice_config: "defaults/hmpxv1/auspice_config.json" # VOODOO port over Jess' version of auspice_config.json +description: "defaults/description.md" # VOODOO Port over Jess' current description, Dan will follow up with more edits tree_mask: "defaults/tree_mask.tsv" strain_id_field: "accession" @@ -37,7 +37,7 @@ seed_spacing: 1000 fix_tree: true ## refine -timetree: true +timetree: false root: "min_dev" clock_rate: 3e-6 clock_std_dev: 6e-6 @@ -56,19 +56,17 @@ mask: maskfile: "defaults/mask_overview.bed" priorities: - crowding_penalty: 0 + crowding_penalty: 0 # Gets set by treetype during `export.py` ## Subsampling schemas subsampling: OVERVIEW: group: - group_by: "year month" max_sequences: 2000 query: --query "(location == '{location}') & (division == '{division}')" state: - group_by: "location year month" max_sequences: 500 query: --query "(location != '{location}') & (division == '{division}')" # exclude add'l samples from {location} priorities: @@ -76,7 +74,6 @@ subsampling: focus: "group" country: - group_by: "division year month" max_sequences: 400 query: --query "(division != '{division}') & (country == '{country}')" # exclude add'l samples from CA priorities: @@ -84,7 +81,6 @@ subsampling: focus: "group" international: - group_by: "region year" # VOODOO [copied Jess approach and comment] lots of samples have no "month" so in order to include them, we'll only go by "year" max_sequences: 100 query: --query "(country != '{country}')" # exclude add'l samples from USA priorities: @@ -92,7 +88,7 @@ subsampling: focus: "group" international_serial_sampling: - group_by: "region year" # VOODOO [copied Jess approach and comment] lots of samples have no "month" so in order to include them, we'll only go by "year" + group_by: "region year" # lots of samples have no "month" so in order to include them, we'll only go by "year" seq_per_group: 2 query: --query "(country != '{country}')" @@ -109,7 +105,6 @@ subsampling: focus: "focal" group: - group_by: "year month" max_sequences: 25 
query: --query "(location == '{location}') & (division == '{division}')" priorities: @@ -117,7 +112,6 @@ subsampling: focus: "focal" state: - group_by: "location year month" max_sequences: 25 query: --query "(location != '{location}') & (division == '{division}')" # exclude add'l samples from {location} priorities: @@ -125,7 +119,6 @@ subsampling: focus: "focal" country: - group_by: "division year month" max_sequences: 25 query: --query "(division != '{division}') & (country == '{country}')" # exclude add'l samples from CA priorities: @@ -133,7 +126,6 @@ subsampling: focus: "focal" international: - group_by: "region year" # VOODOO [copied Jess approach and comment] lots of samples have no "month" so in order to include them, we'll only go by "year" max_sequences: 25 query: --query "(country != '{country}')" # VOODOO: huh? Original comment: "exclude add'l samples from USA" priorities: @@ -141,7 +133,7 @@ subsampling: focus: "focal" international_serial_sampling: - group_by: "region year" # VOODOO [copied Jess approach and comment] lots of samples have no "month" so in order to include them, we'll only go by "year" + group_by: "region year" # lots of samples have no "month" so in order to include them, we'll only go by "year" seq_per_group: 2 query: --query "(country != '{country}')" @@ -149,6 +141,6 @@ subsampling: NON_CONTEXTUALIZED: group: - group_by: "year month" # VOODOO: old MPX did just year, not year+month + group_by: "year" max_sequences: 2000 query: --query "(location == '{location}') & (division == '{division}')" From 23c499484d8f7ac5c48cc48f3e3fcc33bd3dd151 Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Fri, 30 Aug 2024 16:55:40 -0700 Subject: [PATCH 04/13] WIP: Adjust `max_sequences` to match old MPX template --- .../nextstrain_run/builds_templates/MPX_template.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml b/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml index cdcbcc914..8da8d7cfe 100644 --- a/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml +++ b/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml @@ -63,25 +63,25 @@ subsampling: OVERVIEW: group: - max_sequences: 2000 + max_sequences: 500 query: --query "(location == '{location}') & (division == '{division}')" state: - max_sequences: 500 + max_sequences: 300 query: --query "(location != '{location}') & (division == '{division}')" # exclude add'l samples from {location} priorities: type: "proximity" focus: "group" country: - max_sequences: 400 + max_sequences: 300 query: --query "(division != '{division}') & (country == '{country}')" # exclude add'l samples from CA priorities: type: "proximity" focus: "group" international: - max_sequences: 100 + max_sequences: 300 query: --query "(country != '{country}')" # exclude add'l samples from USA priorities: type: "proximity" @@ -142,5 +142,5 @@ subsampling: NON_CONTEXTUALIZED: group: group_by: "year" - max_sequences: 2000 + max_sequences: 1000 query: --query "(location == '{location}') & (division == '{division}')" From 82cdca1bae178779afe7119bb5f2f26c162c6245 Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Tue, 3 Sep 2024 16:03:54 -0700 Subject: [PATCH 05/13] WIP: Keep `builds` in MPX output template --- .../build_plugins/pathogen_plugins.py | 15 +++------------ .../builds_templates/MPX_template.yaml | 4 ++-- 2 files changed, 5 insertions(+), 14 deletions(-) diff 
--git a/src/backend/aspen/workflows/nextstrain_run/build_plugins/pathogen_plugins.py b/src/backend/aspen/workflows/nextstrain_run/build_plugins/pathogen_plugins.py index 23f8b0fd2..f42e9b6b7 100644 --- a/src/backend/aspen/workflows/nextstrain_run/build_plugins/pathogen_plugins.py +++ b/src/backend/aspen/workflows/nextstrain_run/build_plugins/pathogen_plugins.py @@ -14,14 +14,9 @@ def update_config(self, config): class MPXPlugin(PathogenPlugin): def update_config(self, config): - build_config = {} - try: - build_config = config["builds"]["aspen"] - config["subsampling_scheme"] = build_config["subsampling_scheme"] - del config["builds"] - except KeyError: - pass - subsampling_scheme = config["subsampling_scheme"] + build_config = config["builds"]["aspen"] + subsampling_scheme = build_config["subsampling_scheme"] + # Create escaped single quotes for interpolation into `--query` sections. escaped_config = {} for k, v in build_config.items(): if type(v) == str: @@ -31,7 +26,3 @@ def update_config(self, config): for _, sample in config["subsampling"][subsampling_scheme].items(): if sample.get("query"): sample["query"] = sample["query"].format(**escaped_config) - if sample.get("max_sequences"): - sample["subsample-max-sequences"] = sample["max_sequences"] - del sample["max_sequences"] - config["subsampling"] = config["subsampling"][subsampling_scheme] diff --git a/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml b/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml index 8da8d7cfe..57fab73bc 100644 --- a/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml +++ b/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml @@ -5,12 +5,12 @@ builds: division: {division} location: {location} subsampling_scheme: {tree_type} - title: CZ Gen Epi MPX Tree # VOODOO verify gets replaced with a more specific title in builder_base.py + title: CZ Gen Epi MPX Tree # make sure build_name matches the sub-key of `builds` above. # Not sure how to consolidate it, but not going to try to figure it out now. 
build_name: "aspen" -auspice_name: "monkeypox_mpxv" # VOODOO Unclear if this gets used or will be overwritten +auspice_name: "monkeypox_mpxv" reference: "defaults/reference.fasta" genome_annotation: "defaults/genome_annotation.gff3" From b2c5b9ef102db04f478712bc84979a5179454189 Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Wed, 4 Sep 2024 15:12:28 -0700 Subject: [PATCH 06/13] Remove no longer needed comments --- .../nextstrain_run/builds_templates/MPX_template.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml b/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml index 57fab73bc..9f134dc25 100644 --- a/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml +++ b/src/backend/aspen/workflows/nextstrain_run/builds_templates/MPX_template.yaml @@ -18,8 +18,8 @@ genbank_reference: "defaults/reference.gb" include: "data/include.txt" # add this to our template clades: "defaults/clades.tsv" lat_longs: "defaults/lat_longs.tsv" -auspice_config: "defaults/hmpxv1/auspice_config.json" # VOODOO port over Jess' version of auspice_config.json -description: "defaults/description.md" # VOODOO Port over Jess' current description, Dan will follow up with more edits +auspice_config: "defaults/legacy_auspice_config_mpxv.json" +description: "defaults/description.md" tree_mask: "defaults/tree_mask.tsv" strain_id_field: "accession" @@ -127,7 +127,7 @@ subsampling: international: max_sequences: 25 - query: --query "(country != '{country}')" # VOODOO: huh? Original comment: "exclude add'l samples from USA" + query: --query "(country != '{country}')" # [Vince] huh? Original comment: "exclude add'l samples from USA" priorities: type: "proximity" focus: "focal" From 7ca2fafc8217c998d0bbd19e9b4ab21d85292f9e Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Wed, 4 Sep 2024 15:27:43 -0700 Subject: [PATCH 07/13] Convert to being compatible with new mpox build --- src/backend/Dockerfile.nextstrain | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/backend/Dockerfile.nextstrain b/src/backend/Dockerfile.nextstrain index d0f0e136b..cc4750f1b 100644 --- a/src/backend/Dockerfile.nextstrain +++ b/src/backend/Dockerfile.nextstrain @@ -56,13 +56,14 @@ RUN mkdir /ncov && \ # Add support for our custom mpox workflow # TODO [Vincent & Dan; Aug 2024]: Update the `mpox` workflow content. +# TODO Convert to a specifc commit rather than overall subsample_by_distance branch RUN mkdir /mpox && \ cd /mpox && \ git init && \ - git remote add origin https://github.com/chanzuckerberg/monkeypox.git && \ - git fetch origin subsampling && \ - git reset --hard fd74f4b5f219035c9cbb7909b6f84f8a06fda76d -RUN chown nextstrain:nextstrain /mpox/config/exclude_accessions_mpxv.txt /mpox/config/clades.tsv + git remote add origin https://github.com/chanzuckerberg/mpox.git && \ + git fetch origin subsample_by_distance && \ + git reset --hard origin/subsample_by_distance +RUN chown nextstrain:nextstrain /mpox/phylogenetic/defaults/exclude_accessions.txt /mpox/phylogenetic/defaults/clades.tsv RUN mkdir -p /ncov/auspice RUN mkdir -p /ncov/logs @@ -83,7 +84,8 @@ COPY . . # Install the aspen package RUN poetry install RUN chmod a+w /ncov/auspice /ncov/logs -RUN chmod a+w /mpox/ /mpox/config +# [Vince] I'm not totally sure we need all three of these, but let's start with them. 
+RUN chmod a+w /mpox/ /mpox/phylogenetic /mpox/phylogenetic/defaults # TODO - Mismatch between poetry and augur deps forces us to manually install jsonschema v3 here RUN pip install jsonschema==3.* From 5ddc542df652e91edefc56237d1d83c5af6506f9 Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Wed, 4 Sep 2024 16:31:04 -0700 Subject: [PATCH 08/13] Modify paths to use latest mpox workflow format --- .../nextstrain_run/run_nextstrain_mpx.sh | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_mpx.sh b/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_mpx.sh index 5b1a0e0e3..e77ff685f 100755 --- a/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_mpx.sh +++ b/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_mpx.sh @@ -33,10 +33,10 @@ set -x # Download the latest mpox exclusions and clades list. This happens at RUN time, not BUILD time so that # we are always building trees with the latest upstream filters. -wget https://raw.githubusercontent.com/nextstrain/mpox/master/phylogenetic/defaults/exclude_accessions.txt -O /mpox/config/exclude_accessions_mpxv.txt -wget https://raw.githubusercontent.com/nextstrain/mpox/master/phylogenetic/defaults/clades.tsv -O /mpox/config/clades.tsv +wget https://raw.githubusercontent.com/nextstrain/mpox/master/phylogenetic/defaults/exclude_accessions.txt -O /mpox/phylogenetic/defaults/exclude_accessions.txt +wget https://raw.githubusercontent.com/nextstrain/mpox/master/phylogenetic/defaults/clades.tsv -O /mpox/phylogenetic/defaults/clades.tsv -mkdir -p /mpox/data +mkdir -p /mpox/phylogenetic/data key_prefix="phylo_run/${S3_FILESTEM}/${WORKFLOW_ID}" s3_prefix="s3://${aspen_s3_db_bucket}/${key_prefix}" @@ -52,12 +52,12 @@ mpox_git_rev=$(cd /mpox && git rev-parse HEAD) aligned_upstream_location=$( python3 /usr/src/app/aspen/workflows/nextstrain_run/export.py \ --phylo-run-id "${WORKFLOW_ID}" \ - --sequences /mpox/data/sequences_czge.fasta \ - --metadata /mpox/data/metadata_czge.tsv \ - --selected /mpox/data/include.txt \ + --sequences /mpox/phylogenetic/data/sequences_czge.fasta \ + --metadata /mpox/phylogenetic/data/metadata_czge.tsv \ + --selected /mpox/phylogenetic/data/include.txt \ --sequence-type aligned \ --resolved-template-args "${RESOLVED_TEMPLATE_ARGS_SAVEFILE}" \ - --builds-file /mpox/config/build_czge.yaml \ + --builds-file /mpox/phylogenetic/build_czge.yaml \ --reset-status ) @@ -66,33 +66,33 @@ aligned_upstream_sequences_s3_key=$(echo "${aligned_upstream_location}" | jq -r aligned_upstream_metadata_s3_key=$(echo "${aligned_upstream_location}" | jq -r .metadata_key) # fetch the upstream dataset -if [ ! -e /mpox/data/upstream_sequences.fasta ]; then - $aws s3 cp --no-progress "s3://${aligned_upstream_s3_bucket}/${aligned_upstream_sequences_s3_key}" /mpox/data/upstream_sequences.fasta.xz - unxz /mpox/data/*.xz +if [ ! -e /mpox/phylogenetic/data/upstream_sequences.fasta ]; then + $aws s3 cp --no-progress "s3://${aligned_upstream_s3_bucket}/${aligned_upstream_sequences_s3_key}" /mpox/phylogenetic/data/upstream_sequences.fasta.xz + unxz /mpox/phylogenetic/data/*.xz fi -if [ ! -e /mpox/data/upstream_metadata.tsv ]; then - $aws s3 cp --no-progress "s3://${aligned_upstream_s3_bucket}/${aligned_upstream_metadata_s3_key}" /mpox/data/upstream_metadata.tsv.xz - unxz /mpox/data/*.xz +if [ ! 
-e /mpox/phylogenetic/data/upstream_metadata.tsv ]; then + $aws s3 cp --no-progress "s3://${aligned_upstream_s3_bucket}/${aligned_upstream_metadata_s3_key}" /mpox/phylogenetic/data/upstream_metadata.tsv.xz + unxz /mpox/phylogenetic/data/*.xz fi # If we've written out any samples, add them to the upstream metadata/fasta files -if [ -e /mpox/data/sequences_czge.fasta ]; then - python3 /usr/src/app/aspen/workflows/nextstrain_run/merge_mpx.py --required-metadata /mpox/data/metadata_czge.tsv --required-sequences /mpox/data/sequences_czge.fasta --upstream-metadata /mpox/data/upstream_metadata.tsv --upstream-sequences /mpox/data/upstream_sequences.fasta --destination-metadata /mpox/data/metadata.tsv --destination-sequences /mpox/data/sequences.fasta --required-match-column strain --upstream-match-column accession +if [ -e /mpox/phylogenetic/data/sequences_czge.fasta ]; then + python3 /usr/src/app/aspen/workflows/nextstrain_run/merge_mpx.py --required-metadata /mpox/phylogenetic/data/metadata_czge.tsv --required-sequences /mpox/phylogenetic/data/sequences_czge.fasta --upstream-metadata /mpox/phylogenetic/data/upstream_metadata.tsv --upstream-sequences /mpox/phylogenetic/data/upstream_sequences.fasta --destination-metadata /mpox/phylogenetic/data/metadata.tsv --destination-sequences /mpox/phylogenetic/data/sequences.fasta --required-match-column strain --upstream-match-column accession else - cp /mpox/data/upstream_metadata.tsv /mpox/data/metadata.tsv - cp /mpox/data/upstream_sequences.fasta /mpox/data/sequences.fasta + cp /mpox/phylogenetic/data/upstream_metadata.tsv /mpox/phylogenetic/data/metadata.tsv + cp /mpox/phylogenetic/data/upstream_sequences.fasta /mpox/phylogenetic/data/sequences.fasta fi; # Persist the build config we generated. -$aws s3 cp /mpox/config/build_czge.yaml "${s3_prefix}/build_czge.yaml" -$aws s3 cp /mpox/data/include.txt "${s3_prefix}/include.txt" +$aws s3 cp /mpox/phylogenetic/build_czge.yaml "${s3_prefix}/build_czge.yaml" +$aws s3 cp /mpox/phylogenetic/data/include.txt "${s3_prefix}/include.txt" # run snakemake, if run fails export the logs from snakemake to s3 -(cd /mpox && snakemake --printshellcmds --configfile config/build_czge.yaml --resources=mem_mb=312320) || { $aws s3 cp /mpox/.snakemake/log/ "${s3_prefix}/logs/snakemake/" --recursive ; $aws s3 cp /mpox/results/mpxv/filter.log "${s3_prefix}/logs/mpox/" --recursive ; } +(cd /mpox/phylogenetic && snakemake --printshellcmds --configfile build_czge.yaml --resources=mem_mb=312320) || { $aws s3 cp /mpox/phylogenetic/.snakemake/log/ "${s3_prefix}/logs/snakemake/" --recursive ; $aws s3 cp /mpox/phylogenetic/results/mpxv/filter.log "${s3_prefix}/logs/mpox/" --recursive ; } # upload the tree to S3. 
The variable key is created to use later key="${key_prefix}/mpx_czge.json" -$aws s3 cp /mpox/auspice/monkeypox_mpxv.json "s3://${aspen_s3_db_bucket}/${key}" +$aws s3 cp /mpox/phylogenetic/auspice/monkeypox_mpxv.json "s3://${aspen_s3_db_bucket}/${key}" # update aspen aspen_workflow_rev=WHATEVER @@ -111,4 +111,4 @@ python3 /usr/src/app/aspen/workflows/nextstrain_run/save.py \ --bucket "${aspen_s3_db_bucket}" \ --key "${key}" \ --resolved-template-args "${RESOLVED_TEMPLATE_ARGS_SAVEFILE}" \ - --tree-path /mpox/auspice/monkeypox_mpxv.json \ + --tree-path /mpox/phylogenetic/auspice/monkeypox_mpxv.json \ From ac6bab2e2f01e984d7266fd34b33e04744714ddd Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Wed, 4 Sep 2024 17:16:40 -0700 Subject: [PATCH 09/13] Lint roller --- src/backend/aspen/workflows/nextstrain_run/export.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/backend/aspen/workflows/nextstrain_run/export.py b/src/backend/aspen/workflows/nextstrain_run/export.py index ffc522d2c..0140ce655 100644 --- a/src/backend/aspen/workflows/nextstrain_run/export.py +++ b/src/backend/aspen/workflows/nextstrain_run/export.py @@ -179,7 +179,11 @@ def dump_yaml_template( session, phylo_run.pathogen, phylo_run.template_args, group ) builder: TemplateBuilder = TemplateBuilder( - phylo_run.tree_type, phylo_run.pathogen, phylo_run.group, resolved_template_args, **context + phylo_run.tree_type, + phylo_run.pathogen, + phylo_run.group, + resolved_template_args, + **context, ) builder.write_file(builds_file_fh) From 5e9eb058d6684d372eebb33abd16fa903797d09d Mon Sep 17 00:00:00 2001 From: Dan Lu <20667188+danrlu@users.noreply.github.com> Date: Thu, 5 Sep 2024 10:36:14 -0500 Subject: [PATCH 10/13] Update run_nextstrain_mpx.sh --- .../aspen/workflows/nextstrain_run/run_nextstrain_mpx.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_mpx.sh b/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_mpx.sh index e77ff685f..abe63ca37 100755 --- a/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_mpx.sh +++ b/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_mpx.sh @@ -88,7 +88,7 @@ $aws s3 cp /mpox/phylogenetic/build_czge.yaml "${s3_prefix}/build_czge.yaml" $aws s3 cp /mpox/phylogenetic/data/include.txt "${s3_prefix}/include.txt" # run snakemake, if run fails export the logs from snakemake to s3 -(cd /mpox/phylogenetic && snakemake --printshellcmds --configfile build_czge.yaml --resources=mem_mb=312320) || { $aws s3 cp /mpox/phylogenetic/.snakemake/log/ "${s3_prefix}/logs/snakemake/" --recursive ; $aws s3 cp /mpox/phylogenetic/results/mpxv/filter.log "${s3_prefix}/logs/mpox/" --recursive ; } +(cd /mpox/phylogenetic && snakemake --printshellcmds --configfile build_czge.yaml --resources=mem_mb=312320) || { $aws s3 cp /mpox/phylogenetic/.snakemake/log/ "${s3_prefix}/logs/snakemake/" --recursive ; $aws s3 cp /mpox/phylogenetic/results/aspen/logs/ "${s3_prefix}/logs/mpox/" --recursive ; } # upload the tree to S3. 
The variable key is created to use later key="${key_prefix}/mpx_czge.json" From 3c09812bd576932a55744a3e7c9a082ac577254d Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Thu, 5 Sep 2024 12:13:57 -0700 Subject: [PATCH 11/13] Fix test after mpox config format changes --- .../nextstrain_run/tests/test_mpx_export.py | 98 +++++++++---------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/src/backend/aspen/workflows/nextstrain_run/tests/test_mpx_export.py b/src/backend/aspen/workflows/nextstrain_run/tests/test_mpx_export.py index 6932178e2..3be3caebf 100644 --- a/src/backend/aspen/workflows/nextstrain_run/tests/test_mpx_export.py +++ b/src/backend/aspen/workflows/nextstrain_run/tests/test_mpx_export.py @@ -143,13 +143,13 @@ def test_overview_config_no_filters(mocker, session, postgres_database, split_cl phylo_run, location = create_test_data(session, split_client, tree_type, 10, 0, 0) sequences, selected, metadata, nextstrain_config = generate_run(phylo_run.id) - subsampling_scheme = nextstrain_config["subsampling"] + subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] # Just some placeholder sanity-checks - assert subsampling_scheme["group"]["subsample-max-sequences"] == 500 + assert subsampling_scheme["group"]["max_sequences"] == 500 assert ( subsampling_scheme["group"]["query"] - == f"(location == '{location.location}') & (division == '{location.division}')" + == f'''--query "(location == '{location.location}') & (division == '{location.division}')"''' ) assert "min-date" not in subsampling_scheme["group"] assert "max-date" not in subsampling_scheme["group"] @@ -168,27 +168,25 @@ def test_overview_config_ondemand(mocker, session, postgres_database, split_clie query = { "filter_start_date": "2021-04-30", "filter_end_date": "10 days ago", - "filter_pango_lineages": ["AY", "B.1.116"], } phylo_run, location = create_test_data( session, split_client, tree_type, 10, 5, 5, template_args=query ) sequences, selected, metadata, nextstrain_config = generate_run(phylo_run.id) - subsampling_scheme = nextstrain_config["subsampling"] + subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] max_date = dateparser.parse("10 days ago").strftime("%Y-%m-%d") # Order does not matter for lineages, just verify matched sets. 
- assert subsampling_scheme["group"]["min-date"] == "2021-04-30" - assert subsampling_scheme["group"]["max-date"] == f"{max_date}" + assert subsampling_scheme["group"]["min_date"] == "--min-date 2021-04-30" + assert subsampling_scheme["group"]["max_date"] == f"--max-date {max_date}" assert ( - subsampling_scheme["international_serial_sampling"]["max-date"] == f"{max_date}" + subsampling_scheme["international_serial_sampling"]["max_date"] == f"--max-date {max_date}" ) - assert subsampling_scheme["group"]["subsample-max-sequences"] == 500 - filter_pango_lineages = "['" + "', '".join(query["filter_pango_lineages"]) + "']" + assert subsampling_scheme["group"]["max_sequences"] == 500 assert ( subsampling_scheme["group"]["query"] - == f"(location == '{location.location}') & (division == '{location.division}') & (lineage in {filter_pango_lineages})" + == f'''--query "(location == '{location.location}') & (division == '{location.division}')"''' ) assert len(selected.splitlines()) == 10 # 5 gisaid samples + 5 selected samples assert len(metadata.splitlines()) == 11 # 10 samples + 1 header line @@ -219,14 +217,14 @@ def test_overview_config_chicago(mocker, session, postgres_database, split_clien ) sequences, selected, metadata, nextstrain_config = generate_run(phylo_run.id) - subsampling_scheme = nextstrain_config["subsampling"] + subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] # Make sure our query got updated properly assert ( subsampling_scheme["group"]["query"] - == f"((location == '{location.location}') & (division == '{location.division}')) | submitting_lab == 'RIPHL at Rush University Medical Center'" + == f'''--query "((location == '{location.location}') & (division == '{location.division}')) | submitting_lab == 'RIPHL at Rush University Medical Center'"''' ) - assert subsampling_scheme["group"]["subsample-max-sequences"] == 500 + assert subsampling_scheme["group"]["max_sequences"] == 500 assert len(selected.splitlines()) == 0 # No selected sequences assert len(metadata.splitlines()) == 11 # 10 samples + 1 header line assert len(sequences.splitlines()) == 20 # 10 county samples, @2 lines each @@ -240,10 +238,10 @@ def test_non_contextualized_config(mocker, session, postgres_database, split_cli phylo_run, location = create_test_data(session, split_client, tree_type, 10, 5, 5) sequences, selected, metadata, nextstrain_config = generate_run(phylo_run.id) - subsampling_scheme = nextstrain_config["subsampling"] + subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] # Just some placeholder sanity-checks - assert subsampling_scheme["group"]["subsample-max-sequences"] == 1000 + assert subsampling_scheme["group"]["max_sequences"] == 1000 assert len(selected.splitlines()) == 10 # 5 gisaid samples + 5 selected samples assert len(metadata.splitlines()) == 11 # 10 samples + 1 header line assert len(sequences.splitlines()) == 20 # 10 county samples, @2 lines each @@ -281,21 +279,21 @@ def test_non_contextualized_regions(mocker, session, postgres_database, split_cl }.items(): sequences, selected, metadata, nextstrain_config = generate_run(run.id) - subsampling_scheme = nextstrain_config["subsampling"] + subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] if run_type == "state": assert ( subsampling_scheme["group"]["query"] - == f"(division == '{state_location.division}') & (country == '{state_location.country}')" + == f'''--query "(division == '{state_location.division}') & (country == '{state_location.country}')"''' ) else: assert ( 
subsampling_scheme["group"]["query"] - == f"(country == '{country_location.country}')" + == f'''--query "(country == '{country_location.country}')"''' ) # Just some placeholder sanity-checks - assert subsampling_scheme["group"]["subsample-max-sequences"] == 1000 + assert subsampling_scheme["group"]["max_sequences"] == 1000 assert len(selected.splitlines()) == 10 # 5 gisaid samples + 5 selected samples assert len(metadata.splitlines()) == 11 # 10 samples + 1 header line assert len(sequences.splitlines()) == 20 # 10 county samples, @2 lines each @@ -309,14 +307,14 @@ def test_targeted_config_simple(mocker, session, postgres_database, split_client phylo_run, location = create_test_data(session, split_client, tree_type, 10, 5, 5) sequences, selected, metadata, nextstrain_config = generate_run(phylo_run.id) - subsampling_scheme = nextstrain_config["subsampling"] + subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] # Just some placeholder sanity-checks - assert subsampling_scheme["closest"]["subsample-max-sequences"] == 100 - assert subsampling_scheme["group"]["subsample-max-sequences"] == 50 - assert subsampling_scheme["state"]["subsample-max-sequences"] == 50 - assert subsampling_scheme["country"]["subsample-max-sequences"] == 25 - assert subsampling_scheme["international"]["subsample-max-sequences"] == 25 + assert subsampling_scheme["closest"]["max_sequences"] == 100 + assert subsampling_scheme["group"]["max_sequences"] == 50 + assert subsampling_scheme["state"]["max_sequences"] == 50 + assert subsampling_scheme["country"]["max_sequences"] == 25 + assert subsampling_scheme["international"]["max_sequences"] == 25 assert len(selected.splitlines()) == 10 # 5 gisaid samples + 5 selected samples assert len(metadata.splitlines()) == 11 # 10 samples + 1 header line assert len(sequences.splitlines()) == 20 # 10 county samples, @2 lines each @@ -354,26 +352,26 @@ def test_targeted_config_regions(mocker, session, postgres_database, split_clien }.items(): sequences, selected, metadata, nextstrain_config = generate_run(run.id) - subsampling_scheme = nextstrain_config["subsampling"] + subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] if run_type == "state": assert "state" not in subsampling_scheme.keys() assert ( subsampling_scheme["group"]["query"] - == f"(division == '{state_location.division}') & (country == '{state_location.country}')" + == f'''--query "(division == '{state_location.division}') & (country == '{state_location.country}')"''' ) else: assert "state" not in subsampling_scheme.keys() assert "country" not in subsampling_scheme.keys() assert ( subsampling_scheme["group"]["query"] - == f"(country == '{country_location.country}')" + == f'''--query "(country == '{country_location.country}')"''' ) # Just some placeholder sanity-checks - assert subsampling_scheme["closest"]["subsample-max-sequences"] == 100 - assert subsampling_scheme["group"]["subsample-max-sequences"] == 50 - assert subsampling_scheme["international"]["subsample-max-sequences"] == 100 + assert subsampling_scheme["closest"]["max_sequences"] == 100 + assert subsampling_scheme["group"]["max_sequences"] == 50 + assert subsampling_scheme["international"]["max_sequences"] == 100 assert len(selected.splitlines()) == 10 # 5 gisaid samples + 5 selected samples assert len(metadata.splitlines()) == 11 # 10 samples + 1 header line assert len(sequences.splitlines()) == 20 # 10 county samples, @2 lines each @@ -416,14 +414,14 @@ def test_targeted_config_large(mocker, session, postgres_database, 
split_client) ) sequences, selected, metadata, nextstrain_config = generate_run(phylo_run.id) - subsampling_scheme = nextstrain_config["subsampling"] + subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] # Just some placeholder sanity-checks - assert subsampling_scheme["closest"]["subsample-max-sequences"] == 120 - assert subsampling_scheme["group"]["subsample-max-sequences"] == 60 - assert subsampling_scheme["state"]["subsample-max-sequences"] == 60 - assert subsampling_scheme["country"]["subsample-max-sequences"] == 30 - assert subsampling_scheme["international"]["subsample-max-sequences"] == 30 + assert subsampling_scheme["closest"]["max_sequences"] == 120 + assert subsampling_scheme["group"]["max_sequences"] == 60 + assert subsampling_scheme["state"]["max_sequences"] == 60 + assert subsampling_scheme["country"]["max_sequences"] == 30 + assert subsampling_scheme["international"]["max_sequences"] == 30 assert len(selected.splitlines()) == 120 # 10 gisaid samples + 110 selected samples assert len(metadata.splitlines()) == 201 # 200 samples + 1 header line assert len(sequences.splitlines()) == 400 # 200 county samples, @2 lines each @@ -469,15 +467,15 @@ def test_overview_config_division(mocker, session, postgres_database, split_clie group_location="", ) sequences, selected, metadata, nextstrain_config = generate_run(phylo_run.id) - subsampling_scheme = nextstrain_config["subsampling"] + subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] # Make sure our query got updated properly - assert subsampling_scheme["country"]["subsample-max-sequences"] == 800 - assert subsampling_scheme["international"]["subsample-max-sequences"] == 200 + assert subsampling_scheme["country"]["max_sequences"] == 800 + assert subsampling_scheme["international"]["max_sequences"] == 200 assert "state" not in subsampling_scheme.keys() assert ( subsampling_scheme["group"]["query"] - == f"(division == '{location.division}') & (country == '{location.country}')" + == f'''--query "(division == '{location.division}') & (country == '{location.country}')"''' ) @@ -498,14 +496,16 @@ def test_overview_config_country(mocker, session, postgres_database, split_clien group_division="", ) sequences, selected, metadata, nextstrain_config = generate_run(phylo_run.id) - subsampling_scheme = nextstrain_config["subsampling"] + subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] # Make sure our query got updated properly assert "state" not in subsampling_scheme.keys() assert "country" not in subsampling_scheme.keys() - assert subsampling_scheme["international"]["subsample-max-sequences"] == 1000 - assert subsampling_scheme["group"]["query"] == f"(country == '{location.country}')" - + assert subsampling_scheme["international"]["max_sequences"] == 1000 + assert ( + subsampling_scheme["group"]["query"] + == f'''--query "(country == '{location.country}')"''' + ) # make sure we handle quotes sanely!!! 
def test_string_escapes(mocker, session, postgres_database, split_client): @@ -524,14 +524,14 @@ def test_string_escapes(mocker, session, postgres_database, split_client): group_division="A'Zaz", ) sequences, selected, metadata, nextstrain_config = generate_run(run.id) - subsampling_scheme = nextstrain_config["subsampling"] + subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] assert ( subsampling_scheme["group"]["query"] - == "(location == 'Cote d\\'Ivoire') & (division == 'A\\'Zaz')" + == '''--query "(location == 'Cote d\\'Ivoire') & (division == 'A\\'Zaz')"''' ) # Just some placeholder sanity-checks - assert subsampling_scheme["group"]["subsample-max-sequences"] == 1000 + assert subsampling_scheme["group"]["max_sequences"] == 1000 assert len(selected.splitlines()) == 10 # 5 gisaid samples + 5 selected samples assert len(metadata.splitlines()) == 11 # 10 samples + 1 header line assert len(sequences.splitlines()) == 20 # 10 county samples, @2 lines each From 466515a0af6b46f40661d67b05a02725b1f0e5c6 Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Thu, 5 Sep 2024 13:15:57 -0700 Subject: [PATCH 12/13] Remove arguments in filter differentiation --- .../build_plugins/type_plugins.py | 51 +++++++++---------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/src/backend/aspen/workflows/nextstrain_run/build_plugins/type_plugins.py b/src/backend/aspen/workflows/nextstrain_run/build_plugins/type_plugins.py index 636bfab0b..df8fe2005 100644 --- a/src/backend/aspen/workflows/nextstrain_run/build_plugins/type_plugins.py +++ b/src/backend/aspen/workflows/nextstrain_run/build_plugins/type_plugins.py @@ -271,43 +271,40 @@ def update_subsampling_for_division(subsampling): def apply_filters(config, subsampling, template_args): - # MPX format - include_arguments_in_filters = False - lineage_field = "lineage" - if "--query" in subsampling["group"]["query"]: - # SC2 format - include_arguments_in_filters = True - lineage_field = "pango_lineage" - + """NOTE: The filters do NOT currently support filtering MPX by lineage. + + It probably would not be a big lift to change the config builder to support + lineage filtering on mpox, but it's unclear if that should happen here in the + tree type plugins where we handle the rest of the filters, or somehow get + shoehorned into the pathogen plugins instead. + Regardless, the filters currently only support "pango_lineage" for SC2 lineage + filtering. There is currently no FE or BE code to provide lineage filter support + for mpox, and that's where most of the eng effort will be if we want to release + mpox lineage filtering in the future. Once the user has a way to pass those params + to filter mpox trees using lineage though, it will still be necessary to change + the config building process here so lineage filter is correctly handled for mpox + and integrates with the downstream snakemake workflow that builds the tree.""" min_date = template_args.get("filter_start_date") if min_date: # Support date expressions like "5 days ago" in our cron schedule. 
min_date = dateparser.parse(min_date).strftime("%Y-%m-%d") - if include_arguments_in_filters: - subsampling["group"][ - "min_date" - ] = f"--min-date {min_date}" # ex: --max-date 2020-01-01 - else: - subsampling["group"]["min-date"] = str(min_date) # ex: max-date: 2020-01-01 + subsampling["group"][ + "min_date" + ] = f"--min-date {min_date}" # ex: --max-date 2020-01-01 max_date = template_args.get("filter_end_date") if max_date: # Support date expressions like "5 days ago" in our cron schedule. max_date = dateparser.parse(max_date).strftime("%Y-%m-%d") - if include_arguments_in_filters: - subsampling["group"][ + subsampling["group"][ + "max_date" + ] = f"--max-date {max_date}" # ex: --max-date 2020-01-01 + if "international_serial_sampling" in subsampling: + subsampling["international_serial_sampling"][ "max_date" ] = f"--max-date {max_date}" # ex: --max-date 2020-01-01 - if "international_serial_sampling" in subsampling: - subsampling["international_serial_sampling"][ - "max_date" - ] = f"--max-date {max_date}" # ex: --max-date 2020-01-01 - else: - subsampling["group"]["max-date"] = str(max_date) # ex: max-date: 2020-01-01 - if "international_serial_sampling" in subsampling: - subsampling["international_serial_sampling"]["max-date"] = str( - max_date - ) # ex: max-date: 2020-01-01 + # Only SC2 supports lineage filtering right now. See above note for details. + LINEAGE_FIELD = "pango_lineage" pango_lineages = template_args.get("filter_pango_lineages") if pango_lineages: # Nextstrain is rather particular about the acceptable syntax for @@ -322,5 +319,5 @@ def apply_filters(config, subsampling, template_args): if old_query.endswith('"'): end_string = '"' old_query = old_query[:-1] - pango_query = " & (" + lineage_field + " in {pango_lineage})" + pango_query = " & (" + LINEAGE_FIELD + " in {pango_lineage})" subsampling["group"]["query"] = old_query + pango_query + end_string From 33e7e912bb8c8956535c211b1d1ba2640bda7d5c Mon Sep 17 00:00:00 2001 From: Vincent Selhorst-Jones Date: Thu, 5 Sep 2024 13:39:21 -0700 Subject: [PATCH 13/13] Lint roller. Again. --- .../aspen/workflows/nextstrain_run/tests/test_mpx_export.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/backend/aspen/workflows/nextstrain_run/tests/test_mpx_export.py b/src/backend/aspen/workflows/nextstrain_run/tests/test_mpx_export.py index 3be3caebf..24700123b 100644 --- a/src/backend/aspen/workflows/nextstrain_run/tests/test_mpx_export.py +++ b/src/backend/aspen/workflows/nextstrain_run/tests/test_mpx_export.py @@ -181,7 +181,8 @@ def test_overview_config_ondemand(mocker, session, postgres_database, split_clie assert subsampling_scheme["group"]["min_date"] == "--min-date 2021-04-30" assert subsampling_scheme["group"]["max_date"] == f"--max-date {max_date}" assert ( - subsampling_scheme["international_serial_sampling"]["max_date"] == f"--max-date {max_date}" + subsampling_scheme["international_serial_sampling"]["max_date"] + == f"--max-date {max_date}" ) assert subsampling_scheme["group"]["max_sequences"] == 500 assert ( @@ -507,6 +508,7 @@ def test_overview_config_country(mocker, session, postgres_database, split_clien == f'''--query "(country == '{location.country}')"''' ) + # make sure we handle quotes sanely!!! def test_string_escapes(mocker, session, postgres_database, split_client): mock_remote_db_uri(mocker, postgres_database.as_uri())
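---

For reference, a minimal sketch (not part of the patch series) of how the `--query` placeholders in the new MPX_template.yaml get interpolated by `MPXPlugin.update_config` after patch 05. The escaping helper itself is context the hunks don't show, so the backslash-escape below is an assumption inferred from the expected strings in `test_string_escapes`; the sample location/division values are hypothetical.

    # Sketch of the query interpolation that MPXPlugin.update_config performs
    # on the MPX build config. Assumes single quotes are backslash-escaped so
    # values like "Cote d'Ivoire" survive the pandas-style expression that
    # `augur filter --query` evaluates.

    def interpolate_queries(config: dict) -> dict:
        build_config = config["builds"]["aspen"]
        scheme = build_config["subsampling_scheme"]
        # Escape single quotes before substituting into the --query strings.
        escaped = {
            k: (v.replace("'", "\\'") if isinstance(v, str) else v)
            for k, v in build_config.items()
        }
        for sample in config["subsampling"][scheme].values():
            if sample.get("query"):
                sample["query"] = sample["query"].format(**escaped)
        return config

    # Hypothetical example values, mirroring the shape of the template above.
    config = {
        "builds": {
            "aspen": {
                "location": "Cote d'Ivoire",
                "division": "A'Zaz",
                "country": "Cote d'Ivoire",
                "subsampling_scheme": "NON_CONTEXTUALIZED",
            }
        },
        "subsampling": {
            "NON_CONTEXTUALIZED": {
                "group": {
                    "max_sequences": 1000,
                    "query": "--query \"(location == '{location}') & (division == '{division}')\"",
                }
            }
        },
    }

    print(interpolate_queries(config)["subsampling"]["NON_CONTEXTUALIZED"]["group"]["query"])
    # --query "(location == 'Cote d\'Ivoire') & (division == 'A\'Zaz')"

Unlike the pre-existing SC2 path, the whole `subsampling` mapping stays keyed by scheme after interpolation, which is why the updated tests index `nextstrain_config["subsampling"][tree_type.value]` rather than a flattened `subsampling` block.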