diff --git a/src/backend/aspen/workflows/nextstrain_run/build_plugins/type_plugins.py b/src/backend/aspen/workflows/nextstrain_run/build_plugins/type_plugins.py index df8fe2005..75d67d331 100644 --- a/src/backend/aspen/workflows/nextstrain_run/build_plugins/type_plugins.py +++ b/src/backend/aspen/workflows/nextstrain_run/build_plugins/type_plugins.py @@ -207,9 +207,9 @@ def run_type_config(self, config, subsampling): subsampling["international"]["mynewsetting"] = "mynewvalue" """ # Adjust group sizes if we have a lot of samples. - closest_max_sequences = 100 + closest_max_sequences = 250 other_max_sequences = 25 - if self.num_included_samples >= 100: + if self.num_included_samples >= 250: closest_max_sequences = self.num_included_samples other_max_sequences = int(ceil(self.num_included_samples / 4.0)) diff --git a/src/backend/aspen/workflows/nextstrain_run/tests/test_export.py b/src/backend/aspen/workflows/nextstrain_run/tests/test_export.py index e7cf6df97..6844decb0 100644 --- a/src/backend/aspen/workflows/nextstrain_run/tests/test_export.py +++ b/src/backend/aspen/workflows/nextstrain_run/tests/test_export.py @@ -371,7 +371,7 @@ def test_targeted_config_simple(mocker, session, postgres_database, split_client subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] # Just some placeholder sanity-checks - assert subsampling_scheme["closest"]["max_sequences"] == 100 + assert subsampling_scheme["closest"]["max_sequences"] == 250 assert subsampling_scheme["group"]["max_sequences"] == 50 assert subsampling_scheme["state"]["max_sequences"] == 50 assert subsampling_scheme["country"]["max_sequences"] == 25 @@ -430,7 +430,7 @@ def test_targeted_config_regions(mocker, session, postgres_database, split_clien ) # Just some placeholder sanity-checks - assert subsampling_scheme["closest"]["max_sequences"] == 100 + assert subsampling_scheme["closest"]["max_sequences"] == 250 assert subsampling_scheme["group"]["max_sequences"] == 50 assert 
subsampling_scheme["international"]["max_sequences"] == 100 assert len(selected.splitlines()) == 10 # 5 gisaid samples + 5 selected samples @@ -468,20 +468,20 @@ def test_targeted_config_large(mocker, session, postgres_database, split_client) mock_remote_db_uri(mocker, postgres_database.as_uri()) tree_type = TreeType.TARGETED - phylo_run = create_test_data(session, split_client, tree_type, 200, 110, 10) + phylo_run = create_test_data(session, split_client, tree_type, 400, 270, 10) sequences, selected, metadata, nextstrain_config = generate_run(phylo_run.id) subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] # Just some placeholder sanity-checks - assert subsampling_scheme["closest"]["max_sequences"] == 120 - assert subsampling_scheme["group"]["max_sequences"] == 60 - assert subsampling_scheme["state"]["max_sequences"] == 60 - assert subsampling_scheme["country"]["max_sequences"] == 30 - assert subsampling_scheme["international"]["max_sequences"] == 30 - assert len(selected.splitlines()) == 120 # 10 gisaid samples + 110 selected samples - assert len(metadata.splitlines()) == 201 # 200 samples + 1 header line - assert len(sequences.splitlines()) == 400 # 200 county samples, @2 lines each + assert subsampling_scheme["closest"]["max_sequences"] == 280 + assert subsampling_scheme["group"]["max_sequences"] == 140 + assert subsampling_scheme["state"]["max_sequences"] == 140 + assert subsampling_scheme["country"]["max_sequences"] == 70 + assert subsampling_scheme["international"]["max_sequences"] == 70 + assert len(selected.splitlines()) == 280 # 10 gisaid samples + 270 selected samples + assert len(metadata.splitlines()) == 401 # 400 samples + 1 header line + assert len(sequences.splitlines()) == 800 # 400 county samples, @2 lines each def generate_run(phylo_run_id, reset_status=False): diff --git a/src/backend/aspen/workflows/nextstrain_run/tests/test_mpx_export.py b/src/backend/aspen/workflows/nextstrain_run/tests/test_mpx_export.py index 
24700123b..6e27b3f7c 100644 --- a/src/backend/aspen/workflows/nextstrain_run/tests/test_mpx_export.py +++ b/src/backend/aspen/workflows/nextstrain_run/tests/test_mpx_export.py @@ -311,7 +311,7 @@ def test_targeted_config_simple(mocker, session, postgres_database, split_client subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] # Just some placeholder sanity-checks - assert subsampling_scheme["closest"]["max_sequences"] == 100 + assert subsampling_scheme["closest"]["max_sequences"] == 250 assert subsampling_scheme["group"]["max_sequences"] == 50 assert subsampling_scheme["state"]["max_sequences"] == 50 assert subsampling_scheme["country"]["max_sequences"] == 25 @@ -370,7 +370,7 @@ def test_targeted_config_regions(mocker, session, postgres_database, split_clien ) # Just some placeholder sanity-checks - assert subsampling_scheme["closest"]["max_sequences"] == 100 + assert subsampling_scheme["closest"]["max_sequences"] == 250 assert subsampling_scheme["group"]["max_sequences"] == 50 assert subsampling_scheme["international"]["max_sequences"] == 100 assert len(selected.splitlines()) == 10 # 5 gisaid samples + 5 selected samples @@ -411,21 +411,21 @@ def test_targeted_config_large(mocker, session, postgres_database, split_client) tree_type = TreeType.TARGETED phylo_run, location = create_test_data( - session, split_client, tree_type, 200, 110, 10 + session, split_client, tree_type, 400, 270, 10 ) sequences, selected, metadata, nextstrain_config = generate_run(phylo_run.id) subsampling_scheme = nextstrain_config["subsampling"][tree_type.value] # Just some placeholder sanity-checks - assert subsampling_scheme["closest"]["max_sequences"] == 120 - assert subsampling_scheme["group"]["max_sequences"] == 60 - assert subsampling_scheme["state"]["max_sequences"] == 60 - assert subsampling_scheme["country"]["max_sequences"] == 30 - assert subsampling_scheme["international"]["max_sequences"] == 30 - assert len(selected.splitlines()) == 120 # 10 gisaid samples + 
110 selected samples - assert len(metadata.splitlines()) == 201 # 200 samples + 1 header line - assert len(sequences.splitlines()) == 400 # 200 county samples, @2 lines each + assert subsampling_scheme["closest"]["max_sequences"] == 280 + assert subsampling_scheme["group"]["max_sequences"] == 140 + assert subsampling_scheme["state"]["max_sequences"] == 140 + assert subsampling_scheme["country"]["max_sequences"] == 70 + assert subsampling_scheme["international"]["max_sequences"] == 70 + assert len(selected.splitlines()) == 280 # 10 gisaid samples + 270 selected samples + assert len(metadata.splitlines()) == 401 # 400 samples + 1 header line + assert len(sequences.splitlines()) == 800 # 400 county samples, @2 lines each def generate_run(phylo_run_id, reset_status=False):