Skip to content

Commit

Permalink
Bump up number of closest sequences for targeted trees (chanzuckerber…
Browse files Browse the repository at this point in the history
…g#1711)

* Update type_plugins.py

* Update type_plugins.py

* Update test_export.py

* Update test_mpx_export.py

* Update test_export.py
  • Loading branch information
danrlu authored and thanhleviet committed Sep 12, 2024
1 parent a488731 commit abbc407
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -207,9 +207,9 @@ def run_type_config(self, config, subsampling):
subsampling["international"]["mynewsetting"] = "mynewvalue"
"""
# Adjust group sizes if we have a lot of samples.
closest_max_sequences = 100
closest_max_sequences = 250
other_max_sequences = 25
if self.num_included_samples >= 100:
if self.num_included_samples >= 250:
closest_max_sequences = self.num_included_samples
other_max_sequences = int(ceil(self.num_included_samples / 4.0))

Expand Down
22 changes: 11 additions & 11 deletions src/backend/aspen/workflows/nextstrain_run/tests/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,7 @@ def test_targeted_config_simple(mocker, session, postgres_database, split_client
subsampling_scheme = nextstrain_config["subsampling"][tree_type.value]

# Just some placeholder sanity-checks
assert subsampling_scheme["closest"]["max_sequences"] == 100
assert subsampling_scheme["closest"]["max_sequences"] == 250
assert subsampling_scheme["group"]["max_sequences"] == 50
assert subsampling_scheme["state"]["max_sequences"] == 50
assert subsampling_scheme["country"]["max_sequences"] == 25
Expand Down Expand Up @@ -430,7 +430,7 @@ def test_targeted_config_regions(mocker, session, postgres_database, split_clien
)

# Just some placeholder sanity-checks
assert subsampling_scheme["closest"]["max_sequences"] == 100
assert subsampling_scheme["closest"]["max_sequences"] == 250
assert subsampling_scheme["group"]["max_sequences"] == 50
assert subsampling_scheme["international"]["max_sequences"] == 100
assert len(selected.splitlines()) == 10 # 5 gisaid samples + 5 selected samples
Expand Down Expand Up @@ -468,20 +468,20 @@ def test_targeted_config_large(mocker, session, postgres_database, split_client)
mock_remote_db_uri(mocker, postgres_database.as_uri())

tree_type = TreeType.TARGETED
phylo_run = create_test_data(session, split_client, tree_type, 200, 110, 10)
phylo_run = create_test_data(session, split_client, tree_type, 400, 270, 10)
sequences, selected, metadata, nextstrain_config = generate_run(phylo_run.id)

subsampling_scheme = nextstrain_config["subsampling"][tree_type.value]

# Just some placeholder sanity-checks
assert subsampling_scheme["closest"]["max_sequences"] == 120
assert subsampling_scheme["group"]["max_sequences"] == 60
assert subsampling_scheme["state"]["max_sequences"] == 60
assert subsampling_scheme["country"]["max_sequences"] == 30
assert subsampling_scheme["international"]["max_sequences"] == 30
assert len(selected.splitlines()) == 120 # 10 gisaid samples + 110 selected samples
assert len(metadata.splitlines()) == 201 # 200 samples + 1 header line
assert len(sequences.splitlines()) == 400 # 200 county samples, @2 lines each
assert subsampling_scheme["closest"]["max_sequences"] == 280
assert subsampling_scheme["group"]["max_sequences"] == 140
assert subsampling_scheme["state"]["max_sequences"] == 140
assert subsampling_scheme["country"]["max_sequences"] == 70
assert subsampling_scheme["international"]["max_sequences"] == 70
assert len(selected.splitlines()) == 280 # 10 gisaid samples + 270 selected samples
assert len(metadata.splitlines()) == 401 # 400 samples + 1 header line
assert len(sequences.splitlines()) == 800 # 400 county samples, @2 lines each


def generate_run(phylo_run_id, reset_status=False):
Expand Down
22 changes: 11 additions & 11 deletions src/backend/aspen/workflows/nextstrain_run/tests/test_mpx_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,7 @@ def test_targeted_config_simple(mocker, session, postgres_database, split_client
subsampling_scheme = nextstrain_config["subsampling"][tree_type.value]

# Just some placeholder sanity-checks
assert subsampling_scheme["closest"]["max_sequences"] == 100
assert subsampling_scheme["closest"]["max_sequences"] == 250
assert subsampling_scheme["group"]["max_sequences"] == 50
assert subsampling_scheme["state"]["max_sequences"] == 50
assert subsampling_scheme["country"]["max_sequences"] == 25
Expand Down Expand Up @@ -370,7 +370,7 @@ def test_targeted_config_regions(mocker, session, postgres_database, split_clien
)

# Just some placeholder sanity-checks
assert subsampling_scheme["closest"]["max_sequences"] == 100
assert subsampling_scheme["closest"]["max_sequences"] == 250
assert subsampling_scheme["group"]["max_sequences"] == 50
assert subsampling_scheme["international"]["max_sequences"] == 100
assert len(selected.splitlines()) == 10 # 5 gisaid samples + 5 selected samples
Expand Down Expand Up @@ -411,21 +411,21 @@ def test_targeted_config_large(mocker, session, postgres_database, split_client)

tree_type = TreeType.TARGETED
phylo_run, location = create_test_data(
session, split_client, tree_type, 200, 110, 10
session, split_client, tree_type, 400, 270, 10
)
sequences, selected, metadata, nextstrain_config = generate_run(phylo_run.id)

subsampling_scheme = nextstrain_config["subsampling"][tree_type.value]

# Just some placeholder sanity-checks
assert subsampling_scheme["closest"]["max_sequences"] == 120
assert subsampling_scheme["group"]["max_sequences"] == 60
assert subsampling_scheme["state"]["max_sequences"] == 60
assert subsampling_scheme["country"]["max_sequences"] == 30
assert subsampling_scheme["international"]["max_sequences"] == 30
assert len(selected.splitlines()) == 120 # 10 gisaid samples + 110 selected samples
assert len(metadata.splitlines()) == 201 # 200 samples + 1 header line
assert len(sequences.splitlines()) == 400 # 200 county samples, @2 lines each
assert subsampling_scheme["closest"]["max_sequences"] == 280
assert subsampling_scheme["group"]["max_sequences"] == 140
assert subsampling_scheme["state"]["max_sequences"] == 140
assert subsampling_scheme["country"]["max_sequences"] == 70
assert subsampling_scheme["international"]["max_sequences"] == 70
assert len(selected.splitlines()) == 280 # 10 gisaid samples + 270 selected samples
assert len(metadata.splitlines()) == 401 # 400 samples + 1 header line
assert len(sequences.splitlines()) == 800 # 400 county samples, @2 lines each


def generate_run(phylo_run_id, reset_status=False):
Expand Down

0 comments on commit abbc407

Please sign in to comment.