From b5f4cd278daa1e174fafa824d5cfa04cbddab4b2 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Fri, 3 Nov 2023 21:16:58 +0100 Subject: [PATCH] feat: add dataset info to output json [wip] --- .../nextclade-cli/src/cli/nextclade_loop.rs | 1 + .../src/cli/nextclade_ordered_writer.rs | 9 +++++- .../src/dataset/dataset_download.rs | 4 ++- packages_rs/nextclade/src/io/dataset.rs | 30 +++++++++++++++++++ packages_rs/nextclade/src/io/results_json.rs | 20 ++++++++++--- .../nextclade/src/run/nextclade_wasm.rs | 8 +++++ 6 files changed, 66 insertions(+), 6 deletions(-) diff --git a/packages_rs/nextclade-cli/src/cli/nextclade_loop.rs b/packages_rs/nextclade-cli/src/cli/nextclade_loop.rs index 81dfc9b18..fa02a1bad 100644 --- a/packages_rs/nextclade-cli/src/cli/nextclade_loop.rs +++ b/packages_rs/nextclade-cli/src/cli/nextclade_loop.rs @@ -124,6 +124,7 @@ pub fn nextclade_run(run_args: NextcladeRunArgs) -> Result<(), Report> { } = nextclade.get_initial_data(); let mut output_writer = NextcladeOrderedWriter::new( + &inputs.dataset_info, &nextclade.gene_map, clade_node_attr_key_descs, phenotype_attr_descs, diff --git a/packages_rs/nextclade-cli/src/cli/nextclade_ordered_writer.rs b/packages_rs/nextclade-cli/src/cli/nextclade_ordered_writer.rs index 6a145b3d0..fd0ee98f9 100644 --- a/packages_rs/nextclade-cli/src/cli/nextclade_ordered_writer.rs +++ b/packages_rs/nextclade-cli/src/cli/nextclade_ordered_writer.rs @@ -6,6 +6,7 @@ use log::{info, warn}; use nextclade::alphabet::nuc::from_nuc_seq; use nextclade::analyze::virus_properties::PhenotypeAttrDesc; use nextclade::gene::gene_map::GeneMap; +use nextclade::io::dataset::DatasetInfoShort; use nextclade::io::fasta::{FastaPeptideWriter, FastaRecord, FastaWriter}; use nextclade::io::ndjson::NdjsonFileWriter; use nextclade::io::nextclade_csv::{CsvColumnConfig, NextcladeResultsCsvFileWriter}; @@ -35,6 +36,7 @@ pub struct NextcladeOrderedWriter { impl NextcladeOrderedWriter { pub fn new( + dataset_info: &DatasetInfoShort, gene_map: &GeneMap, clade_node_attr_key_descs: &[CladeNodeAttrKeyDesc], phenotype_attr_key_desc: &[PhenotypeAttrDesc], @@ -50,7 +52,12 @@ impl NextcladeOrderedWriter { .map_ref_fallible(|output_translations| FastaPeptideWriter::new(gene_map, output_translations))?; let output_json_writer = output_params.output_json.map_ref_fallible(|output_json| { - ResultsJsonWriter::new(output_json, clade_node_attr_key_descs, phenotype_attr_key_desc) + ResultsJsonWriter::new( + dataset_info, + output_json, + clade_node_attr_key_descs, + phenotype_attr_key_desc, + ) })?; let output_ndjson_writer = output_params.output_ndjson.map_ref_fallible(NdjsonFileWriter::new)?; diff --git a/packages_rs/nextclade-cli/src/dataset/dataset_download.rs b/packages_rs/nextclade-cli/src/dataset/dataset_download.rs index fe9920f5c..2eae57f17 100644 --- a/packages_rs/nextclade-cli/src/dataset/dataset_download.rs +++ b/packages_rs/nextclade-cli/src/dataset/dataset_download.rs @@ -6,7 +6,9 @@ use itertools::Itertools; use log::LevelFilter; use nextclade::analyze::virus_properties::{LabelledMutationsConfig, VirusProperties}; use nextclade::gene::gene_map::{filter_gene_map, GeneMap}; -use nextclade::io::dataset::{Dataset, DatasetFiles, DatasetMeta, DatasetsIndexJson}; +use nextclade::io::dataset::{ + Dataset, DatasetAttributeValue, DatasetAttributes, DatasetFiles, DatasetInfoShort, DatasetsIndexJson, DatasetMeta, +}; use nextclade::io::fasta::{read_one_fasta, read_one_fasta_str}; use nextclade::io::file::create_file_or_stdout; use nextclade::io::fs::{ensure_dir, has_extension, read_file_to_string}; diff --git a/packages_rs/nextclade/src/io/dataset.rs b/packages_rs/nextclade/src/io/dataset.rs index e1cc3c645..5dcc21264 100644 --- a/packages_rs/nextclade/src/io/dataset.rs +++ b/packages_rs/nextclade/src/io/dataset.rs @@ -79,6 +79,15 @@ pub struct Dataset { } impl Dataset { + pub fn short_info(&self) -> DatasetInfoShort { + DatasetInfoShort { + path: self.path.clone(), + version: self.version.clone(), + server: None, + other: serde_json::Value::default(), + } + } + pub fn name(&self) -> Option<&str> { self.attributes.get("name").and_then(AnyType::as_str_maybe) } @@ -331,3 +340,24 @@ pub struct MinimizerIndexVersion { #[serde(flatten)] pub other: serde_json::Value, } + +#[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct DatasetInfoShort { + pub path: String, + + #[serde(default, skip_serializing_if = "DatasetVersion::is_empty")] + pub version: DatasetVersion, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub server: Option, + + #[serde(flatten)] + pub other: serde_json::Value, +} + +impl DatasetInfoShort { + pub fn from_str(s: impl AsRef) -> Result { + json_parse(s) + } +} diff --git a/packages_rs/nextclade/src/io/results_json.rs b/packages_rs/nextclade/src/io/results_json.rs index 61ef82ab9..2264d59c3 100644 --- a/packages_rs/nextclade/src/io/results_json.rs +++ b/packages_rs/nextclade/src/io/results_json.rs @@ -1,4 +1,5 @@ use crate::analyze::virus_properties::PhenotypeAttrDesc; +use crate::io::dataset::DatasetInfoShort; use crate::io::json::{json_stringify, json_write, JsonPretty}; use crate::io::ndjson::NdjsonWriter; use crate::tree::tree::CladeNodeAttrKeyDesc; @@ -6,10 +7,10 @@ use crate::types::outputs::{ combine_outputs_and_errors_sorted, NextcladeErrorOutputs, NextcladeOutputOrError, NextcladeOutputs, }; use crate::utils::datetime::date_iso_now; +use crate::utils::info::this_package_version_str; use eyre::Report; use serde::{Deserialize, Serialize}; use std::path::{Path, PathBuf}; -use crate::utils::info::this_package_version_str; #[derive(Serialize, Deserialize, schemars::JsonSchema)] #[serde(rename_all = "camelCase")] @@ -21,6 +22,8 @@ pub struct ResultsJson { #[serde(skip_serializing_if = "Option::is_none")] pub nextclade_web_version: Option, + pub dataset_info: DatasetInfoShort, + pub created_at: String, pub clade_node_attr_keys: Vec, @@ -33,11 +36,16 @@ pub struct ResultsJson { } impl ResultsJson { - pub fn new(clade_node_attrs: &[CladeNodeAttrKeyDesc], phenotype_attr_keys: &[PhenotypeAttrDesc]) -> Self { + pub fn new( + dataset_info: &DatasetInfoShort, + clade_node_attrs: &[CladeNodeAttrKeyDesc], + phenotype_attr_keys: &[PhenotypeAttrDesc], + ) -> Self { Self { schema_version: "3.0.0".to_owned(), nextclade_algo_version: this_package_version_str().to_owned(), nextclade_web_version: None, + dataset_info: dataset_info.clone(), created_at: date_iso_now(), clade_node_attr_keys: clade_node_attrs.to_vec(), phenotype_attr_keys: phenotype_attr_keys.to_vec(), @@ -47,13 +55,14 @@ impl ResultsJson { } pub fn from_outputs( + dataset_info: &DatasetInfoShort, outputs: &[NextcladeOutputs], errors: &[NextcladeErrorOutputs], clade_node_attrs: &[CladeNodeAttrKeyDesc], phenotype_attr_keys: &[PhenotypeAttrDesc], nextclade_web_version: &Option, ) -> Self { - let mut this = Self::new(clade_node_attrs, phenotype_attr_keys); + let mut this = Self::new(dataset_info, clade_node_attrs, phenotype_attr_keys); this.results = outputs.to_vec(); this.errors = errors.to_vec(); this.nextclade_web_version = nextclade_web_version.clone(); @@ -68,13 +77,14 @@ pub struct ResultsJsonWriter { impl ResultsJsonWriter { pub fn new( + dataset_info: &DatasetInfoShort, filepath: impl AsRef, clade_node_attrs: &[CladeNodeAttrKeyDesc], phenotype_attr_keys: &[PhenotypeAttrDesc], ) -> Result { Ok(Self { filepath: filepath.as_ref().to_owned(), - result: ResultsJson::new(clade_node_attrs, phenotype_attr_keys), + result: ResultsJson::new(dataset_info, clade_node_attrs, phenotype_attr_keys), }) } @@ -103,6 +113,7 @@ impl Drop for ResultsJsonWriter { } pub fn results_to_json_string( + dataset_info: &DatasetInfoShort, outputs: &[NextcladeOutputs], errors: &[NextcladeErrorOutputs], clade_node_attrs: &[CladeNodeAttrKeyDesc], @@ -110,6 +121,7 @@ pub fn results_to_json_string( nextclade_web_version: &Option, ) -> Result { let results_json = ResultsJson::from_outputs( + dataset_info, outputs, errors, clade_node_attrs, diff --git a/packages_rs/nextclade/src/run/nextclade_wasm.rs b/packages_rs/nextclade/src/run/nextclade_wasm.rs index beaa67338..bb90ad359 100644 --- a/packages_rs/nextclade/src/run/nextclade_wasm.rs +++ b/packages_rs/nextclade/src/run/nextclade_wasm.rs @@ -8,6 +8,7 @@ use crate::analyze::phenotype::get_phenotype_attr_descs; use crate::analyze::virus_properties::{AaMotifsDesc, PhenotypeAttrDesc, VirusProperties}; use crate::gene::gene_map::GeneMap; use crate::graph::graph::{convert_auspice_tree_to_graph, convert_graph_to_auspice_tree}; +use crate::io::dataset::DatasetInfoShort; use crate::io::fasta::{read_one_fasta_str, FastaRecord}; use crate::io::nextclade_csv::CsvColumnConfig; use crate::io::nwk_writer::convert_graph_to_nwk_string; @@ -22,6 +23,7 @@ use crate::types::outputs::NextcladeOutputs; use eyre::{Report, WrapErr}; use itertools::Itertools; use schemars::JsonSchema; +use serde::__private::de::IdentifierDeserializer; use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; use std::str::FromStr; @@ -34,6 +36,7 @@ pub struct NextcladeParams { pub gene_map: GeneMap, pub tree: Option, pub virus_properties: VirusProperties, + pub dataset_info: DatasetInfoShort, } impl NextcladeParams { @@ -52,11 +55,14 @@ impl NextcladeParams { |gene_map| GeneMap::from_str(gene_map).wrap_err("When parsing genome annotation"), )?; + let dataset_info = DatasetInfoShort::from_str(&raw.dataset_info).wrap_err("When parsing dataset short info")?; + Ok(Self { ref_record, gene_map, tree, virus_properties, + dataset_info, }) } } @@ -69,6 +75,7 @@ pub struct NextcladeParamsRaw { pub gene_map: Option, pub tree: Option, pub virus_properties: String, + pub dataset_info: String, } #[derive(Clone, Debug, Serialize, Deserialize, schemars::JsonSchema)] @@ -158,6 +165,7 @@ impl Nextclade { gene_map, tree, virus_properties, + dataset_info, } = inputs; let params = NextcladeInputParams::from_optional(params, &virus_properties)?;