From 6230b6304fd6d5355f9b6e6e391bb455039638ee Mon Sep 17 00:00:00 2001
From: Joshua Allen
Date: Thu, 24 Oct 2024 13:10:27 -0500
Subject: [PATCH 1/5] Updated changelog

---
 ChangeLog.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/ChangeLog.md b/ChangeLog.md
index 55bb068a..991b4502 100644
--- a/ChangeLog.md
+++ b/ChangeLog.md
@@ -1,6 +1,11 @@
 # NEAT has a new home
 NEAT is now a part of the NCSA github and active development will continue here. Please direct issues, comments, and requests to the NCSA issue tracker. Submit pull requests here insead of the old repo.
 
+# NEAT v4.2.X
+- Several bug fixes
+- Removed bin scoring (needs to be updated, and we didn't have time, but we'll bring it back)
+- Incremental improvements
+
 # NEAT v4.2
 - After several bug fixes that constituted release 4.1 and some minor releases, we are ready to release an overhauled vesion of NEAT 4.0.
 - Removed GC bias - it had little to no effect and made implementation nearly impossible

From 651bef6d04c8947dfad43bdeb50ad0ea2f8654f6 Mon Sep 17 00:00:00 2001
From: Joshua Allen
Date: Wed, 30 Oct 2024 14:16:08 -0500
Subject: [PATCH 2/5] Fixing a bug that was causing memory to grow out of control while parsing bed file

---
 neat/read_simulator/runner.py         | 25 +++++++++++---------
 neat/read_simulator/utils/bed_func.py | 33 ++++++++++++++-------------
 2 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/neat/read_simulator/runner.py b/neat/read_simulator/runner.py
index 916b21bf..874c7ac7 100644
--- a/neat/read_simulator/runner.py
+++ b/neat/read_simulator/runner.py
@@ -177,18 +177,20 @@ def read_simulator_runner(config: str, output: str):
 
     # TODO check into SeqIO.index_db()
     reference_index = SeqIO.index(str(options.reference), "fasta")
+    reference_keys_with_lens = {key: len(value) for key, value in reference_index.items()}
     _LOG.debug("Reference file indexed.")
 
     if _LOG.getEffectiveLevel() < 20:
         count = 0
-        for contig in reference_index:
-            count += len(reference_index[contig])
+        for contig in reference_keys_with_lens:
+            count += reference_keys_with_lens[contig]
         _LOG.debug(f"Length of reference: {count / 1_000_000:.2f} Mb")
 
-    input_variants_dict = {x: ContigVariants() for x in reference_index}
+    input_variants_dict = {x: ContigVariants() for x in reference_keys_with_lens}
     if options.include_vcf:
         _LOG.info(f"Reading input VCF: {options.include_vcf}.")
         if options.cancer:
+            # TODO Check if we need full ref index or just keys and lens
             sample_names = parse_input_vcf(
                 input_variants_dict,
                 options.include_vcf,
@@ -202,6 +204,7 @@ def read_simulator_runner(config: str, output: str):
             tumor_ind = sample_names['tumor_sample']
             normal_ind = sample_names['normal_sample']
         else:
+            # TODO Check if we need full ref index or just keys and lens
             sample_names = parse_input_vcf(
                 input_variants_dict,
                 options.include_vcf,
@@ -224,7 +227,7 @@ def read_simulator_runner(config: str, output: str):
         target_regions_dict,
         discard_regions_dict,
         mutation_rate_dict
-    ) = parse_beds(options, reference_index, mut_model.avg_mut_rate)
+    ) = parse_beds(options, reference_keys_with_lens, mut_model.avg_mut_rate)
 
     if any(bed_files):
         _LOG.debug("Finished reading input beds.")
@@ -234,7 +237,7 @@ def read_simulator_runner(config: str, output: str):
     if options.produce_bam:
         # This is a dictionary that is the list of the contigs and the length of each.
         # This information will be needed later to create the bam header.
-        bam_header = {key: len(reference_index[key]) for key in reference_index}
+        bam_header = reference_keys_with_lens
 
     # Creates files and sets up objects for files that can be written to as needed.
     # Also creates headers for bam and vcf.
@@ -259,7 +262,7 @@ def read_simulator_runner(config: str, output: str):
     """
     _LOG.info("Beginning simulation.")
 
-    breaks = find_file_breaks(reference_index)
+    breaks = find_file_breaks(reference_keys_with_lens)
 
     _LOG.debug("Input reference partitioned for run")
 
@@ -345,20 +348,20 @@ def read_simulator_runner(config: str, output: str):
 
     if options.produce_bam:
         _LOG.info(f"Outputting golden bam file: {str(output_file_writer.bam_fn)}")
-        contig_list = list(reference_index)
+        contig_list = list(reference_keys_with_lens)
         contigs_by_index = {contig_list[n]: n for n in range(len(contig_list))}
         output_file_writer.output_bam_file(sam_reads_files, contigs_by_index, options.read_len)
 
 
-def find_file_breaks(reference_index: dict) -> dict:
+def find_file_breaks(reference_keys_with_lens: dict) -> dict:
     """
     Returns a dictionary with the chromosomes as keys, which is the start of building the chromosome map
 
-    :param reference_index: a dictionary with chromosome keys and sequence values
+    :param reference_keys_with_lens: a dictionary with chromosome keys and sequence values
     :return: a dictionary containing the chromosomes as keys and either "all" for values, or a list of indices
     """
     partitions = {}
-    for contig in reference_index.keys():
-        partitions[contig] = [(0, len(reference_index[contig]))]
+    for contig in reference_keys_with_lens:
+        partitions[contig] = [(0, reference_keys_with_lens[contig])]
 
     return partitions
diff --git a/neat/read_simulator/utils/bed_func.py b/neat/read_simulator/utils/bed_func.py
index c0c43ff5..99d1171e 100644
--- a/neat/read_simulator/utils/bed_func.py
+++ b/neat/read_simulator/utils/bed_func.py
@@ -19,12 +19,12 @@
 _LOG = logging.getLogger(__name__)
 
 
-def parse_beds(options: Options, reference_dict: _IndexedSeqFileDict, average_mutation_rate: float) -> list:
+def parse_beds(options: Options, ref_keys_counts: dict, average_mutation_rate: float) -> list:
     """
     This single function parses the three possible bed file types for NEAT.
 
     :param options: The options object for this run
-    :param reference_dict: The reference dict generated by SeqIO for this run
+    :param ref_keys_counts: A dictionary containing the reference keys and the associated lengths.
     :param average_mutation_rate: The average mutation rate from the model or user input for this run
     :return: target_dict, discard_dict, mutation_rate_dict
     """
@@ -57,13 +57,13 @@ def parse_beds(options: Options, reference_dict: _IndexedSeqFileDict, average_mu
     for i in range(len(bed_files)):
         temp_bed_dict = parse_single_bed(
             bed_files[i],
-            reference_dict,
+            ref_keys_counts,
             processing_structure[i]
         )
 
         # If there was a bed this function will fill in any gaps the bed might have missed, so we have a complete map
         # of each chromosome. The uniform case in parse_single_bed handles this in cases of no input bed.
         if processing_structure[i][2]:
-            final_dict = fill_out_bed_dict(reference_dict, temp_bed_dict, processing_structure[i])
+            final_dict = fill_out_bed_dict(ref_keys_counts, temp_bed_dict, processing_structure[i])
         else:
             final_dict = temp_bed_dict
@@ -73,7 +73,7 @@ def parse_beds(options: Options, reference_dict: _IndexedSeqFileDict, average_mu
 
 
 def parse_single_bed(input_bed: str,
-                     reference_dictionary: _IndexedSeqFileDict,
+                     ref_keys_counts: dict,
                      process_key: tuple[str, float, bool],
                      ) -> dict:
     """
@@ -82,12 +82,13 @@ def parse_single_bed(input_bed: str,
     then we will also import the mutation data.
 
     :param input_bed: A bed file containing the regions of interest
-    :param reference_dictionary: A list of chromosomes to check, along with their reads and lengths
+    :param ref_keys_counts: A dict of chromosomes to check (key), along with their lengths (value)
     :param process_key: Indicates which bed we are processing and the factor we need to process it.
     :return: a dictionary of chromosomes: [(pos1, pos2, factor), etc...]
     """
-    ret_dict = {x: [] for x in reference_dictionary}
+    ret_dict = {x: [] for x in ref_keys_counts}
     in_bed_only = []
+    key_pool = list(ref_keys_counts.keys())
 
     # process_key[2] is True when there is an input bed, False otherwise.
     if process_key[2]:
@@ -107,7 +108,7 @@ def parse_single_bed(input_bed: str,
                        sys.exit(1)
                    # Bed file chromosome names must match the reference.
                    try:
-                       assert my_chr in reference_dictionary
+                       assert my_chr in key_pool
                    except AssertionError:
                        in_bed_only.append(my_chr)
                        _LOG.warning(
@@ -149,7 +150,7 @@ def parse_single_bed(input_bed: str,
                        ret_dict[my_chr].append((int(pos1), int(pos2), True))
 
        # some validation
-       in_ref_only = [k for k in reference_dictionary if k not in ret_dict]
+       in_ref_only = [k for k in ref_keys_counts if k not in ret_dict]
        if in_ref_only:
            _LOG.warning(f'Warning: Reference contains sequences not found in BED file {input_bed}. '
                         'These chromosomes will be omitted from the outputs.')
@@ -161,13 +162,13 @@ def parse_single_bed(input_bed: str,
            _LOG.debug(f'Regions ignored: {list(set(in_bed_only))}')
 
    else:
-       for my_chr in reference_dictionary.keys():
-           ret_dict[my_chr].append((0, len(reference_dictionary[my_chr]), process_key[1]))
+       for my_chr in ref_keys_counts:
+           ret_dict[my_chr].append((0, ref_keys_counts[my_chr], process_key[1]))
 
    return ret_dict
 
 
-def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,
+def fill_out_bed_dict(ref_keys_counts: dict,
                      region_dict: dict | None,
                      default_factor: tuple[str, bool, bool],
                      ) -> dict:
@@ -177,9 +178,9 @@ def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,
 
    The input to this function is the dict for a single chromosome.
 
-   :param ref_dict: The reference dictionary for the run. Needed to calulate max values. We shouldn't need to access
-       the data it refers to.
+   :param ref_keys_counts: A dictionary with the keys as the contigs from the reference dictionary, and the values as
+       the length of the contig. Needed to calculate max values.
-   :param region_dict: A dict based on the input from the bed file, with keys being all the chronmosomes
+   :param region_dict: A dict based on the input from the bed file, with keys being all the chromosomes
       present in the reference.
   :param default_factor: This is a tuple representing the type and any extra information. The extra info is for mutation rates.
       If beds were input, then these values come from the bed, else they are set to one value
@@ -191,7 +192,7 @@ def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,
 
    for contig in region_dict:
        regions = []
-       max_value = len(ref_dict[contig])
+       max_value = ref_keys_counts[contig]
        start = 0
 
        # The trivial case of no data yet for this contig the region gets filled out as 0 to len(contig_sequence)

From 19ddfee4643d1be8e5ae7b9be7333493ad33dc1b Mon Sep 17 00:00:00 2001
From: Joshua Allen
Date: Wed, 30 Oct 2024 14:27:42 -0500
Subject: [PATCH 3/5] temporary patch when NEAT has a bad alignment (usually related to N-regions)

---
 neat/read_simulator/utils/read.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/neat/read_simulator/utils/read.py b/neat/read_simulator/utils/read.py
index ae80d8d8..d4ab9c08 100644
--- a/neat/read_simulator/utils/read.py
+++ b/neat/read_simulator/utils/read.py
@@ -453,10 +453,12 @@ def make_cigar(self):
         cig_string, cig_count, curr_char, cig_length = self.align_seqs()
         if cig_length < self.run_read_length:
             _LOG.warning("Poor alignment, trying reversed")
-            cig_string2, cig_count2, curr_char2, cig_length = self.align_seqs()
-            if cig_length < self.run_read_length:
-                _LOG.error("Alignment still not working")
-                sys.exit(1)
+            cig_string2, cig_count2, curr_char2, cig_length2 = self.align_seqs()
+            if cig_length2 < cig_length:
+                cig_length = cig_length2
+            while cig_length < self.run_read_length:
+                cig_string += "M"
+                cig_length += 1
 
         # append the final section as we return
         return cig_string + str(cig_count) + curr_char

From 78ef19d933b4c9a766c5ac36e46c8df077459b25 Mon Sep 17 00:00:00 2001
From: Joshua Allen
Date: Sun, 12 Jan 2025 19:14:11 -0600
Subject: [PATCH 4/5] Fixing bam output

---
 environment.yml                                 | 6 +++---
 neat/read_simulator/utils/output_file_writer.py | 5 +++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/environment.yml b/environment.yml
index d9ef1e86..56e6783f 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,8 +1,8 @@
 name: neat
 channels:
-  - bioconda
-  - conda-forge
-  - defaults
+  - conda-forge
+  - bioconda
+
 dependencies:
   - python=3.10.*
   - biopython=1.79
diff --git a/neat/read_simulator/utils/output_file_writer.py b/neat/read_simulator/utils/output_file_writer.py
index f35da461..3309ff3f 100755
--- a/neat/read_simulator/utils/output_file_writer.py
+++ b/neat/read_simulator/utils/output_file_writer.py
@@ -10,6 +10,7 @@
 
 import gzip
 import re
+import shutil
 import time
 from struct import pack
 import logging
@@ -101,7 +102,7 @@ def __init__(self,
             files_to_write.extend(self.fastq_fns)
         elif self.write_fastq:
             self.fastq1_fn = options.output.parent / f'{options.output.stem}.fastq.gz'
-            self.fastq2_fn = options.output.parent / "dummy.fastq.gz"
+            self.fastq2_fn = self.temporary_dir / "dummy.fastq.gz"
             self.fastq_fns = [self.fastq1_fn, self.fastq2_fn]
             files_to_write.extend(self.fastq_fns)
         if self.write_bam:
@@ -115,7 +116,7 @@ def __init__(self,
         self.files_to_write = files_to_write
 
         # Create files as applicable
-        for file in [x for x in self.files_to_write if x != "dummy.fastq.gz"]:
+        for file in [x for x in self.files_to_write if x.name != "dummy.fastq.gz"]:
             validate_output_path(file, True, options.overwrite_output)
 
         mode = 'xt'

From ffa45a116a5fb4b495348e6eef580bee38e857c2 Mon Sep 17 00:00:00 2001
From: Joshua Allen
Date: Mon, 20 Jan 2025 16:37:32 -0600
Subject: [PATCH 5/5] Fixing bug in error models

---
 neat/models/error_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/neat/models/error_models.py b/neat/models/error_models.py
index 66085430..29ac4554 100644
--- a/neat/models/error_models.py
+++ b/neat/models/error_models.py
@@ -189,7 +189,7 @@ def get_sequencing_errors(
         if self.average_error == 0:
             return introduced_errors
         else:
-            for i in range(self.read_length):
+            for i in range(len(quality_scores)):
                 if rng.random() < quality_score_error_rate[quality_scores[i]]:
                     error_indexes.append(i)
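
The sketches below illustrate the ideas behind patches 2, 4, and 5. They are not part of the patch series itself; file names, sample values, and helper names in them are illustrative only.

Patch 2 stops passing the SeqIO index into helpers that only need contig names and lengths, so the indexed records never travel into bed parsing, BAM-header construction, or partitioning. A minimal sketch of that pattern, assuming a FASTA named reference.fasta:

    from Bio import SeqIO

    # Index the FASTA lazily; records are only pulled from disk when accessed.
    reference_index = SeqIO.index("reference.fasta", "fasta")

    # A small {contig_name: length} dict is all the downstream steps need,
    # so only this dict is handed onward instead of the full index.
    reference_keys_with_lens = {name: len(record) for name, record in reference_index.items()}

    def find_file_breaks(contig_lengths: dict) -> dict:
        """Trivial partitioning: one (start, end) span per contig."""
        return {contig: [(0, length)] for contig, length in contig_lengths.items()}

    print(find_file_breaks(reference_keys_with_lens))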
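One of the patch 4 changes hinges on pathlib semantics: the entries of files_to_write are Path objects, so comparing them to the bare string "dummy.fastq.gz" never matches and the dummy entry was never excluded from validation. Filtering on the final path component does what was intended; the paths below are made up:

    from pathlib import Path

    files_to_write = [Path("/tmp/out_r1.fastq.gz"), Path("/tmp/dummy.fastq.gz")]

    # A Path never compares equal to a bare filename string, so the old filter
    # `x != "dummy.fastq.gz"` kept everything; filtering on x.name skips the dummy.
    real_outputs = [x for x in files_to_write if x.name != "dummy.fastq.gz"]
    print(real_outputs)  # prints [PosixPath('/tmp/out_r1.fastq.gz')] on POSIX systems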
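Patch 5 makes the per-base error loop follow the quality vector that was actually generated rather than the configured read length, so the indexing stays safe when the two lengths disagree. A self-contained sketch of that loop, with an assumed Phred-to-error-rate table and made-up scores:

    import numpy as np

    rng = np.random.default_rng(42)

    # Assumed lookup: error probability for each Phred quality score.
    quality_score_error_rate = {q: 10 ** (-q / 10) for q in range(0, 43)}

    def pick_error_positions(quality_scores, read_length):
        """Sample candidate error positions from the actual quality vector.

        read_length is only the nominal length; iterating over
        len(quality_scores) avoids an IndexError if the vector is shorter.
        """
        error_indexes = []
        for i in range(len(quality_scores)):
            if rng.random() < quality_score_error_rate[quality_scores[i]]:
                error_indexes.append(i)
        return error_indexes

    # Quality vector one base shorter than the nominal read length.
    print(pick_error_positions([30, 32, 35, 12, 2, 38, 40], read_length=8))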