130 neat crashes with target bed file 500 lines #131

Merged
ChangeLog.md: 5 additions & 0 deletions
@@ -1,6 +1,11 @@
# NEAT has a new home
NEAT is now a part of the NCSA GitHub and active development will continue here. Please direct issues, comments, and requests to the NCSA issue tracker. Submit pull requests here instead of the old repo.

# NEAT v4.2.X
- Several bug fixes
- Removed bin scoring (needs to be updated, and we didn't have time, but we'll bring it back)
- Incremental improvements

# NEAT v4.2
- After several bug fixes that constituted release 4.1 and some minor releases, we are ready to release an overhauled version of NEAT 4.0.
- Removed GC bias - it had little to no effect and made implementation nearly impossible
neat/read_simulator/runner.py: 14 additions & 11 deletions
@@ -177,18 +177,20 @@ def read_simulator_runner(config: str, output: str):

# TODO check into SeqIO.index_db()
reference_index = SeqIO.index(str(options.reference), "fasta")
+reference_keys_with_lens = {key: len(value) for key, value in reference_index.items()}
_LOG.debug("Reference file indexed.")

if _LOG.getEffectiveLevel() < 20:
count = 0
-for contig in reference_index:
-count += len(reference_index[contig])
+for contig in reference_keys_with_lens:
+count += reference_keys_with_lens[contig]
_LOG.debug(f"Length of reference: {count / 1_000_000:.2f} Mb")

-input_variants_dict = {x: ContigVariants() for x in reference_index}
+input_variants_dict = {x: ContigVariants() for x in reference_keys_with_lens}
if options.include_vcf:
_LOG.info(f"Reading input VCF: {options.include_vcf}.")
if options.cancer:
+# TODO Check if we need full ref index or just keys and lens
sample_names = parse_input_vcf(
input_variants_dict,
options.include_vcf,
@@ -202,6 +204,7 @@ def read_simulator_runner(config: str, output: str):
tumor_ind = sample_names['tumor_sample']
normal_ind = sample_names['normal_sample']
else:
+# TODO Check if we need full ref index or just keys and lens
sample_names = parse_input_vcf(
input_variants_dict,
options.include_vcf,
@@ -224,7 +227,7 @@ def read_simulator_runner(config: str, output: str):
target_regions_dict,
discard_regions_dict,
mutation_rate_dict
-) = parse_beds(options, reference_index, mut_model.avg_mut_rate)
+) = parse_beds(options, reference_keys_with_lens, mut_model.avg_mut_rate)

if any(bed_files):
_LOG.debug("Finished reading input beds.")
@@ -234,7 +237,7 @@ def read_simulator_runner(config: str, output: str):
if options.produce_bam:
# This is a dictionary that is the list of the contigs and the length of each.
# This information will be needed later to create the bam header.
-bam_header = {key: len(reference_index[key]) for key in reference_index}
+bam_header = reference_keys_with_lens

# Creates files and sets up objects for files that can be written to as needed.
# Also creates headers for bam and vcf.
@@ -259,7 +262,7 @@ def read_simulator_runner(config: str, output: str):
"""
_LOG.info("Beginning simulation.")

-breaks = find_file_breaks(reference_index)
+breaks = find_file_breaks(reference_keys_with_lens)

_LOG.debug("Input reference partitioned for run")

@@ -345,20 +348,20 @@ def read_simulator_runner(config: str, output: str):

if options.produce_bam:
_LOG.info(f"Outputting golden bam file: {str(output_file_writer.bam_fn)}")
-contig_list = list(reference_index)
+contig_list = list(reference_keys_with_lens)
contigs_by_index = {contig_list[n]: n for n in range(len(contig_list))}
output_file_writer.output_bam_file(sam_reads_files, contigs_by_index, options.read_len)


-def find_file_breaks(reference_index: dict) -> dict:
+def find_file_breaks(reference_keys_with_lens: dict) -> dict:
"""
Returns a dictionary with the chromosomes as keys, which is the start of building the chromosome map

-:param reference_index: a dictionary with chromosome keys and sequence values
+:param reference_keys_with_lens: a dictionary with chromosome keys and sequence values
:return: a dictionary containing the chromosomes as keys and either "all" for values, or a list of indices
"""
partitions = {}
-for contig in reference_index.keys():
-partitions[contig] = [(0, len(reference_index[contig]))]
+for contig in reference_keys_with_lens:
+partitions[contig] = [(0, reference_keys_with_lens[contig])]

return partitions
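The runner.py changes all follow one pattern: helpers that only ever need contig names and lengths now receive a plain dict instead of the lazy SeqIO index, which parses records from disk on access. Here is a minimal sketch of that pattern; the `reference.fasta` path is illustrative, and only `SeqIO.index`, the length-dict comprehension, and `find_file_breaks` come from the diff itself.

```python
from Bio import SeqIO

# Lazy index: reference_index[contig] parses the record from disk on access.
reference_index = SeqIO.index("reference.fasta", "fasta")  # illustrative path

# Summarize once: everything downstream that only needs lengths uses this dict.
reference_keys_with_lens = {key: len(value) for key, value in reference_index.items()}


def find_file_breaks(reference_keys_with_lens: dict) -> dict:
    """One whole-contig partition per chromosome, as in the function above."""
    return {contig: [(0, length)] for contig, length in reference_keys_with_lens.items()}


breaks = find_file_breaks(reference_keys_with_lens)
# e.g. {"chr1": [(0, 248956422)], "chr2": [(0, 242193529)], ...}
```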
neat/read_simulator/utils/bed_func.py: 17 additions & 16 deletions
@@ -19,12 +19,12 @@
_LOG = logging.getLogger(__name__)


-def parse_beds(options: Options, reference_dict: _IndexedSeqFileDict, average_mutation_rate: float) -> list:
+def parse_beds(options: Options, ref_keys_counts: dict, average_mutation_rate: float) -> list:
"""
This single function parses the three possible bed file types for NEAT.

:param options: The options object for this run
-:param reference_dict: The reference dict generated by SeqIO for this run
+:param ref_keys_counts: A dictionary containing the reference keys and the associated lengths.
:param average_mutation_rate: The average mutation rate from the model or user input for this run
:return: target_dict, discard_dict, mutation_rate_dict
"""
@@ -57,13 +57,13 @@ def parse_beds(options: Options, reference_dict: _IndexedSeqFileDict, average_mu
for i in range(len(bed_files)):
temp_bed_dict = parse_single_bed(
bed_files[i],
-reference_dict,
+ref_keys_counts,
processing_structure[i]
)
# If there was a bed this function will fill in any gaps the bed might have missed, so we have a complete map
# of each chromosome. The uniform case in parse_single_bed handles this in cases of no input bed.
if processing_structure[i][2]:
-final_dict = fill_out_bed_dict(reference_dict, temp_bed_dict, processing_structure[i])
+final_dict = fill_out_bed_dict(ref_keys_counts, temp_bed_dict, processing_structure[i])
else:
final_dict = temp_bed_dict

@@ -73,7 +73,7 @@ def parse_beds(options: Options, reference_dict: _IndexedSeqFileDict, average_mu


def parse_single_bed(input_bed: str,
-reference_dictionary: _IndexedSeqFileDict,
+ref_keys_counts: dict,
process_key: tuple[str, float, bool],
) -> dict:
"""
@@ -82,12 +82,13 @@ def parse_single_bed(input_bed: str,
then we will also import the mutation data.

:param input_bed: A bed file containing the regions of interest
-:param reference_dictionary: A list of chromosomes to check, along with their reads and lengths
+:param ref_keys_counts: A dict of chromosomes to check (key), along with their lengths (value)
:param process_key: Indicates which bed we are processing and the factor we need to process it.
:return: a dictionary of chromosomes: [(pos1, pos2, factor), etc...]
"""
-ret_dict = {x: [] for x in reference_dictionary}
+ret_dict = {x: [] for x in ref_keys_counts}
in_bed_only = []
+key_pool = list(ref_keys_counts.keys())

# process_key[2] is True when there is an input bed, False otherwise.
if process_key[2]:
@@ -107,7 +108,7 @@ def parse_single_bed(input_bed: str,
sys.exit(1)
# Bed file chromosome names must match the reference.
try:
-assert my_chr in reference_dictionary
+assert my_chr in key_pool
except AssertionError:
in_bed_only.append(my_chr)
_LOG.warning(
@@ -149,7 +150,7 @@ def parse_single_bed(input_bed: str,
ret_dict[my_chr].append((int(pos1), int(pos2), True))

# some validation
-in_ref_only = [k for k in reference_dictionary if k not in ret_dict]
+in_ref_only = [k for k in ref_keys_counts if k not in ret_dict]
if in_ref_only:
_LOG.warning(f'Warning: Reference contains sequences not found in BED file {input_bed}. '
'These chromosomes will be omitted from the outputs.')
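For readers following the validation logic in these hunks, a self-contained sketch of the same check: bed contig names must exist in the reference key set, and unknown contigs are warned about and skipped rather than being fatal. The names `check_bed_contigs` and `bed_records` are hypothetical, not part of NEAT.

```python
def check_bed_contigs(bed_records: list[tuple[str, int, int]], ref_keys_counts: dict) -> dict:
    """Sketch: keep only bed regions whose contig names exist in the reference."""
    ret_dict = {contig: [] for contig in ref_keys_counts}
    in_bed_only = []
    for my_chr, pos1, pos2 in bed_records:
        if my_chr not in ref_keys_counts:
            # Mirrors the warning path above: unknown contigs are collected and skipped.
            in_bed_only.append(my_chr)
            continue
        ret_dict[my_chr].append((int(pos1), int(pos2), True))
    if in_bed_only:
        print(f"Regions ignored (contigs not in reference): {sorted(set(in_bed_only))}")
    return ret_dict


# A several-hundred-line target bed only ever needs names and lengths, never sequence data.
regions = check_bed_contigs([("chr1", 100, 200), ("chrUn_1", 5, 50)],
                            {"chr1": 1_000, "chr2": 2_000})
# {'chr1': [(100, 200, True)], 'chr2': []}
```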
@@ -161,13 +162,13 @@ def parse_single_bed(input_bed: str,
_LOG.debug(f'Regions ignored: {list(set(in_bed_only))}')

else:
-for my_chr in reference_dictionary.keys():
-ret_dict[my_chr].append((0, len(reference_dictionary[my_chr]), process_key[1]))
+for my_chr in ref_keys_counts:
+ret_dict[my_chr].append((0, ref_keys_counts[my_chr], process_key[1]))

return ret_dict


-def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,
+def fill_out_bed_dict(ref_keys_counts: dict,
region_dict: dict | None,
default_factor: tuple[str, bool, bool],
) -> dict:
@@ -177,9 +178,9 @@ def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,

The input to this function is the dict for a single chromosome.

-:param ref_dict: The reference dictionary for the run. Needed to calulate max values. We shouldn't need to access
-the data it refers to.
-:param region_dict: A dict based on the input from the bed file, with keys being all the chronmosomes
+:param ref_keys_counts: A dictionary with the keys as the contigs from the reference dictionary, and the values as
+the length of the contig. Needed to calculate max values.
+:param region_dict: A dict based on the input from the bed file, with keys being all the chromosomes
present in the reference.
:param default_factor: This is a tuple representing the type and any extra information. The extra info is for
mutation rates. If beds were input, then these values come from the bed, else they are set to one value
@@ -191,7 +192,7 @@ def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,

for contig in region_dict:
regions = []
-max_value = len(ref_dict[contig])
+max_value = ref_keys_counts[contig]
start = 0

# The trivial case of no data yet for this contig the region gets filled out as 0 to len(contig_sequence)
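The function this last hunk touches pads bed regions out to full contig coverage using only the contig length, which is why the lengths dict is enough here. Below is a simplified sketch of that gap-filling idea; `fill_out_regions` is a hypothetical stand-in, and the real `default_factor` is a tuple carrying bed type and mutation-rate details rather than the single placeholder value used here.

```python
def fill_out_regions(ref_keys_counts: dict, region_dict: dict, default_factor) -> dict:
    """Sketch: pad each contig's bed regions so every base from 0 to contig length is covered."""
    filled = {}
    for contig, length in ref_keys_counts.items():
        regions = []
        start = 0
        for pos1, pos2, factor in sorted(region_dict.get(contig, [])):
            if pos1 > start:
                # Gap before this bed region gets the default factor.
                regions.append((start, pos1, default_factor))
            regions.append((pos1, pos2, factor))
            start = pos2
        if start < length:
            # Covers the trivial no-bed case too: one region spanning the whole contig.
            regions.append((start, length, default_factor))
        filled[contig] = regions
    return filled


# Example: one target region on a 1 kb contig becomes three regions.
print(fill_out_regions({"chr1": 1_000}, {"chr1": [(100, 200, True)]}, False))
# {'chr1': [(0, 100, False), (100, 200, True), (200, 1000, False)]}
```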