Skip to content

Commit

Permalink
collect more detailed statistics about read merging
Browse files Browse the repository at this point in the history
To be included in JSON/HTML reports
  • Loading branch information
MikkelSchubert committed Jul 20, 2024
1 parent 6a50047 commit 51f8d31
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 31 deletions.
32 changes: 20 additions & 12 deletions src/alignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -287,10 +287,7 @@ strip_mate_info(std::string& header, const char mate_sep)
}

sequence_merger::sequence_merger()
: m_mate_sep(MATE_SEPARATOR)
, m_merge_strategy(merge_strategy::conservative)
, m_max_score(PHRED_OFFSET_MAX)
, m_rng()
: m_merge_strategy(merge_strategy::conservative)
{
}

Expand Down Expand Up @@ -321,7 +318,7 @@ sequence_merger::set_rng(std::mt19937* rng)
void
sequence_merger::merge(const alignment_info& alignment,
fastq& read1,
const fastq& read2) const
const fastq& read2)
{
AR_REQUIRE(m_merge_strategy != merge_strategy::none);
AR_REQUIRE((m_merge_strategy == merge_strategy::original) == !!m_rng);
Expand Down Expand Up @@ -356,6 +353,9 @@ sequence_merger::merge(const alignment_info& alignment,
}
}

m_reads_merged += 2;
m_bases_merged += read_2_offset;

// Remove mate number from read, if present
if (m_mate_sep) {
strip_mate_info(read1.m_header, m_mate_sep);
Expand All @@ -366,7 +366,7 @@ void
sequence_merger::original_merge(char& nt_1,
char& qual_1,
char nt_2,
char qual_2) const
char qual_2)
{
if (nt_1 == 'N' || nt_2 == 'N') {
// If one of the bases are N, then we suppose that we just have (at
Expand All @@ -387,6 +387,8 @@ sequence_merger::original_merge(char& nt_1,
nt_1 = 'N';
qual_1 = PHRED_OFFSET_MIN;
}

m_mismatches_unresolved++;
} else {
// Ensure that nt_1 / qual_1 always contains the preferred nt / score
// This is an assumption of the g_updated_phred_scores cache.
Expand All @@ -397,20 +399,23 @@ sequence_merger::original_merge(char& nt_1,

const phred_scores& new_scores = phred_scores::update(qual_1, qual_2);

if (nt_1 == nt_2) {
qual_1 = new_scores.identical_nts;
} else {
qual_1 = new_scores.different_nts;
m_mismatches_resolved++;
}

qual_1 = std::min<char>(m_max_score,
(nt_1 == nt_2) ? new_scores.identical_nts
: new_scores.different_nts);
qual_1 = std::min<char>(m_max_score, qual_1);
}
}

void
sequence_merger::conservative_merge(char& nt_1,
char& qual_1,
char nt_2,
char qual_2) const
const char nt_2,
const char qual_2)
{

if (nt_2 == 'N' || nt_1 == 'N') {
if (nt_1 == 'N' && nt_2 == 'N') {
qual_1 = PHRED_OFFSET_MIN;
Expand All @@ -424,12 +429,15 @@ sequence_merger::conservative_merge(char& nt_1,
if (qual_1 < qual_2) {
nt_1 = nt_2;
qual_1 = qual_2 - qual_1 + PHRED_OFFSET_MIN;
m_mismatches_resolved++;
} else if (qual_1 > qual_2) {
qual_1 = qual_1 - qual_2 + PHRED_OFFSET_MIN;
m_mismatches_resolved++;
} else {
// No way to reasonably pick a base
nt_1 = 'N';
qual_1 = PHRED_OFFSET_MIN;
m_mismatches_unresolved++;
}
}
}
Expand Down
43 changes: 28 additions & 15 deletions src/alignment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -230,32 +230,45 @@ class sequence_merger
* quality scores are assigned based on the merge_strategy chosen.
* The sequences are assumed to have been trimmed using the given alignment.
* This function will produce undefined results if that is not the case!
*
* TODO: Simplify to take a "size_t overlap" param instead of "alignment".
*/
void merge(const alignment_info& alignment,
fastq& read1,
const fastq& read2) const;
void merge(const alignment_info& alignment, fastq& read1, const fastq& read2);

/* Returns number of reads merged */
size_t reads_merged() const { return m_reads_merged; }

/* Returns number of bases merged */
size_t bases_merged() const { return m_bases_merged; }

/* Returns number of mismatches where a higher quality base was selected */
size_t mismatches_resolved() const { return m_mismatches_resolved; }

/* Returns number of mismatches where there was no higher quality base */
size_t mismatches_unresolved() const { return m_mismatches_unresolved; }

private:
/** The original merging algorithm implemented in AdapterRemoval. */
void original_merge(char& nt_1, char& qual_1, char nt_2, char qual_2) const;
void original_merge(char& nt_1, char& qual_1, char nt_2, char qual_2);

/** Alternative merging algorithm added in 2.4.0. */
void conservative_merge(char& nt_1,
char& qual_1,
char nt_2,
char qual_2) const;
void conservative_merge(char& nt_1, char& qual_1, char nt_2, char qual_2);

//! Mate separator used in read names
char m_mate_sep;
char m_mate_sep = MATE_SEPARATOR;
//! Strategy used when merging reads
merge_strategy m_merge_strategy;
//! Maximum score when recalculating qualities in non-conservative mode
char m_max_score;
//! Optional RNG for picking bases at random for mismatches with the same
//! quality
std::mt19937* m_rng;
char m_max_score = PHRED_OFFSET_MAX;
//! RNG for picking a random base for mismatches with the same quality
std::mt19937* m_rng = nullptr;

//! Total number of reads merged
size_t m_reads_merged = 0;
//! The number of bases merged
size_t m_bases_merged = 0;
//! The number of mismatches where a higher quality base was selected
size_t m_mismatches_resolved = 0;
//! The number of mismatches where there was no higher quality base
size_t m_mismatches_unresolved = 0;
};

/**
Expand Down
8 changes: 4 additions & 4 deletions src/trimming.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -527,10 +527,6 @@ pe_reads_processor::process(chunk_ptr chunk)
// Merge read_2 into read_1
merger.merge(alignment, read_1, read_2);

// Track amount of overlapping bases "lost" due to read merging
stats->reads_merged.inc_reads(2);
stats->reads_merged.inc_bases(post_trimmed_bp - read_1.length());

// Add (optional) user specified prefix to read names
read_1.add_prefix_to_name(m_config.prefix_merged);

Expand Down Expand Up @@ -615,6 +611,10 @@ pe_reads_processor::process(chunk_ptr chunk)
chunks.add(read_2, type_2);
}

// Track amount of overlapping bases "lost" due to read merging
stats->reads_merged.inc_reads(merger.reads_merged());
stats->reads_merged.inc_bases(merger.bases_merged());

m_stats.release(stats);
m_rngs.release(rng);

Expand Down

0 comments on commit 51f8d31

Please sign in to comment.