Skip to content

Commit 0b8fb93

Browse files
author
Daniel Cooke
committed
Add inverted soft clipped masking
1 parent 01eb88d commit 0b8fb93

File tree

6 files changed

+124
-9
lines changed

6 files changed

+124
-9
lines changed

src/config/option_collation.cpp

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -691,7 +691,7 @@ bool allow_assembler_generation(const OptionMap& options)
691691
return options.at("assembly-candidate-generator").as<bool>() && !is_fast_mode(options);
692692
}
693693

694-
auto make_read_transformers(const OptionMap& options)
694+
auto make_read_transformers(const ReferenceGenome& reference, const OptionMap& options)
695695
{
696696
using namespace octopus::readpipe;
697697
ReadTransformer prefilter_transformer {}, postfilter_transformer {};
@@ -739,6 +739,9 @@ auto make_read_transformers(const OptionMap& options)
739739
if (options.at("overlap-masking").as<bool>()) {
740740
postfilter_transformer.add(MaskStrandOfDuplicatedBases {});
741741
}
742+
if (options.at("mask-inverted-soft-clipping").as<bool>()) {
743+
prefilter_transformer.add(MaskInvertedSoftClippedReadEnds {reference, 5, 300});
744+
}
742745
prefilter_transformer.shrink_to_fit();
743746
postfilter_transformer.shrink_to_fit();
744747
}
@@ -834,9 +837,9 @@ boost::optional<readpipe::Downsampler> make_downsampler(const OptionMap& options
834837
return boost::none;
835838
}
836839

837-
ReadPipe make_read_pipe(ReadManager& read_manager, std::vector<SampleName> samples, const OptionMap& options)
840+
ReadPipe make_read_pipe(ReadManager& read_manager, const ReferenceGenome& reference, std::vector<SampleName> samples, const OptionMap& options)
838841
{
839-
auto transformers = make_read_transformers(options);
842+
auto transformers = make_read_transformers(reference, options);
840843
if (transformers.second.num_transforms() > 0) {
841844
return ReadPipe {read_manager, std::move(transformers.first), make_read_filterer(options),
842845
std::move(transformers.second), make_downsampler(options), std::move(samples)};
@@ -1970,10 +1973,10 @@ ReadPipe make_default_filter_read_pipe(ReadManager& read_manager, std::vector<Sa
19701973
return ReadPipe {read_manager, std::move(transformer), std::move(filterer), boost::none, std::move(samples)};
19711974
}
19721975

1973-
ReadPipe make_call_filter_read_pipe(ReadManager& read_manager, std::vector<SampleName> samples, const OptionMap& options)
1976+
ReadPipe make_call_filter_read_pipe(ReadManager& read_manager, const ReferenceGenome& reference, std::vector<SampleName> samples, const OptionMap& options)
19741977
{
19751978
if (use_calling_read_pipe_for_call_filtering(options)) {
1976-
return make_read_pipe(read_manager, std::move(samples), options);
1979+
return make_read_pipe(read_manager, reference, std::move(samples), options);
19771980
} else {
19781981
return make_default_filter_read_pipe(read_manager, std::move(samples));
19791982
}

src/config/option_collation.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ boost::optional<std::vector<SampleName>> get_user_samples(const OptionMap& optio
5151

5252
ReadManager make_read_manager(const OptionMap& options);
5353

54-
ReadPipe make_read_pipe(ReadManager& read_manager, std::vector<SampleName> samples, const OptionMap& options);
54+
ReadPipe make_read_pipe(ReadManager& read_manager, const ReferenceGenome& reference, std::vector<SampleName> samples, const OptionMap& options);
5555

5656
bool call_sites_only(const OptionMap& options);
5757

@@ -73,7 +73,7 @@ bool use_calling_read_pipe_for_call_filtering(const OptionMap& options) noexcept
7373

7474
bool keep_unfiltered_calls(const OptionMap& options) noexcept;
7575

76-
ReadPipe make_call_filter_read_pipe(ReadManager& read_manager, std::vector<SampleName> samples, const OptionMap& options);
76+
ReadPipe make_call_filter_read_pipe(ReadManager& read_manager, const ReferenceGenome& reference, std::vector<SampleName> samples, const OptionMap& options);
7777

7878
boost::optional<fs::path> get_output_path(const OptionMap& options);
7979

src/config/option_parser.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,10 @@ OptionMap parse_options(const int argc, const char** argv)
226226
("overlap-masking",
227227
po::value<bool>()->default_value(true),
228228
"Enable read segment overlap masking")
229+
230+
("mask-inverted-soft-clipping",
231+
po::value<bool>()->default_value(true),
232+
"Enable read segment overlap masking")
229233
;
230234

231235
po::options_description filters("Read filtering");

src/core/calling_components.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -491,7 +491,7 @@ GenomeCallingComponents::Components::Components(ReferenceGenome&& reference, Rea
491491
, regions {get_search_regions(options, this->reference, this->read_manager)}
492492
, contigs {get_contigs(this->regions, this->reference, options::get_contig_output_order(options))}
493493
, reads_profile {profile_reads(this->samples, this->regions, this->read_manager)}
494-
, read_pipe {options::make_read_pipe(this->read_manager, this->samples, options)}
494+
, read_pipe {options::make_read_pipe(this->read_manager, this->reference, this->samples, options)}
495495
, caller_factory {options::make_caller_factory(this->reference, this->read_pipe, this->regions, options, this->reads_profile)}
496496
, filter_read_pipe {}
497497
, output {std::move(output)}
@@ -578,7 +578,7 @@ void GenomeCallingComponents::Components::setup_writers(const options::OptionMap
578578
void GenomeCallingComponents::Components::setup_filter_read_pipe(const options::OptionMap& options)
579579
{
580580
if (!options::use_calling_read_pipe_for_call_filtering(options)) {
581-
filter_read_pipe = options::make_call_filter_read_pipe(read_manager, samples, options);
581+
filter_read_pipe = options::make_call_filter_read_pipe(read_manager, reference, samples, options);
582582
}
583583
}
584584

src/readpipe/transformers/read_transform.cpp

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <cassert>
1010

1111
#include "utils/maths.hpp"
12+
#include "utils/sequence_utils.hpp"
1213

1314
namespace octopus { namespace readpipe {
1415

@@ -164,6 +165,17 @@ MaskLowAverageQualitySoftClippedTails::MaskLowAverageQualitySoftClippedTails(Bas
164165

165166
namespace {
166167

168+
auto get_soft_clip_head_size(const AlignedRead& read) noexcept
169+
{
170+
CigarOperation::Size front_size, back_size;
171+
std::tie(front_size, back_size) = get_soft_clipped_sizes(read);
172+
if (read.is_marked_reverse_mapped()) {
173+
return back_size;
174+
} else {
175+
return front_size;
176+
}
177+
}
178+
167179
auto get_soft_clip_tail_size(const AlignedRead& read) noexcept
168180
{
169181
CigarOperation::Size front_size, back_size;
@@ -192,6 +204,15 @@ auto mean_tail_quality(const AlignedRead& read, const std::size_t num_bases) noe
192204
}
193205
}
194206

207+
void zero_head_base_qualities(AlignedRead& read, const std::size_t num_bases) noexcept
208+
{
209+
if (read.is_marked_reverse_mapped()) {
210+
zero_back_qualities(read, num_bases);
211+
} else {
212+
zero_front_qualities(read, num_bases);
213+
}
214+
}
215+
195216
void zero_tail_base_qualities(AlignedRead& read, const std::size_t num_bases) noexcept
196217
{
197218
if (read.is_marked_reverse_mapped()) {
@@ -214,6 +235,78 @@ void MaskLowAverageQualitySoftClippedTails::operator()(AlignedRead& read) const
214235
}
215236
}
216237

238+
MaskInvertedSoftClippedReadEnds::MaskInvertedSoftClippedReadEnds(const ReferenceGenome& reference,
239+
AlignedRead::NucleotideSequence::size_type min_clip_length,
240+
GenomicRegion::Size max_flank_search)
241+
: reference_ {reference}
242+
, min_clip_length_ {min_clip_length}
243+
, max_flank_search_ {max_flank_search}
244+
{}
245+
246+
namespace {
247+
248+
auto copy_head_sequence(const AlignedRead& read, const AlignedRead::NucleotideSequence::size_type length)
249+
{
250+
if (length >= sequence_size(read)) return read.sequence();
251+
if (read.is_marked_reverse_mapped()) {
252+
return AlignedRead::NucleotideSequence {std::prev(std::cend(read.sequence()), length), std::cend(read.sequence())};
253+
} else {
254+
return AlignedRead::NucleotideSequence {std::cbegin(read.sequence()), std::next(std::cbegin(read.sequence()), length)};
255+
}
256+
}
257+
258+
auto copy_tail_sequence(const AlignedRead& read, const AlignedRead::NucleotideSequence::size_type length)
259+
{
260+
if (length >= sequence_size(read)) return read.sequence();
261+
if (read.is_marked_reverse_mapped()) {
262+
return AlignedRead::NucleotideSequence {std::cbegin(read.sequence()), std::next(std::cbegin(read.sequence()), length)};
263+
} else {
264+
return AlignedRead::NucleotideSequence {std::prev(std::cend(read.sequence()), length), std::cend(read.sequence())};
265+
}
266+
}
267+
268+
auto copy_soft_clipped_head_sequence(const AlignedRead& read)
269+
{
270+
return copy_head_sequence(read, get_soft_clip_head_size(read));
271+
}
272+
273+
auto copy_soft_clipped_tail_sequence(const AlignedRead& read)
274+
{
275+
return copy_tail_sequence(read, get_soft_clip_tail_size(read));
276+
}
277+
278+
template <typename Range1, typename Range2>
279+
bool includes(const Range1& target, const Range2& query)
280+
{
281+
return std::search(std::cbegin(target), std::cend(target), std::cbegin(query), std::cend(query)) != std::cend(target);
282+
}
283+
284+
} // namespace
285+
286+
void MaskInvertedSoftClippedReadEnds::operator()(AlignedRead& read) const
287+
{
288+
if (is_soft_clipped(read)) {
289+
const auto soft_clipped_head_length = get_soft_clip_head_size(read);
290+
if (soft_clipped_head_length >= min_clip_length_) {
291+
auto query = copy_head_sequence(read, soft_clipped_head_length);
292+
utils::reverse_complement(query);
293+
auto target = reference_.get().fetch_sequence(expand(mapped_region(read), max_flank_search_));
294+
if (includes(target, query)) {
295+
zero_head_base_qualities(read, soft_clipped_head_length);
296+
}
297+
}
298+
const auto soft_clipped_tail_length = get_soft_clip_tail_size(read);
299+
if (soft_clipped_tail_length >= min_clip_length_) {
300+
auto query = copy_tail_sequence(read, soft_clipped_tail_length);
301+
utils::reverse_complement(query);
302+
auto target = reference_.get().fetch_sequence(expand(mapped_region(read), max_flank_search_));
303+
if (includes(target, query)) {
304+
zero_tail_base_qualities(read, soft_clipped_tail_length);
305+
}
306+
}
307+
}
308+
}
309+
217310
// template transforms
218311

219312
void mask_adapter_contamination(AlignedRead& forward, AlignedRead& reverse) noexcept

src/readpipe/transformers/read_transform.hpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include <vector>
99

1010
#include "basics/aligned_read.hpp"
11+
#include "io/reference/reference_genome.hpp"
1112

1213
namespace octopus { namespace readpipe {
1314

@@ -133,6 +134,20 @@ struct MaskLowAverageQualitySoftClippedTails
133134
const Length min_tail_length_;
134135
};
135136

137+
struct MaskInvertedSoftClippedReadEnds
138+
{
139+
MaskInvertedSoftClippedReadEnds(const ReferenceGenome& reference,
140+
AlignedRead::NucleotideSequence::size_type min_clip_length,
141+
GenomicRegion::Size max_flank_search);
142+
143+
void operator()(AlignedRead& read) const;
144+
145+
private:
146+
std::reference_wrapper<const ReferenceGenome> reference_;
147+
AlignedRead::NucleotideSequence::size_type min_clip_length_;
148+
GenomicRegion::Size max_flank_search_;
149+
};
150+
136151
using ReadReferenceVector = std::vector<std::reference_wrapper<AlignedRead>>;
137152

138153
struct MaskTemplateAdapters

0 commit comments

Comments
 (0)