Skip to content

Commit fe7acfd

Browse files
author
Daniel Cooke
committed
Merge branch 'develop'
2 parents 53bc73a + 43066d5 commit fe7acfd

File tree

9 files changed

+127
-38
lines changed

9 files changed

+127
-38
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,7 @@ Daniel Cooke and Gerton Lunter
393393

394394
## Citing
395395

396-
Please cite the preprint: [A unified haplotype-based method for accurate and comprehensive variant calling](https://www.biorxiv.org/content/early/2018/10/29/456103).
396+
Please cite the paper: [A unified haplotype-based method for accurate and comprehensive variant calling](https://www.nature.com/articles/s41587-021-00861-3).
397397

398398
## License
399399

scripts/install.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
forests = ['germline', 'somatic']
1818

1919
latest_llvm = 'llvm'
20-
latest_gcc = 'gcc@10'
20+
latest_gcc = 'gcc'
2121

2222
required_curl_version = 7, 41, 0
2323
required_git_version = 2, 7, 0
@@ -262,10 +262,8 @@ def get_brewed_compiler_binaries(homebrew_dir):
262262
else:
263263
gcc_dir = cellar_dir / latest_gcc
264264
gcc_version = os.listdir(str(gcc_dir))[0]
265-
gcc_bin_dir = gcc_dir /gcc_version / 'bin'
266-
gcc_bin_name = latest_gcc.replace('@', '-')
267-
gxx_bin_name = gcc_bin_name.replace('cc', '++')
268-
return gcc_bin_dir / gcc_bin_name, gcc_bin_dir / gxx_bin_name
265+
gcc_bin_dir = gcc_dir / gcc_version / 'bin'
266+
return gcc_bin_dir / 'gcc', gcc_bin_dir / 'g++'
269267

270268
def patch_homebrew_centos_gcc9(homebrew_dir):
271269
homebrew_bin_dir = homebrew_dir / 'bin'

src/config/option_parser.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
#include "exceptions/user_error.hpp"
2323
#include "config.hpp"
2424

25+
#include "core/csr/measures/measure_factory.hpp"
26+
2527
namespace po = boost::program_options;
2628
namespace fs = boost::filesystem;
2729

@@ -776,8 +778,10 @@ OptionMap parse_options(const int argc, const char** argv)
776778
.add(variant_discovery).add(haplotype_generation).add(general_variant_calling)
777779
.add(cancer).add(trio).add(polyclone).add(cell).add(call_filtering);
778780

781+
po::options_description conditional_help_options("Octopus help command line options");
782+
conditional_help_options.add(general).add(call_filtering);
779783
OptionMap vm_init;
780-
po::store(run(po::command_line_parser(argc, argv).options(general).allow_unregistered()), vm_init);
784+
po::store(run(po::command_line_parser(argc, argv).options(conditional_help_options).allow_unregistered()), vm_init);
781785

782786
if (vm_init.count("help") == 1) {
783787
po::store(run(po::command_line_parser(argc, argv).options(general_variant_calling).allow_unregistered()), vm_init);
@@ -821,6 +825,10 @@ OptionMap parse_options(const int argc, const char** argv)
821825
} else {
822826
std::cout << all << std::endl;
823827
}
828+
if (vm_init.count("annotations") == 1) {
829+
std::cout << "Available annotations:\n" << std::endl;
830+
csr::print_all_measures_help(std::cout);
831+
}
824832
return vm_init;
825833
}
826834

src/core/callers/cancer_caller.cpp

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1492,9 +1492,9 @@ void CancerCaller::log(const ModelPosteriors& model_posteriors) const
14921492

14931493
namespace debug {
14941494

1495-
template <typename S>
1495+
template <typename S, typename GenotypeReference>
14961496
void print_genotype_posteriors(S&& stream,
1497-
const std::unordered_map<Genotype<IndexedHaplotype<>>, double>& genotype_posteriors,
1497+
std::vector<std::pair<GenotypeReference, double>> genotype_posteriors,
14981498
const std::size_t n = std::numeric_limits<std::size_t>::max())
14991499
{
15001500
const auto m = std::min(n, genotype_posteriors.size());
@@ -1503,14 +1503,10 @@ void print_genotype_posteriors(S&& stream,
15031503
} else {
15041504
stream << "Printing top " << m << " genotype posteriors " << '\n';
15051505
}
1506-
using GenotypeReference = std::reference_wrapper<const Genotype<IndexedHaplotype<>>>;
1507-
std::vector<std::pair<GenotypeReference, double>> v {};
1508-
v.reserve(genotype_posteriors.size());
1509-
std::copy(std::cbegin(genotype_posteriors), std::cend(genotype_posteriors), std::back_inserter(v));
1510-
const auto mth = std::next(std::begin(v), m);
1511-
std::partial_sort(std::begin(v), mth, std::end(v),
1506+
const auto mth = std::next(std::begin(genotype_posteriors), m);
1507+
std::partial_sort(std::begin(genotype_posteriors), mth, std::end(genotype_posteriors),
15121508
[] (const auto& lhs, const auto& rhs) { return lhs.second > rhs.second; });
1513-
std::for_each(std::begin(v), mth,
1509+
std::for_each(std::begin(genotype_posteriors), mth,
15141510
[&] (const auto& p) {
15151511
print_variant_alleles(stream, p.first.get());
15161512
stream << " " << p.second << '\n';
@@ -1544,6 +1540,10 @@ void CancerCaller::log(const GenotypeVector& germline_genotypes,
15441540
auto map_somatic = find_map_genotype(cancer_posteriors);
15451541
auto map_cancer_genotype = map_somatic->first.get();
15461542
somatic_log << "MAP cancer genotype: ";
1543+
auto weighted_cancer_posteriors = zip_cref(cancer_genotypes, somatic_inferences.weighted_genotype_posteriors);
1544+
auto weighted_somatic_log = stream(*debug_log_);
1545+
weighted_somatic_log << "Weighted cancer genotypes... ";
1546+
debug::print_genotype_posteriors(weighted_somatic_log, weighted_cancer_posteriors, 10);
15471547
debug::print_variant_alleles(somatic_log, map_cancer_genotype);
15481548
somatic_log << ' ' << map_somatic->second;
15491549
auto map_marginal_germline = find_map_genotype(germline_genotype_posteriors);
@@ -1553,7 +1553,10 @@ void CancerCaller::log(const GenotypeVector& germline_genotypes,
15531553
marginal_germline_log << ' ' << map_marginal_germline->second;
15541554
}
15551555
if (trace_log_) {
1556-
debug::print_genotype_posteriors(stream(*trace_log_), germline_genotype_posteriors);
1556+
auto weighted_cancer_posteriors = zip_cref(cancer_genotypes, somatic_inferences.weighted_genotype_posteriors);
1557+
auto weighted_somatic_log = stream(*trace_log_);
1558+
weighted_somatic_log << "Weighted cancer genotypes... ";
1559+
debug::print_genotype_posteriors(weighted_somatic_log, weighted_cancer_posteriors);
15571560
}
15581561
}
15591562

src/core/csr/measures/measure_factory.cpp

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
#include "exceptions/user_error.hpp"
1212
#include "utils/map_utils.hpp"
1313
#include "utils/string_utils.hpp"
14+
#include "io/variant/vcf_header.hpp"
15+
#include "io/variant/vcf_spec.hpp"
1416

1517
namespace octopus { namespace csr {
1618

@@ -163,5 +165,36 @@ std::vector<std::string> get_all_measure_names()
163165
return extract_sorted_keys(measure_makers);
164166
}
165167

168+
VcfHeader make_header(const std::vector<MeasureWrapper>& measures)
169+
{
170+
VcfHeader::Builder builder {};
171+
for (const auto& measure : measures) {
172+
measure.annotate(builder);
173+
}
174+
return builder.build_once();
175+
}
176+
177+
void print_help(const std::vector<MeasureWrapper>& measures, std::ostream& os)
178+
{
179+
const auto header = make_header(measures);
180+
os << "Name\tKind\tNumber\tType\tDescription" << std::endl;
181+
for (const auto& measure : measures) {
182+
os << measure.name() << '\t';
183+
using namespace vcfspec::header;
184+
const auto kind = header.has(format, measure.name()) ? format : info;
185+
os << kind << '\t';
186+
os << get_id_field_value(header, kind, measure.name(), meta::struc::number) << '\t';
187+
os << get_id_field_value(header, kind, measure.name(), meta::struc::type) << '\t';
188+
os << get_id_field_value(header, kind, measure.name(), meta::struc::description) << '\t';
189+
os << std::endl;
190+
}
191+
}
192+
193+
void print_all_measures_help(std::ostream& os)
194+
{
195+
const auto measures = make_measures(get_all_measure_names());
196+
print_help(measures, os);
197+
}
198+
166199
} // namespace csr
167200
} // namespace octopus

src/core/csr/measures/measure_factory.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include <string>
88
#include <vector>
9+
#include <ostream>
910

1011
#include "utils/string_utils.hpp"
1112
#include "measure.hpp"
@@ -18,6 +19,9 @@ std::vector<MeasureWrapper> make_measures(std::vector<std::string> names);
1819

1920
std::vector<std::string> get_all_measure_names();
2021

22+
void print_help(const std::vector<MeasureWrapper>& measures, std::ostream& os);
23+
void print_all_measures_help(std::ostream& os);
24+
2125
} // namespace csr
2226
} // namespace octopus
2327

src/core/models/genotype/subclone_model.cpp

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,10 @@ compute_genotype_likelihoods_with_fixed_mixture_model(const std::vector<SampleNa
133133
return result;
134134
}
135135

136+
auto evaluate(const Genotype<IndexedHaplotype<>>& genotype, const ConstantMixtureGenotypeLikelihoodModel& model)
137+
{
138+
return model.evaluate(genotype);
139+
}
136140
auto evaluate(const CancerGenotype<IndexedHaplotype<>>& genotype, const ConstantMixtureGenotypeLikelihoodModel& model)
137141
{
138142
return model.evaluate(demote(genotype));
@@ -243,14 +247,41 @@ generate_seeds(const std::vector<SampleName>& samples,
243247
if (result.size() > max_seeds) return result;
244248
max_seeds -= result.size();
245249
result.reserve(max_seeds);
246-
result.push_back(genotype_log_priors);
247-
IndividualModel germline_model {priors.genotype_prior_model};
248-
if (haplotypes) germline_model.prime(*haplotypes);
250+
ConstantMixtureGenotypeLikelihoodModel basic_likelihood_model {haplotype_log_likelihoods};
251+
std::vector<LogProbabilityVector> basic_sample_likelihoods {};
252+
basic_sample_likelihoods.reserve(samples.size());
249253
for (const auto& sample : samples) {
250-
haplotype_log_likelihoods.prime(sample);
251-
auto latents = germline_model.evaluate(genotypes, haplotype_log_likelihoods);
252-
result.push_back(std::move(latents.posteriors.genotype_log_probabilities));
254+
basic_likelihood_model.cache().prime(sample);
255+
basic_sample_likelihoods.push_back(evaluate(genotypes, basic_likelihood_model));
253256
}
257+
auto basic_likelihoods = add_all_and_normalise(basic_sample_likelihoods);
258+
auto basic_posteriors = add_and_normalise(genotype_log_priors, basic_likelihoods);
259+
result.push_back(basic_posteriors);
260+
--max_seeds;
261+
if (max_seeds == 0) return result;
262+
result.push_back(basic_likelihoods);
263+
--max_seeds;
264+
if (max_seeds == 0) return result;
265+
result.push_back(genotype_log_priors);
266+
maths::normalise_logs(result.back());
267+
--max_seeds;
268+
if (max_seeds == 0) return result;
269+
for (auto& likelihoods : basic_sample_likelihoods) {
270+
result.push_back(std::move(likelihoods));
271+
--max_seeds;
272+
if (max_seeds == 0) return result;
273+
}
274+
std::vector<std::pair<double, std::size_t>> ranked_basic_posteriors {};
275+
ranked_basic_posteriors.reserve(genotypes.size());
276+
for (std::size_t i {0}; i < genotypes.size(); ++i) {
277+
ranked_basic_posteriors.push_back({basic_posteriors[i], i});
278+
}
279+
const auto nth_ranked_basic_posteriors_itr = std::next(std::begin(ranked_basic_posteriors), max_seeds);
280+
std::partial_sort(std::begin(ranked_basic_posteriors),
281+
nth_ranked_basic_posteriors_itr,
282+
std::end(ranked_basic_posteriors), std::greater<> {});
283+
std::transform(std::begin(ranked_basic_posteriors), nth_ranked_basic_posteriors_itr,
284+
std::back_inserter(result), [&] (const auto& p) { return make_point_seed(genotypes.size(), p.second); });
254285
return result;
255286
}
256287

src/io/variant/vcf_header.cpp

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -51,16 +51,20 @@ bool VcfHeader::has(const Tag& t) const noexcept
5151

5252
bool VcfHeader::has(const Tag& t, const StructuredKey& k) const noexcept
5353
{
54-
return structured_fields_.count(t) > 0; // TODO: complete this
54+
const auto er = structured_fields_.equal_range(t);
55+
return std::find_if(er.first, er.second,
56+
[&] (const auto& p) {
57+
return p.second.at(vcfspec::header::meta::struc::id) == k;
58+
}) != er.second;
5559
}
5660

5761
const VcfHeader::Value& VcfHeader::at(const BasicKey& k) const
5862
{
5963
return basic_fields_.at(k);
6064
}
6165

62-
const VcfHeader::Value& VcfHeader::find(const Tag& search_tag, const StructuredKey& search_key,
63-
const StructuredKey& id_key, const Value& id_value) const
66+
const VcfHeader::Value& VcfHeader::find(const Tag& search_tag, const StructuredKey& id_key,
67+
const Value& id_value, const StructuredKey& search_key) const
6468
{
6569
const auto er = structured_fields_.equal_range(search_tag);
6670
return std::find_if(er.first, er.second,
@@ -118,15 +122,19 @@ const VcfHeader::StructuredFieldMap& VcfHeader::structured_fields() const noexce
118122

119123
// non-member methods
120124

121-
const VcfHeader::Value& get_id_field_value(const VcfHeader& header, const VcfHeader::Tag& tag,
122-
const VcfHeader::Value& id_value,
123-
const VcfHeader::StructuredKey& lookup_key)
125+
const VcfHeader::Value&
126+
get_id_field_value(const VcfHeader& header,
127+
const VcfHeader::Tag& tag,
128+
const VcfHeader::StructuredKey& id_value,
129+
const VcfHeader::StructuredKey& lookup_key)
124130
{
125131
return header.find(tag, vcfspec::header::meta::struc::id, id_value, lookup_key);
126132
}
127133

128-
const VcfHeader::Value& get_id_field_type(const VcfHeader& header, const VcfHeader::Tag& tag,
129-
const VcfHeader::Value& id_value)
134+
const VcfHeader::Value&
135+
get_id_field_type(const VcfHeader& header,
136+
const VcfHeader::Tag& tag,
137+
const VcfHeader::Value& id_value)
130138
{
131139
using namespace vcfspec::header::meta;
132140
return header.find(tag, struc::id, id_value, struc::type);

src/io/variant/vcf_header.hpp

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -102,8 +102,8 @@ class VcfHeader : public Equitable<VcfHeader>
102102
bool has(const Tag& tag, const StructuredKey& k) const noexcept;
103103

104104
const Value& at(const BasicKey& k) const;
105-
const Value& find(const Tag& search_tag, const StructuredKey& search_key,
106-
const StructuredKey& id_key, const Value& id_value) const;
105+
const Value& find(const Tag& search_tag, const StructuredKey& id_key,
106+
const Value& id_value, const StructuredKey& search_key) const;
107107

108108
std::vector<BasicKey> basic_keys() const;
109109
std::vector<Tag> tags() const;
@@ -162,12 +162,16 @@ VcfHeader::VcfHeader(T&& file_format, U&& samples, B&& basic_fields, S&& structu
162162
, structured_fields_ {std::forward<S>(structured_fields)}
163163
{}
164164

165-
const std::string& get_id_field_value(const VcfHeader& header, const VcfHeader::Tag& t,
166-
const VcfHeader::StructuredKey& id_value,
167-
const VcfHeader::StructuredKey& lookup_key);
165+
const VcfHeader::Value&
166+
get_id_field_value(const VcfHeader& header,
167+
const VcfHeader::Tag& t,
168+
const VcfHeader::StructuredKey& id_value,
169+
const VcfHeader::StructuredKey& lookup_key);
168170

169-
const std::string& get_id_field_type(const VcfHeader& header, const VcfHeader::Tag& t,
170-
const VcfHeader::Value& id_value);
171+
const VcfHeader::Value&
172+
get_id_field_type(const VcfHeader& header,
173+
const VcfHeader::Tag& t,
174+
const VcfHeader::Value& id_value);
171175

172176
VcfType get_typed_value(const VcfHeader& header, const VcfHeader::Tag& t,
173177
const VcfHeader::StructuredKey& key, const VcfHeader::Value& value);

0 commit comments

Comments
 (0)