Skip to content

Commit c61ee1a

Browse files
authored
Merge pull request #82 from bacpop/flexible_lineages
Lineage model fitting - sketchlib changes
2 parents a7ea4cc + 413f57e commit c61ee1a

File tree

7 files changed

+120
-115
lines changed

7 files changed

+120
-115
lines changed

pp_sketch/__main__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -404,4 +404,4 @@ def main():
404404
sys.exit(0)
405405

406406
if __name__ == "__main__":
407-
main()
407+
main()

pp_sketch/matrix.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,10 @@
88

99
import pp_sketchlib
1010

11-
def sparsify(distMat, cutoff, kNN, threads):
12-
sparse_coordinates = pp_sketchlib.sparsifyDists(distMat,
13-
distCutoff=cutoff,
14-
kNN=kNN)
11+
def sparsify(distMat, cutoff, threads):
12+
sparse_coordinates = pp_sketchlib.sparsifyDistsByThreshold(distMat,
13+
distCutoff=cutoff,
14+
num_threads=threads)
1515
sparse_scipy = ijv_to_coo(sparse_coordinates, distMat.shape, np.float32)
1616

1717
# Mirror to fill in lower triangle

src/dist/matrix.hpp

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,24 +7,23 @@
77
#pragma once
88

99
#include <algorithm>
10-
#include <numeric>
11-
#include <vector>
12-
#include <cstdint>
1310
#include <cstddef>
11+
#include <cstdint>
12+
#include <numeric>
1413
#include <string>
14+
#include <vector>
1515

1616
#include <Eigen/Dense>
1717

18-
#include "matrix_types.hpp"
1918
#include "matrix_idx.hpp"
19+
#include "matrix_types.hpp"
2020

2121
// This type not used in any nvcc code
22-
using NumpyMatrix = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
22+
using NumpyMatrix =
23+
Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
2324

24-
//https://stackoverflow.com/a/12399290
25-
template <typename T>
26-
std::vector<long> sort_indexes(const T &v)
27-
{
25+
// https://stackoverflow.com/a/12399290
26+
template <typename T> std::vector<long> sort_indexes(const T &v) {
2827

2928
// initialize original index locations
3029
std::vector<long> idx(v.size());
@@ -44,6 +43,6 @@ NumpyMatrix long_to_square(const Eigen::VectorXf &rrDists,
4443
Eigen::VectorXf square_to_long(const NumpyMatrix &squareDists,
4544
const unsigned int num_threads);
4645

47-
sparse_coo sparsify_dists(const NumpyMatrix &denseDists,
48-
const float distCutoff,
49-
const unsigned long int kNN);
46+
sparse_coo sparsify_dists_by_threshold(const NumpyMatrix &denseDists,
47+
const float distCutoff,
48+
const size_t num_threads);

src/dist/matrix_ops.cpp

Lines changed: 38 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,9 @@
1010
#include <omp.h>
1111
#include <string>
1212
#include <vector>
13-
13+
#include <iostream>
1414
#include "api.hpp"
1515

16-
const float epsilon = 1E-10;
17-
1816
// Prototypes for cppying functions run in threads
1917
void square_block(const Eigen::VectorXf &longDists, NumpyMatrix &squareMatrix,
2018
const size_t n_samples, const size_t start,
@@ -25,55 +23,49 @@ void rectangle_block(const Eigen::VectorXf &longDists,
2523
const size_t nqqSamples, const size_t start,
2624
const size_t max_elems);
2725

28-
sparse_coo sparsify_dists(const NumpyMatrix &denseDists, const float distCutoff,
29-
const unsigned long int kNN) {
30-
if (kNN > 0 && distCutoff > 0) {
31-
throw std::runtime_error("Specify only one of kNN or distCutoff");
32-
} else if (kNN < 1 && distCutoff < 0) {
33-
throw std::runtime_error("kNN must be > 1 or distCutoff > 0");
26+
template <typename T>
27+
std::vector<T> combine_vectors(const std::vector<std::vector<T>> &vec,
28+
const size_t len) {
29+
std::vector<T> all(len);
30+
auto all_it = all.begin();
31+
for (size_t i = 0; i < vec.size(); ++i) {
32+
std::copy(vec[i].cbegin(), vec[i].cend(), all_it);
33+
all_it += vec[i].size();
34+
}
35+
return all;
36+
}
37+
38+
sparse_coo sparsify_dists_by_threshold(const NumpyMatrix &denseDists,
39+
const float distCutoff,
40+
const size_t num_threads) {
41+
42+
if (distCutoff < 0) {
43+
throw std::runtime_error("Distance threshold must be at least 0");
3444
}
3545

46+
// Parallelisation parameter
47+
size_t len = 0;
48+
3649
// ijv vectors
37-
std::vector<float> dists;
38-
std::vector<long> i_vec;
39-
std::vector<long> j_vec;
40-
if (distCutoff > 0) {
41-
for (long i = 0; i < denseDists.rows(); i++) {
42-
for (long j = i + 1; j < denseDists.cols(); j++) {
43-
if (denseDists(i, j) < distCutoff) {
44-
dists.push_back(denseDists(i, j));
45-
i_vec.push_back(i);
46-
j_vec.push_back(j);
47-
}
48-
}
49-
}
50-
} else if (kNN >= 1) {
51-
// Only add the k nearest (unique) neighbours
52-
// May be >k if repeats, often zeros
53-
for (long i = 0; i < denseDists.rows(); i++) {
54-
unsigned long unique_neighbors = 0;
55-
float prev_value = -1;
56-
for (auto j : sort_indexes(denseDists.row(i))) {
57-
if (j == i) {
58-
continue; // Ignore diagonal which will always be one of the closest
59-
}
60-
bool new_val = abs(denseDists(i, j) - prev_value) < epsilon;
61-
if (unique_neighbors < kNN || new_val) {
62-
dists.push_back(denseDists(i, j));
63-
i_vec.push_back(i);
64-
j_vec.push_back(j);
65-
if (!new_val) {
66-
unique_neighbors++;
67-
prev_value = denseDists(i, j);
68-
}
69-
} else {
70-
break;
71-
}
50+
long num_samples = denseDists.rows();
51+
std::vector<std::vector<float>> dists(num_samples);
52+
std::vector<std::vector<long>> i_vec(num_samples);
53+
std::vector<std::vector<long>> j_vec(num_samples);
54+
#pragma omp parallel for schedule(static) num_threads(num_threads) reduction(+:len)
55+
for (long i = 0; i < num_samples; i++) {
56+
for (long j = i + 1; j < denseDists.cols(); j++) {
57+
if (denseDists(i, j) < distCutoff) {
58+
dists[i].push_back(denseDists(i, j));
59+
i_vec[i].push_back(i);
60+
j_vec[i].push_back(j);
7261
}
7362
}
63+
len += i_vec[i].size();
7464
}
75-
76-
return (std::make_tuple(i_vec, j_vec, dists));
65+
std::vector<float> dists_all = combine_vectors(dists, len);
66+
std::vector<long> i_vec_all = combine_vectors(i_vec, len);
67+
std::vector<long> j_vec_all = combine_vectors(j_vec, len);
68+
return (std::make_tuple(i_vec_all, j_vec_all, dists_all));
7769
}
7870

7971
NumpyMatrix long_to_square(const Eigen::VectorXf &rrDists,

src/sketchlib_bindings.cpp

Lines changed: 29 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -137,12 +137,13 @@ NumpyMatrix queryDatabase(const std::string &ref_db_name,
137137
}
138138

139139
sparse_coo
140-
querySelfSparse(const std::string &ref_db_name, const std::vector<std::string> &ref_names,
141-
std::vector<size_t> kmer_lengths, const bool random_correct = true,
142-
const bool jaccard = false, const unsigned long int kNN = 0,
143-
const float dist_cutoff = 0.0f, const size_t dist_col = 0,
144-
const size_t num_threads = 1, const bool use_gpu = false,
145-
const int device_id = 0) {
140+
querySelfSparse(const std::string &ref_db_name,
141+
const std::vector<std::string> &ref_names,
142+
std::vector<size_t> kmer_lengths,
143+
const bool random_correct = true, const bool jaccard = false,
144+
const unsigned long int kNN = 0, const float dist_cutoff = 0.0f,
145+
const size_t dist_col = 0, const size_t num_threads = 1,
146+
const bool use_gpu = false, const int device_id = 0) {
146147
if (use_gpu && (jaccard || kmer_lengths.size() < 2 || dist_col > 1)) {
147148
throw std::runtime_error(
148149
"Extracting Jaccard distances not supported on GPU");
@@ -160,26 +161,28 @@ querySelfSparse(const std::string &ref_db_name, const std::vector<std::string> &
160161
sparse_coo sparse_dists;
161162
if (dist_cutoff > 0) {
162163
NumpyMatrix dists = queryDatabase(ref_db_name, ref_db_name, ref_names,
163-
ref_names, kmer_lengths, random_correct,
164-
false, num_threads, use_gpu, device_id);
164+
ref_names, kmer_lengths, random_correct,
165+
false, num_threads, use_gpu, device_id);
165166

166167
Eigen::VectorXf dummy_query_ref;
167168
Eigen::VectorXf dummy_query_query;
168169
NumpyMatrix long_form = long_to_square(dists.col(dist_col), dummy_query_ref,
169-
dummy_query_query, num_threads);
170-
sparse_dists = sparsify_dists(long_form, dist_cutoff, kNN);
170+
dummy_query_query, num_threads);
171+
sparse_dists =
172+
sparsify_dists_by_threshold(long_form, dist_cutoff, num_threads);
171173
} else {
172174
#ifdef GPU_AVAILABLE
173175
if (use_gpu) {
174-
sparse_dists = query_db_sparse_cuda(ref_sketches, kmer_lengths, random,
175-
kNN, dist_col, device_id, num_threads);
176+
sparse_dists =
177+
query_db_sparse_cuda(ref_sketches, kmer_lengths, random, kNN,
178+
dist_col, device_id, num_threads);
176179
} else {
177180
sparse_dists = query_db_sparse(ref_sketches, kmer_lengths, random,
178-
jaccard, kNN, dist_col, num_threads);
181+
jaccard, kNN, dist_col, num_threads);
179182
}
180183
#else
181-
sparse_dists = query_db_sparse(ref_sketches, kmer_lengths, random,
182-
jaccard, kNN, dist_col, num_threads);
184+
sparse_dists = query_db_sparse(ref_sketches, kmer_lengths, random, jaccard,
185+
kNN, dist_col, num_threads);
183186
#endif
184187
}
185188

@@ -216,9 +219,11 @@ double jaccardDist(const std::string &db_name, const std::string &sample1,
216219
}
217220

218221
// Wrapper which makes a ref to the python/numpy array
219-
sparse_coo sparsifyDists(const Eigen::Ref<NumpyMatrix> &denseDists,
220-
const float distCutoff, const unsigned long int kNN) {
221-
sparse_coo coo_idx = sparsify_dists(denseDists, distCutoff, kNN);
222+
sparse_coo sparsifyDistsByThreshold(const Eigen::Ref<NumpyMatrix> &denseDists,
223+
const float distCutoff,
224+
const size_t num_threads) {
225+
sparse_coo coo_idx =
226+
sparsify_dists_by_threshold(denseDists, distCutoff, num_threads);
222227
return coo_idx;
223228
}
224229

@@ -245,12 +250,11 @@ PYBIND11_MODULE(pp_sketchlib, m) {
245250
m.def("querySelfSparse", &querySelfSparse,
246251
py::return_value_policy::reference_internal,
247252
"Find self-self distances between sketches; return a sparse matrix",
248-
py::arg("ref_db_name"), py::arg("rList"), py::arg("klist"), py::arg("random_correct") = true,
249-
py::arg("jaccard") = false,
253+
py::arg("ref_db_name"), py::arg("rList"), py::arg("klist"),
254+
py::arg("random_correct") = true, py::arg("jaccard") = false,
250255
py::arg("kNN") = 0, py::arg("dist_cutoff") = 0.0f,
251-
py::arg("dist_col") = 0,
252-
py::arg("num_threads") = 1, py::arg("use_gpu") = false,
253-
py::arg("device_id") = 0);
256+
py::arg("dist_col") = 0, py::arg("num_threads") = 1,
257+
py::arg("use_gpu") = false, py::arg("device_id") = 0);
254258

255259
m.def("addRandom", &addRandomToDb,
256260
"Add random match chances into older databases", py::arg("db_name"),
@@ -279,11 +283,11 @@ PYBIND11_MODULE(pp_sketchlib, m) {
279283
py::arg("query_ref_distVec").noconvert(),
280284
py::arg("query_query_distVec").noconvert(), py::arg("num_threads") = 1);
281285

282-
m.def("sparsifyDists", &sparsifyDists,
286+
m.def("sparsifyDistsByThreshold", &sparsifyDistsByThreshold,
283287
py::return_value_policy::reference_internal,
284288
"Transform all distances into a sparse matrix",
285289
py::arg("distMat").noconvert(), py::arg("distCutoff") = 0,
286-
py::arg("kNN") = 0);
290+
py::arg("num_threads") = 1);
287291

288292
m.attr("version") = VERSION_INFO;
289293
m.attr("sketchVersion") = SKETCH_VERSION;

test/test-dists.py

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,36 @@ def iterDistRows(refSeqs, querySeqs, self=True):
2929
for ref in refSeqs:
3030
yield(ref, query)
3131

32+
def get_kNN_sparse_tuple(square_core_mat,kNN):
33+
i_vec = []
34+
j_vec = []
35+
dist_vec = []
36+
for i in range(0,square_core_mat.shape[0]):
37+
sorted_indices = np.argsort(square_core_mat[i,:])
38+
j_index = 0
39+
neighbour_count = 0
40+
while neighbour_count < kNN:
41+
if (sorted_indices[j_index] != i):
42+
i_vec.append(i)
43+
j_vec.append(sorted_indices[j_index])
44+
dist_vec.append(square_core_mat[i,sorted_indices[j_index]])
45+
neighbour_count = neighbour_count + 1
46+
j_index = j_index + 1
47+
sparse_knn = (i_vec,j_vec,dist_vec)
48+
return sparse_knn
49+
50+
def get_threshold_sparse_tuple(square_core_mat,threshold):
51+
i_vec = []
52+
j_vec = []
53+
dist_vec = []
54+
for i in range(0,square_core_mat.shape[0]):
55+
for j in range(i,square_core_mat.shape[1]):
56+
if square_core_mat[i,j] < threshold and i != j:
57+
i_vec.append(i)
58+
j_vec.append(j)
59+
dist_vec.append(square_core_mat[i,j])
60+
sparse_threshold = (i_vec,j_vec,dist_vec)
61+
return sparse_threshold
3262

3363
description = 'Run poppunk sketching/distances'
3464
parser = argparse.ArgumentParser(description=description,
@@ -110,7 +140,7 @@ def iterDistRows(refSeqs, querySeqs, self=True):
110140
rList=rList,
111141
klist=db_kmers,
112142
kNN=kNN)
113-
sparse_knn = pp_sketchlib.sparsifyDists(distMat=square_core_mat, distCutoff=0, kNN=kNN)
143+
sparse_knn = get_kNN_sparse_tuple(square_core_mat,kNN)
114144
if (sparseDistMat[0] != sparse_knn[0] or
115145
sparseDistMat[1] != sparse_knn[1]):
116146
sys.stderr.write("Sparse distances (kNN) mismatching\n")
@@ -126,7 +156,7 @@ def iterDistRows(refSeqs, querySeqs, self=True):
126156
rList=rList,
127157
klist=db_kmers,
128158
dist_cutoff=cutoff)
129-
sparse_threshold = pp_sketchlib.sparsifyDists(distMat=square_core_mat, distCutoff=cutoff, kNN=0)
159+
sparse_threshold = get_threshold_sparse_tuple(square_core_mat,cutoff)
130160
if (sparseDistMat[0] != sparse_threshold[0] or
131161
sparseDistMat[1] != sparse_threshold[1]):
132162
sys.stderr.write("Sparse distances (cutoff) mismatching\n")

test/test-matrix.py

Lines changed: 5 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,10 @@
99
from pp_sketch.matrix import sparsify
1010
except ImportError as e:
1111
from scipy.sparse import coo_matrix
12-
def sparsify(distMat, cutoff, kNN, threads):
13-
sparse_coordinates = pp_sketchlib.sparsifyDists(distMat=distMat,
14-
distCutoff=cutoff,
15-
kNN=kNN)
12+
def sparsify(distMat, cutoff, threads):
13+
sparse_coordinates = pp_sketchlib.sparsifyDistsByThreshold(distMat=distMat,
14+
distCutoff=cutoff,
15+
num_threads=threads)
1616
sparse_scipy = coo_matrix((sparse_coordinates[2],
1717
(sparse_coordinates[0], sparse_coordinates[1])),
1818
shape=distMat.shape,
@@ -60,28 +60,8 @@ def check_res(res, expected):
6060
check_res(pp_sketchlib.squareToLong(distMat=square1_res, num_threads=2), rr_mat)
6161

6262
# sparsification
63-
sparse1 = sparsify(square2_res, cutoff=5, kNN=0, threads=2)
63+
sparse1 = sparsify(square2_res, cutoff=5, threads=2)
6464

6565
sparse1_res = square2_res.copy()
6666
sparse1_res[sparse1_res >= 5] = 0
6767
check_res(sparse1.todense(), sparse1_res)
68-
69-
kNN = 2
70-
sparse2 = sparsify(square2_res, cutoff=0, kNN=kNN, threads=2)
71-
72-
sparse2_res = square2_res.copy()
73-
row_sort = np.argsort(sparse2_res, axis=1)
74-
for i, row in enumerate(sparse2_res):
75-
neighbours = 0
76-
prev_val = 0
77-
for j in row_sort[i, :]:
78-
if i == j or row[j] == prev_val:
79-
continue
80-
else:
81-
prev_val = row[j]
82-
if neighbours >= kNN:
83-
sparse2_res[i, j] = 0
84-
else:
85-
neighbours += 1
86-
87-
check_res(sparse2.todense(), sparse2_res)

0 commit comments

Comments
 (0)