Skip to content

Commit adf033f

Browse files
authored
Improve efficiency of sparse queries (#94)
* Fix for gcc-13
* Add personalised makefile
* Replace sort with priority queue
* Missing semicolon
* Type fixes for SparseDist
* Add debug flag to makefile
* Debug mode for Makefiles
* Revert "Debug mode for Makefiles" — This reverts commit 266742b.
* Set debug another way
* Allow debug to be turned off
* Put the sort in the right place (after all dists done)
* Replace hash map
* Use C++17
* Correct header name
* Typo in map header name
* Replace class with struct
* Try to fix order in sparse loop (Definitely wrong! Was confused by brace levels)
* Change order to match with tests
* Pass C++17 onto nvcc
* Correct c++17 definition for CUDA
* Add license for hashmap
* Rename local build env var
1 parent d267077 commit adf033f

20 files changed: +2279 −2134 lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
cmake_minimum_required(VERSION 3.16)
22
project(pp_sketchlib)
3-
set(CMAKE_CXX_STANDARD 14)
3+
set(CMAKE_CXX_STANDARD 17)
44

55
if (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18")
66
cmake_policy(SET CMP0104 OLD) # Can't get CUDA_ARCHITECTURES to work with NEW

LICENSE_unordered_dense

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
MIT License
2+
3+
Copyright (c) 2022 Martin Leitner-Ankerl
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.
22+

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,7 @@ You can set an environment variable `SKETCHLIB_INSTALL` to affect `python setup.
334334
- Empty: uses cmake
335335
- `conda`: sets library location to the conda environment, and uses `src/Makefile` (used to be used in conda-forge recipe)
336336
- `azure`: Uses `src/Makefile`
337+
- `local`: Uses `src/Makefile_fedora38`
337338

338339
### cmake
339340
Now requires v3.19. If nvcc version is 11.0 or higher, sm8.6 with device link time optimisation will be used.

setup.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ def build_extension(self, ext):
7878
env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(env.get('CXXFLAGS', ''),
7979
self.distribution.get_version())
8080

81+
8182
if not os.path.exists(self.build_temp):
8283
os.makedirs(self.build_temp)
8384

@@ -88,6 +89,12 @@ def build_extension(self, ext):
8889
elif target == 'azure':
8990
subprocess.check_call(['make', 'python'], cwd=ext.sourcedir + '/src', env=env)
9091
subprocess.check_call(['make', 'install_python', 'PYTHON_LIB_PATH=' + extdir], cwd=ext.sourcedir + '/src', env=env)
92+
elif target == 'local':
93+
debug = "DEBUG="
94+
if cfg == 'Debug':
95+
debug = "DEBUG=1"
96+
subprocess.check_call(['make', '-f', 'Makefile_fedora38', 'python', debug], cwd=ext.sourcedir + '/src', env=env)
97+
subprocess.check_call(['make', '-f', 'Makefile_fedora38', 'install_python', 'PYTHON_LIB_PATH=' + extdir, debug], cwd=ext.sourcedir + '/src', env=env)
9198
else:
9299
subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env)
93100
subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)

src/Makefile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
CXXFLAGS+=-Wall -Wextra -std=c++14 -fopenmp -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -fPIC
1+
CXXFLAGS+=-Wall -Wextra -std=c++17 -fopenmp -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -fPIC
22
ifdef DEBUG
33
CXXFLAGS+= -O0 -g
44
CUDAFLAGS = -g -G
@@ -7,6 +7,7 @@ else ifdef PROFILE
77
CUDAFLAGS = -O2 -pg -lineinfo
88
else
99
CXXFLAGS+= -O3 -flto -fno-fat-lto-objects -fvisibility=hidden
10+
CUDAFLAGS = -O3
1011
endif
1112

1213
UNAME_S := $(shell uname -s)
@@ -29,7 +30,7 @@ LDFLAGS+= -L$(LIBLOC)/lib
2930
CUDA_LDLIBS=-lcudadevrt -lcudart_static $(LDLIBS)
3031

3132
CUDA_LDFLAGS =-L$(LIBLOC)/lib -L${CUDA_HOME}/targets/x86_64-linux/lib/stubs -L${CUDA_HOME}/targets/x86_64-linux/lib
32-
CUDAFLAGS +=-Xcompiler -fPIC --cudart static --relocatable-device-code=true --expt-relaxed-constexpr -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75
33+
CUDAFLAGS +=-std=c++17 -Xcompiler -fPIC --cudart static --relocatable-device-code=true --expt-relaxed-constexpr -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75
3334
ifdef GPU
3435
CXXFLAGS += -DGPU_AVAILABLE
3536
CUDAFLAGS += -gencode arch=compute_86,code=sm_86

src/Makefile_fedora38

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
CXX=gcc-11
2+
CC=gcc-11
3+
CFLAGS+=-Wall -Wextra -fPIC
4+
CXXFLAGS+=-Wall -Wextra -std=c++17 -fopenmp -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -fPIC
5+
ifdef DEBUG
6+
CXXFLAGS+= -O0 -g
7+
CUDAFLAGS = -g -G
8+
else ifdef PROFILE
9+
CXXFLAGS+= -O2 -g -flto -fno-fat-lto-objects -fvisibility=hidden
10+
CUDAFLAGS = -O2 -pg -lineinfo
11+
else
12+
CXXFLAGS+= -march=native -O3 -flto -fno-fat-lto-objects -fvisibility=hidden
13+
CFLAGS+= -march=native -O3 -flto -fno-fat-lto-objects -fvisibility=hidden
14+
CUDAFLAGS+= -O3
15+
endif
16+
17+
UNAME_S := $(shell uname -s)
18+
LIBLOC = ${CONDA_PREFIX}
19+
LDLIBS = -lz -lhdf5_cpp -lhdf5 -lopenblas -lgomp
20+
ifeq ($(UNAME_S),Linux)
21+
CXXFLAGS+= -m64
22+
ifdef PROFILE
23+
CXXFLAGS+= -Wl,--compress-debug-sections=none
24+
endif
25+
LDLIBS+= -lpthread -lgfortran -lm -ldl -lrt
26+
LDFLAGS=-Wl,-as-needed
27+
endif
28+
ifeq ($(UNAME_S),Darwin)
29+
LDLIBS+= -pthread
30+
endif
31+
32+
CPPFLAGS+=-I"/home/linuxbrew/.linuxbrew/include" -I"." -I"../vendor/highfive/include" -I$(LIBLOC)/include -I$(LIBLOC)/include/eigen3
33+
LDFLAGS+= -L$(LIBLOC)/lib -L"/home/linuxbrew/.linuxbrew/lib" -L/usr/local/cuda-12.3/lib64
34+
CUDA_LDLIBS=-lcudadevrt -lcudart_static $(LDLIBS)
35+
36+
CUDA_LDFLAGS =-L$(LIBLOC)/lib -L${CUDA_HOME}/targets/x86_64-linux/lib/stubs -L${CUDA_HOME}/targets/x86_64-linux/lib
37+
CUDAFLAGS +=-ccbin /home/linuxbrew/.linuxbrew/bin/g++-11 -std=c++17 -Xcompiler -fPIC --cudart static --relocatable-device-code=true --expt-relaxed-constexpr -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75
38+
ifdef GPU
39+
CXXFLAGS += -DGPU_AVAILABLE
40+
CUDAFLAGS += -gencode arch=compute_86,code=sm_86
41+
CUDA_LDFLAGS += -L/usr/local/cuda-12.3/lib64
42+
endif
43+
44+
PYTHON_LIB = pp_sketchlib$(shell python3-config --extension-suffix)
45+
46+
# python specific options
47+
python: CPPFLAGS += -DGPU_AVAILABLE -DPYTHON_EXT -DNDEBUG -Dpp_sketchlib_EXPORTS $(shell python3 -m pybind11 --includes)
48+
49+
PROGRAMS=sketch_test matrix_test read_test gpu_dist_test
50+
51+
SKETCH_OBJS=dist/dist.o dist/matrix_ops.o reference.o sketch/seqio.o sketch/sketch.o database/database.o sketch/countmin.o api.o dist/linear_regression.o random/rng.o random/random_match.o random/kmeans/KMeansRexCore.o random/kmeans/mersenneTwister2002.o
52+
GPU_SKETCH_OBJS=gpu/gpu_api.o
53+
CUDA_OBJS=gpu/dist.cu.o gpu/sketch.cu.o gpu/device_reads.cu.o gpu/gpu_countmin.cu.o gpu/device_memory.cu.o
54+
55+
# web specific options
56+
web: CXX = em++
57+
# optimised compile options
58+
# NB turn exceptions back on for testing
59+
# NB `--closure 1` can be used to reduce size of js file (this minifies variable names!)
60+
web: CXXFLAGS = -O3 -s ASSERTIONS=1 \
61+
-DNOEXCEPT \
62+
-DJSON_NOEXCEPTION \
63+
-s DISABLE_EXCEPTION_CATCHING=1 \
64+
-fno-exceptions \
65+
-flto --bind -s STRICT=1 \
66+
-s ALLOW_MEMORY_GROWTH=1 \
67+
-s USE_ZLIB=1 \
68+
-s MODULARIZE=1 \
69+
-s "EXPORTED_FUNCTIONS=['_malloc']" \
70+
-s 'EXPORTED_RUNTIME_METHODS=["FS"]' \
71+
-s EXPORT_NAME=WebSketch \
72+
-Wall -Wextra -std=c++14
73+
web: CPPFLAGS += -DWEB_SKETCH
74+
web: LDFLAGS = -lnodefs.js -lworkerfs.js
75+
76+
WEB_OUT=web/web_sketch
77+
WEB_OBJS=${WEB_OUT}.js ${WEB_OUT}.html ${WEB_OUT}.wasm
78+
79+
web: web/web_sketch.o sketch/seqio.o sketch/sketch.o sketch/countmin.o
80+
$(LINK.cpp) $^ -o ${WEB_OUT}.js
81+
sed -i.old '1s;^;\/* eslint-disable *\/;' ${WEB_OUT}.js
82+
83+
all: $(PROGRAMS)
84+
85+
clean:
86+
$(RM) $(SKETCH_OBJS) $(GPU_SKETCH_OBJS) $(CUDA_OBJS) $(WEB_OBJS) *.o *.so version.h ~* $(PROGRAMS)
87+
88+
install: all
89+
install -d $(BINDIR)
90+
install $(PROGRAMS) $(BINDIR)
91+
92+
sketch_test: $(SKETCH_OBJS) test/main.o
93+
$(LINK.cpp) $(CUDA_LDFLAGS) $(LDFLAGS) $^ -o $@ $(LDLIBS)
94+
95+
matrix_test: $(SKETCH_OBJS) test/matrix_test.o
96+
$(LINK.cpp) $^ -o $@ $(LDLIBS)
97+
98+
read_test: $(SKETCH_OBJS) $(GPU_SKETCH_OBJS) $(CUDA_OBJS) test/read_test.o
99+
nvcc $(CUDAFLAGS) $(CUDA_LDFLAGS) -Wno-deprecated-gpu-targets -shared -dlink $^ -o device_link.o -Xnvlink $(CUDA_LDLIBS)
100+
$(LINK.cpp) $(CUDA_LDFLAGS) $(LDFLAGS) $^ device_link.o -o $@ $(CUDA_LDLIBS)
101+
102+
gpu_dist_test: $(SKETCH_OBJS) $(GPU_SKETCH_OBJS) $(CUDA_OBJS) test/gpu_dist_test.o
103+
nvcc $(CUDAFLAGS) $(CUDA_LDFLAGS) -Wno-deprecated-gpu-targets -shared -dlink $^ -o device_link.o -Xnvlink $(CUDA_LDLIBS)
104+
$(LINK.cpp) $(CUDA_LDFLAGS) $(LDFLAGS) $^ device_link.o -o $@ $(CUDA_LDLIBS)
105+
106+
version.h:
107+
cat sketch/*.cpp sketch/*.hpp gpu/sketch.cu | openssl sha1 | awk '{print "#define SKETCH_VERSION \"" $$2 "\""}' > version.h
108+
109+
database/database.o: version.h
110+
111+
web/web_sketch.o: version.h
112+
113+
python: $(PYTHON_LIB)
114+
115+
$(PYTHON_LIB): $(SKETCH_OBJS) $(GPU_SKETCH_OBJS) $(CUDA_OBJS) sketchlib_bindings.o
116+
nvcc $(CUDAFLAGS) $(CUDA_LDFLAGS) -Wno-deprecated-gpu-targets -shared -dlink $^ -o device_link.o -Xnvlink $(CUDA_LDLIBS)
117+
$(LINK.cpp) $(CUDA_LDFLAGS) $(LDFLAGS) -shared $^ device_link.o -o $(PYTHON_LIB) $(CUDA_LDLIBS)
118+
119+
install_python: python
120+
install -d $(PYTHON_LIB_PATH)
121+
install $(PYTHON_LIB) $(PYTHON_LIB_PATH)
122+
123+
gpu/dist.cu.o:
124+
echo ${CUDAFLAGS}
125+
echo ${CPPFLAGS}
126+
echo ${CXXFLAGS}
127+
echo ${CFLAGS}
128+
nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/dist.cu -o $@
129+
130+
gpu/sketch.cu.o:
131+
nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/sketch.cu -o $@
132+
133+
gpu/device_memory.cu.o:
134+
nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/device_memory.cu -o $@
135+
136+
gpu/device_reads.cu.o:
137+
nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/device_reads.cu -o $@
138+
139+
gpu/gpu_countmin.cu.o:
140+
nvcc $(CUDAFLAGS) $(CPPFLAGS) -DGPU_AVAILABLE -x cu -c gpu/gpu_countmin.cu -o $@
141+
142+
.PHONY: all clean install python install_python web

src/api.cpp

Lines changed: 39 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include <algorithm>
88
#include <limits>
9+
#include <queue>
910

1011
#include <H5Cpp.h>
1112
#include <omp.h>
@@ -314,6 +315,20 @@ void check_sparse_inputs(const std::vector<Reference> &ref_sketches,
314315
}
315316
}
316317

318+
// Struct that allows sorting by dist but also keeping index
319+
struct SparseDist {
320+
float dist;
321+
long j;
322+
};
323+
bool operator<(SparseDist const &a, SparseDist const &b)
324+
{
325+
return a.dist < b.dist;
326+
}
327+
bool operator==(SparseDist const &a, SparseDist const &b)
328+
{
329+
return a.dist == b.dist;
330+
}
331+
317332
sparse_coo query_db_sparse(std::vector<Reference> &ref_sketches,
318333
const std::vector<size_t> &kmer_lengths,
319334
RandomMC &random_chance, const bool jaccard,
@@ -344,27 +359,35 @@ sparse_coo query_db_sparse(std::vector<Reference> &ref_sketches,
344359
Eigen::MatrixXf kmer_mat = kmer2mat(kmer_lengths);
345360
#pragma omp parallel for schedule(static) num_threads(num_threads) shared(progress)
346361
for (size_t i = 0; i < ref_sketches.size(); i++) {
347-
std::vector<float> row_dists(ref_sketches.size());
362+
// Use a priority queue to efficiently track the smallest N dists
363+
std::priority_queue<SparseDist> min_dists;
348364
if (!interrupt) {
349365
for (size_t j = 0; j < ref_sketches.size(); j++) {
366+
float row_dist = std::numeric_limits<float>::infinity();
350367
if (i != j) {
351368
if (jaccard) {
352369
// Need 1-J here to sort correctly
353-
row_dists[j] = 1.0f - ref_sketches[i].jaccard_dist(
370+
row_dist = 1.0f - ref_sketches[i].jaccard_dist(
354371
ref_sketches[j], kmer_lengths[dist_col], random_chance);
355372
} else {
356373
float core, acc;
357374
std::tie(core, acc) =
358375
ref_sketches[i].core_acc_dist<RandomMC>(
359376
ref_sketches[j], kmer_mat, random_chance);
360377
if (dist_col == 0) {
361-
row_dists[j] = core;
378+
row_dist = core;
362379
} else {
363-
row_dists[j] = acc;
380+
row_dist = acc;
364381
}
365382
}
366-
} else {
367-
row_dists[j] = std::numeric_limits<float>::infinity();
383+
}
384+
// Add dist if it is in the smallest k
385+
if (min_dists.size() < kNN || row_dist < min_dists.top().dist) {
386+
SparseDist new_min = {row_dist, j};
387+
min_dists.push(new_min);
388+
if (min_dists.size() > kNN) {
389+
min_dists.pop();
390+
}
368391
}
369392
if ((i * ref_sketches.size() + j) % update_every == 0) {
370393
#pragma omp critical
@@ -376,16 +399,17 @@ sparse_coo query_db_sparse(std::vector<Reference> &ref_sketches,
376399
}
377400
}
378401
}
379-
long offset = i * kNN;
380-
std::vector<long> ordered_dists = sort_indexes(row_dists);
381-
std::fill_n(i_vec.begin() + offset, kNN, i);
382-
// std::copy_n(ordered_dists.begin(), kNN, j_vec.begin() + offset);
383-
384-
for (int k = 0; k < kNN; ++k) {
385-
j_vec[offset + k] = ordered_dists[k];
386-
dists[offset + k] = row_dists[ordered_dists[k]];
387-
}
402+
}
388403

404+
// For each sample/row/i, fill the ijk vectors
405+
// This goes 'backwards' for compatibility with numpy (so dists are ascending)
406+
long offset = i * kNN;
407+
std::fill_n(i_vec.begin() + offset, kNN, i);
408+
for (int k = kNN - 1; k >= 0; --k) {
409+
SparseDist entry = min_dists.top();
410+
j_vec[offset + k] = entry.j;
411+
dists[offset + k] = entry.dist;
412+
min_dists.pop();
389413
}
390414
}
391415
}

src/database/database.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@
1212
#include "hdf5_funcs.hpp"
1313
#include "random/random_match.hpp"
1414

15-
#include "robin_hood.h"
16-
1715
// const int deflate_level = 9;
1816

1917
// Helper function prototypes
@@ -200,9 +198,9 @@ RandomMC Database::load_random(const bool use_rc_default) {
200198
HighFive::Group random_group = _h5_file.getGroup("/random");
201199

202200
// Flattened hashes
203-
robin_hood::unordered_node_map<std::string, uint16_t> cluster_table =
201+
ankerl::unordered_dense::map<std::string, uint16_t> cluster_table =
204202
load_hash<std::string, uint16_t>(random_group, "table");
205-
robin_hood::unordered_node_map<size_t, NumpyMatrix> matches =
203+
ankerl::unordered_dense::map<size_t, NumpyMatrix> matches =
206204
load_hash<size_t, NumpyMatrix>(random_group, "matches");
207205

208206
// Centroid matrix

src/database/database.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
#include <cstring>
1313
#include <vector>
1414
#include <string>
15-
#include "robin_hood.h"
15+
#include "unordered_dense.hpp"
1616

1717
#include <highfive/H5File.hpp>
1818

0 commit comments

Comments (0)