Skip to content

Commit 0752671

Browse files
authored
Merge pull request #37 from UM-Applied-Algorithms-Lab/version-0.2.1
Version 0.2.1
2 parents ad5ae99 + 5c3069b commit 0752671

File tree

6 files changed

+151
-184
lines changed

6 files changed

+151
-184
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
Generates an Fm-Index of a given biological sequence text (Fasta or Fastq file), and implements Locate() and Search() functionalities.
55

6-
AWRY is a port of a state-of-the-art, fastest in its class FM-index implementation (https://doi.org/10.1186/s13015-021-00204-6). AWRY supports parallelized searching, with parallel_count() and parallel_locate() functions.
6+
AWRY is a port of a state-of-the-art, fastest in its class FM-index implementation (https://doi.org/10.1186/s13015-021-00204-6). AWRY supports parallelized searching, with parallel_count() and parallel_locate() functions. As AWRY is powered by custom SIMD operations, the library currently supports x86_64 and aarch64 machines with AVX2 and ARM Neon.
77

88
AWRY supports DNA, RNA, and protein alphabets, and is able to search at lightning speed by leveraging SIMD vectorization and multithreading over collections of queries. These alphabets are case-insensitive- `A` and `a` are considered the same symbol under the hood.
99

src/bwt.rs

Lines changed: 40 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1+
2+
13
use crate::{
24
alphabet::{Symbol, SymbolAlphabet},
35
search::SearchPtr,
4-
simd_instructions::{Vec256, SimdVec256},
6+
simd_instructions::{masked_popcount, simd_and, simd_andnot, simd_or, Vec256},
57
};
68
use mem_dbg::MemSize;
79
use serde::{Deserialize, Serialize};
@@ -112,21 +114,21 @@ impl NucleotideBwtBlock {
112114
pub (crate) fn global_occurrence(&self, local_query_position: u64, symbol: &Symbol) -> u64 {
113115
unsafe{
114116
let milestone_count = self.milestone(&symbol);
115-
let vec0 = SimdVec256::from(*self.bit_vectors.get_unchecked(0));
116-
let vec1 = SimdVec256::from(*self.bit_vectors.get_unchecked(1));
117-
let vec2 = SimdVec256::from(*self.bit_vectors.get_unchecked(2));
117+
let vec0 = self.bit_vectors.get_unchecked(0).to_simd();
118+
let vec1 = self.bit_vectors.get_unchecked(1).to_simd();
119+
let vec2 = self.bit_vectors.get_unchecked(2).to_simd();
118120
let occurrence_vector = match &symbol.index() {
119-
1 => vec2.and(&vec1), //A: 0b110
120-
2 => vec2.and(&vec0), //C: 0b101
121-
3 => vec1.and(&vec0), //G: 0b011
122-
4 => vec2.andnot(&vec0.andnot(&vec1)), //N: 0b010
123-
5 => vec2.andnot(&vec1.andnot(&vec0)), //T: 0b001
121+
1 => simd_and(vec1, vec2), //A: 0b110
122+
2 => simd_and(vec0, vec2), //C: 0b101
123+
3 => simd_and(vec0, vec1), //G: 0b011
124+
4 => simd_andnot(vec2, simd_andnot(vec0, vec1)), //N: 0b010
125+
5 => simd_andnot(vec2, simd_andnot(vec1, vec0)), //T: 0b001
124126
_ => {
125127
panic!("illegal letter index given in global occurrence function symbol idx given: {}", symbol.index());
126128
} //assume every other character is an N, since it's illegal to search for a sentinel
127129
};
128130

129-
let popcount = occurrence_vector.masked_popcount(local_query_position);
131+
let popcount = masked_popcount(occurrence_vector, local_query_position);
130132

131133
return milestone_count + popcount as u64;
132134
}
@@ -225,44 +227,44 @@ impl AminoBwtBlock {
225227
/// The occurrence function uses the milestone value and the masked occurrenc vector to
226228
/// determine how many instances of the given character were before this position.
227229
#[inline]
228-
pub (crate) fn global_occurrence(&self, local_query_position: SearchPtr, symbol: &Symbol) -> SearchPtr {
230+
pub (crate) fn global_occurrence(&self, local_query_position: SearchPtr, symbol: &Symbol) -> SearchPtr {
229231
let milestone_count = self.milestone(symbol);
230232

231233
unsafe{
232-
let vec0 = SimdVec256::from(*self.bit_vectors.get_unchecked(0));
233-
let vec1 = SimdVec256::from(*self.bit_vectors.get_unchecked(1));
234-
let vec2 = SimdVec256::from(*self.bit_vectors.get_unchecked(2));
235-
let vec3 = SimdVec256::from(*self.bit_vectors.get_unchecked(3));
236-
let vec4 = SimdVec256::from(*self.bit_vectors.get_unchecked(4));
234+
let vec0 = self.bit_vectors.get_unchecked(0).to_simd();
235+
let vec1 = self.bit_vectors.get_unchecked(1).to_simd();
236+
let vec2 = self.bit_vectors.get_unchecked(2).to_simd();
237+
let vec3 = self.bit_vectors.get_unchecked(3).to_simd();
238+
let vec4 = self.bit_vectors.get_unchecked(4).to_simd();
237239
let occurrence_vector = match symbol.index() {
238-
1 => vec3.and(&vec4.andnot(&vec2)), //A: 0b01100
239-
2 => vec3.andnot(&vec2).and(&vec1.and(&vec0)), //C: 0b10111
240-
3 => vec1.and(&vec4.andnot(&vec0)), //D: 0b00011
241-
4 => vec4.andnot(&vec2.and(&vec1)), //E: 0b00110
242-
5 => vec0.andnot(&vec3).and(&vec2.and(&vec1)), //F: 0b11110
243-
6 => vec2.andnot(&vec0.andnot(&vec4)), //G: 0b11010
244-
7 => vec2.andnot(&vec3).and(&vec1.and(&vec0)), //H: 0b11011
245-
8 => vec2.andnot(&vec1.andnot(&vec4)), //I: 0b11001
246-
9 => vec3.andnot(&vec1.andnot(&vec4)), //K: 0b10101
247-
10 => vec1.andnot(&vec0.andnot(&vec4)), //L: 0b11100
248-
11 => vec1.andnot(&vec3).and(&vec2.and(&vec0)), //M: 0b11101
249-
12 => vec0.or(&vec1).andnot(&vec2.andnot(&vec3)), //N: 0b01000
250-
13 => vec3.and(&vec4.andnot(&vec0)), //P: 0b01001,
251-
14 => vec3.or(&vec1).andnot(&vec0.andnot(&vec2)), //Q: 0b00100
252-
15 => vec3.andnot(&vec2.andnot(&vec4)), //R: 0b10011
253-
16 => vec3.and(&vec4.andnot(&vec1)), //S: 0b01010
254-
17 => vec2.and(&vec4.andnot(&vec0)), //T: 0b00101
255-
18 => vec3.andnot(&vec0.andnot(&vec4)), //V: 0b10110
256-
19 => vec3.or(&vec2).andnot(&vec1.andnot(&vec0)), //W: 0b00001
257-
20 => vec3.and(&vec2).and(&vec1.and(&vec0)), //Ambiguity character X: 0b11111
258-
21 => vec0.or(&vec2).andnot(&vec3.andnot(&vec1)), //Y: 0b00010
240+
1 => simd_and(vec2,simd_andnot(vec4, vec3)), //A: 0b01100
241+
2 => simd_andnot(vec3, simd_and(simd_and(vec0, vec1), vec2)), //C: 0b10111
242+
3 => simd_andnot(vec4, simd_and(vec0, vec1)), //D: 0b00011
243+
4 => simd_andnot(vec4, simd_and(vec1, vec2)), //E: 0b00110
244+
5 => simd_andnot(vec0, simd_and(simd_and(vec1, vec2), vec3)), //F: 0b11110
245+
6 => simd_andnot(vec2, simd_andnot(vec0, vec4)), //G: 0b11010
246+
7 => simd_andnot(vec2, simd_and(vec0, simd_and(vec1, vec3))), //H: 0b11011
247+
8 => simd_andnot(vec2, simd_andnot(vec1, vec4)), //I: 0b11001
248+
9 => simd_andnot(vec1, simd_andnot(vec3, vec4)), //K: 0b10101
249+
10 => simd_andnot(vec1, simd_andnot(vec0, vec4)), //L: 0b11100
250+
11 => simd_andnot(vec1, simd_and(vec3, simd_and(vec2, vec0))), //M: 0b11101
251+
12 => simd_andnot(simd_or(vec0, vec1), simd_andnot(vec2, vec3)), //N: 0b01000
252+
13 => simd_and(vec3, simd_andnot(vec4, vec0)), //P: 0b01001,
253+
14 => simd_andnot(simd_or(vec0, vec1), simd_andnot(vec3, vec2)),//Q: 0b00100
254+
15 => simd_andnot(vec2, simd_andnot(vec3, vec4)), //R: 0b10011
255+
16 => simd_and(vec1, simd_andnot(vec4, vec3)), //S: 0b01010
256+
17 => simd_and(vec0, simd_andnot(vec4, vec2)), //T: 0b00101
257+
18 => simd_andnot(vec3, simd_andnot(vec0, vec4)), //V: 0b10110
258+
19 => simd_andnot(simd_or(vec1, vec2), simd_andnot(vec3, vec0)), //W: 0b00001
259+
20 => simd_and(simd_and(vec0, vec1), simd_and(vec2, vec3)), //Ambiguity character X: 0b11111
260+
21 => simd_andnot(simd_or(vec0, vec2), simd_andnot(vec3, vec1)), //Y: 0b00010
259261
// 0b00000 is sentinel, but since you can't search for sentinels, it is not included here.
260262
_ => {
261263
panic!("illegal letter index given in global occurrence function");
262264
}
263265
};
264266

265-
let popcount = occurrence_vector.masked_popcount(local_query_position);
267+
let popcount = masked_popcount(occurrence_vector, local_query_position);
266268

267269
return milestone_count + popcount as u64;
268270
}

src/fm_index.rs

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,9 @@ impl FmIndex {
390390
let mut search_range =
391391
SearchRange::new(self, Symbol::new_index(alphabet, final_query_index));
392392
for query_char in query.chars().rev().skip(1) {
393-
if !search_range.is_empty() {
393+
if search_range.is_empty() {
394+
break;
395+
} else {
394396
let query_symbol = Symbol::new_ascii(alphabet, query_char);
395397
search_range = self.update_range_with_symbol(search_range, query_symbol);
396398
}
@@ -404,10 +406,14 @@ impl FmIndex {
404406
.rev()
405407
.skip(self.kmer_lookup_table.kmer_len() as usize)
406408
{
407-
search_range = self.update_range_with_symbol(
408-
search_range,
409-
Symbol::new_ascii(self.alphabet(), query_char),
410-
);
409+
if search_range.is_empty() {
410+
break;
411+
} else {
412+
search_range = self.update_range_with_symbol(
413+
search_range,
414+
Symbol::new_ascii(self.alphabet(), query_char),
415+
);
416+
}
411417
}
412418

413419
return search_range;
@@ -429,9 +435,7 @@ impl FmIndex {
429435
/// println!("count: {}", count);
430436
/// }
431437
/// ```
432-
pub fn parallel_count<'a>(&self, queries: impl ParallelIterator<Item= &'a str>) -> Vec<u64>
433-
434-
{
438+
pub fn parallel_count<'a>(&self, queries: impl ParallelIterator<Item = &'a str>) -> Vec<u64> {
435439
queries
436440
.into_par_iter()
437441
.map(|query| self.count_string(&query))
@@ -457,9 +461,8 @@ impl FmIndex {
457461
/// ```
458462
pub fn parallel_locate<'a>(
459463
&self,
460-
queries: impl ParallelIterator<Item= &'a str>,
461-
) -> Vec<Vec<LocalizedSequencePosition>>
462-
{
464+
queries: impl ParallelIterator<Item = &'a str>,
465+
) -> Vec<Vec<LocalizedSequencePosition>> {
463466
queries
464467
.into_par_iter()
465468
.map(|query| self.locate_string(query))

src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
12
pub mod alphabet;
23
pub mod bwt;
34
pub mod compressed_suffix_array;

src/search.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ impl SearchRange {
6060
}
6161

6262
///gets the number of elements represented by the search range
63+
#[inline]
6364
pub (crate) fn len(&self) -> SearchPtr {
6465
match self.is_empty() {
6566
true => 0,
@@ -68,6 +69,7 @@ impl SearchRange {
6869
}
6970

7071
/// Returns an interator over the BWT positions corresponding to this search range
72+
#[inline]
7173
pub (crate) fn range_iter(&self) -> core::ops::Range<SearchPtr> {
7274
match self.is_empty() {
7375
true => 0..0,

0 commit comments

Comments
 (0)