Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,14 @@ set(MMSEQS_CXX_FLAGS "-fsigned-char")

# SIMD instruction sets support
set(MMSEQS_ARCH "")
if (HAVE_AVX2)
if (HAVE_AVX512)
if (CMAKE_COMPILER_IS_CLANG)
set(MMSEQS_ARCH "${MMSEQS_ARCH} -mavx512bw -mcx16")
else ()
set(MMSEQS_ARCH "${MMSEQS_ARCH} -mavx512bw -mcx16 -Wa,-q")
endif ()
set(X64 1 CACHE INTERNAL "")
elseif (HAVE_AVX2)
if (CMAKE_COMPILER_IS_CLANG)
set(MMSEQS_ARCH "${MMSEQS_ARCH} -mavx2 -mcx16")
else ()
Expand Down
8 changes: 6 additions & 2 deletions lib/simd/simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,11 @@
//#define AVX512
//#endif

#if defined(AVX512) || defined(SIMDE_X86_AVX2_NATIVE)
#if defined(SIMDE_X86_AVX512BW_NATIVE)
#define AVX512BW
#endif

#if defined(SIMDE_X86_AVX512BW_NATIVE) || defined(AVX512) || defined(SIMDE_X86_AVX2_NATIVE)
#define AVX2
#endif

Expand Down Expand Up @@ -694,4 +698,4 @@ inline float ScalarProd20(const float* qi, const float* tj) {
// + tj[18] * qi[18] + tj[19] * qi[19];
}

#endif //SIMD_H
#endif //SIMD_H
192 changes: 143 additions & 49 deletions src/alignment/StripedSmithWaterman.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@
/*
Written by Michael Farrar, 2006 (alignment), Mengyao Zhao (SSW Library) and Martin Steinegger (change structure add aa composition, profile and AVX2 support).
Please send bug reports and/or suggestions to [email protected].

AVX512BW support
Modified Copyright (C) 2025 Intel Corporation
Contacts: Ghanshyam Chandra <[email protected]>
*/
#include "Parameters.h"
#include "StripedSmithWaterman.h"
Expand Down Expand Up @@ -56,6 +60,9 @@ SmithWaterman::SmithWaterman(size_t maxSequenceLength, int aaSize, bool aaBiasCo
profile = new s_profile();
// query profile
profile->profile_byte = (simd_int*)mem_align(ALIGN_INT, aaSize * segSize * sizeof(simd_int));
#ifdef AVX512BW
profile->profile_byteBW = (__m512i*)mem_align(AVX512_ALIGN_INT, aaSize * segSize * sizeof(__m512i));
#endif
profile->profile_word = (simd_int*)mem_align(ALIGN_INT, aaSize * segSize * sizeof(simd_int));
profile->profile_rev_byte = (simd_int*)mem_align(ALIGN_INT, aaSize * segSize * sizeof(simd_int));
profile->profile_rev_word = (simd_int*)mem_align(ALIGN_INT, aaSize * segSize * sizeof(simd_int));
Expand Down Expand Up @@ -115,6 +122,9 @@ SmithWaterman::~SmithWaterman(){
free(vHmax);
free(target_profile_byte);
free(profile->profile_byte);
#ifdef AVX512BW
free(profile->profile_byteBW);
#endif
free(profile->profile_word);
free(profile->profile_rev_byte);
free(profile->profile_rev_word);
Expand Down Expand Up @@ -157,6 +167,37 @@ SmithWaterman::~SmithWaterman(){
delete profile;
}

#ifdef AVX512BW
/* Generate query profile rearrange query sequence & calculate the weight of match/mismatch. */
template <typename T, size_t Elements, const unsigned int type>
void SmithWaterman::createQueryProfile_AVX512BW(__m512i *profile, const int8_t *query_sequence, const int8_t * composition_bias, const int8_t *mat,
const int32_t query_length, const int32_t aaSize, uint8_t bias, const int32_t offset, const int32_t entryLength)
{
const int32_t segLen = (query_length + Elements - 1) / Elements;
T* t = (T*) profile;
for (int32_t nt = 0; LIKELY(nt < aaSize); nt++) {
for (int32_t i = 0; i < segLen; i++) {
int32_t j = i;
for (size_t segNum = 0; LIKELY(segNum < Elements) ; segNum++) {
// if will be optmized out by compiler
if(type == SUBSTITUTIONMATRIX) { // substitution score for query_seq constrained by nt
// query_sequence starts from 1 to n
*t++ = ( j >= query_length) ? bias : mat[nt * aaSize + query_sequence[j + offset ]] + composition_bias[j + offset] + bias; // mat[nt][q[j]] mat eq 20*20
} if(type == PROFILE) {
// profile starts by 0
// *t++ = (j >= query_length) ? bias : (mat[nt * entryLength + (j + (offset - 1))] + bias); //mat eq L*20 // mat[nt][j]
*t++ = (j >= query_length) ? bias : mat[nt * entryLength + j + offset] + bias;
// // profile starts by 0 // TODO: offset?
// *t++ = (j >= query_length) ? bias : mat[nt * entryLength + j + offset] + bias; //mat eq L*20 // mat[nt][j]
// printf("(%1d, %1d) ", j , *(t-1));
}
j += segLen;
}
}
}
}
#endif


/* Generate query profile rearrange query sequence & calculate the weight of match/mismatch. */
template <typename T, size_t Elements, const unsigned int type>
Expand Down Expand Up @@ -214,6 +255,21 @@ void SmithWaterman::createTargetProfile(simd_int *profile, const int8_t *mat, co
}
}
}


#ifdef AVX512BW
template <typename T, size_t Elements>
void SmithWaterman::updateQueryProfile_AVX512BW(__m512i *profile, const int32_t query_length, const int32_t aaSize,
uint8_t shift) {
const int32_t segLen = (query_length + Elements - 1) / Elements;
T* t = (T*) profile;
for (uint32_t i = 0; i < segLen * Elements * aaSize; i++) {
t[i] += shift;
}
}
#endif


template <typename T, size_t Elements>
void SmithWaterman::updateQueryProfile(simd_int *profile, const int32_t query_length, const int32_t aaSize,
uint8_t shift) {
Expand Down Expand Up @@ -347,6 +403,9 @@ s_align SmithWaterman::ssw_align_private (
if (db_bias > profile->bias) {
uint8_t shift = abs(profile->bias - db_bias);
updateQueryProfile<int8_t, VECSIZE_INT * 4>(profile->profile_byte, profile->query_length, profile->alphabetSize, shift);
#ifdef AVX512BW
updateQueryProfile_AVX512BW<int8_t, AVX512_VECSIZE_INT * 4>(profile->profile_byteBW, profile->query_length, profile->alphabetSize, shift);
#endif
}
profile->bias = std::max(db_bias, profile->bias);
createTargetProfile(db_profile_byte, db_mat, db_length, profile->alphabetSize - 1, profile->bias);
Expand Down Expand Up @@ -1307,6 +1366,10 @@ void SmithWaterman::ssw_init(const Sequence* q,
// create byte version of profiles
createQueryProfile<int8_t, VECSIZE_INT * 4, PROFILE>(profile->profile_byte, profile->query_sequence, NULL,
profile->mat, q->L, alphabetSize, profile->bias, 0, q->L);
#ifdef AVX512BW
createQueryProfile_AVX512BW<int8_t, AVX512_VECSIZE_INT * 4, PROFILE>(profile->profile_byteBW, profile->query_sequence, NULL,
profile->mat, q->L, alphabetSize, profile->bias, 0, q->L);
#endif
createConsensProfile<int8_t, VECSIZE_INT * 4>(profile->consens_byte, profile->query_consens_sequence, q->L, 0);
#ifdef GAP_POS_SCORING
createGapProfile<int8_t, VECSIZE_INT * 4>(profile->profile_gDelOpen_byte, profile->profile_gDelClose_byte,
Expand All @@ -1330,7 +1393,10 @@ void SmithWaterman::ssw_init(const Sequence* q,
} else {
// create byte version of query profile
createQueryProfile<int8_t, VECSIZE_INT * 4, SUBSTITUTIONMATRIX>(profile->profile_byte, profile->query_sequence, profile->composition_bias, profile->mat, q->L, alphabetSize, bias, 0, 0);
// create word version of query profile
#ifdef AVX512BW
createQueryProfile_AVX512BW<int8_t, AVX512_VECSIZE_INT * 4, SUBSTITUTIONMATRIX>(profile->profile_byteBW, profile->query_sequence, profile->composition_bias, profile->mat, q->L, alphabetSize, bias, 0, 0);
#endif
// create word version of query profile
createQueryProfile<int16_t, VECSIZE_INT * 2, SUBSTITUTIONMATRIX>(profile->profile_word, profile->query_sequence, profile->composition_bias, profile->mat, q->L, alphabetSize, 0, 0, 0);
// create linear version of word profile
for (int32_t i = 0; i< alphabetSize; i++) {
Expand Down Expand Up @@ -1735,63 +1801,91 @@ inline F simd_hmax(const F * in, unsigned int n) {
return current;
}

#ifdef AVX512BW
template<int N>
__m512i shift_right(__m512i a, __m512i carry = _mm512_setzero_si512()) {
static_assert(0 <= N && N <= 64);
if constexpr (N == 0) return a;
if constexpr (N == 64) return carry;
if constexpr (N % 4 == 0) return _mm512_alignr_epi32(carry, a, N / 4);
else {
__m512i a0 = shift_right<(N / 16 + 1) * 16>(a, carry);
__m512i a1 = shift_right<(N / 16) * 16>(a, carry);
return _mm512_alignr_epi8(a0, a1, N % 16);
}
}
template<int N>
__m512i shift_right_AVX512(__m512i a, __m512i carry = _mm512_setzero_si512()) {
return shift_right<64 - N>(carry, a);
}
#endif

int SmithWaterman::ungapped_alignment(const unsigned char *db_sequence, int32_t db_length) {
#define SWAP(tmp, arg1, arg2) tmp = arg1; arg1 = arg2; arg2 = tmp;

int i; // position in query bands (0,..,W-1)
int j; // position in db sequence (0,..,dbseq_length-1)
int element_count = (VECSIZE_INT * 4);
const int W = (profile->query_length + (element_count - 1)) /
element_count; // width of bands in query and score matrix = hochgerundetes LQ/16

simd_int *p;
simd_int S; // 16 unsigned bytes holding S(b*W+i,j) (b=0,..,15)
simd_int Smax = simdi_setzero();
simd_int Soffset; // all scores in query profile are shifted up by Soffset to obtain pos values
simd_int *s_prev, *s_curr; // pointers to Score(i-1,j-1) and Score(i,j), resp.
simd_int *qji; // query profile score in row j (for residue x_j)
simd_int *s_prev_it, *s_curr_it;
simd_int *query_profile_it = (simd_int *) profile->profile_byte;

// Load the score offset to all 16 unsigned byte elements of Soffset
Soffset = simdi8_set(profile->bias);
s_curr = vHStore;
s_prev = vHLoad;

memset(vHStore, 0, W * sizeof(simd_int));
memset(vHLoad, 0, W * sizeof(simd_int));

for (j = 0; j < db_length; ++j) // loop over db sequence positions
{

// Get address of query scores for row j
// AVX512BW
#ifdef AVX512BW
int element_count = AVX512_VECSIZE_INT * 4;
const int W = (profile->query_length + (element_count - 1)) / element_count;
__m512i S, Smax = _mm512_setzero_si512();
__m512i *s_prev = (__m512i*) mem_align(AVX512_ALIGN_INT, segSize * sizeof(__m512i));
__m512i *s_curr = (__m512i*) mem_align(AVX512_ALIGN_INT, segSize * sizeof(__m512i));
memset(s_prev, 0, W * sizeof(__m512i));
memset(s_curr, 0, W * sizeof(__m512i));
__m512i *qji, *s_prev_it, *s_curr_it;
__m512i *query_profile_it = (__m512i*)profile->profile_byteBW;
__m512i Soffset = _mm512_set1_epi8(profile->bias);
#else // AVX2 or SSE2
int element_count = VECSIZE_INT * 4;
const int W = (profile->query_length + (element_count - 1)) / element_count;
simd_int S, Smax = simdi_setzero();
simd_int *s_prev = vHLoad, *s_curr = vHStore;
memset(vHLoad, 0, W * sizeof(simd_int));
memset(vHStore, 0, W * sizeof(simd_int));
simd_int *qji, *s_prev_it, *s_curr_it;
simd_int *query_profile_it = (simd_int*)profile->profile_byte;
simd_int Soffset = simdi8_set(profile->bias);
#endif

// main Smith-Waterman loop
for (int j = 0; j < db_length; ++j) {
qji = query_profile_it + db_sequence[j] * W;

// Load the next S value
S = simdi_load(s_curr + W - 1);
S = simdi8_shiftl(S, 1);
#ifdef AVX512BW
S = _mm512_load_si512(s_curr + W - 1);
S = shift_right_AVX512<1>(S);
#else
S = simdi_load(s_curr + W - 1);
S = simdi8_shiftl(S, 1);
#endif

// Swap s_prev and s_curr, smax_prev and smax_curr
SWAP(p, s_prev, s_curr);
// swap the buffers
std::swap(s_prev, s_curr);

s_curr_it = s_curr;
s_prev_it = s_prev;

for (i = 0; i < W; ++i) // loop over query band positions
{
// Saturated addition and subtraction to score S(i,j)
S = simdui8_adds(S, *(qji++)); // S(i,j) = S(i-1,j-1) + (q(i,x_j) + Soffset)
S = simdui8_subs(S, Soffset); // S(i,j) = max(0, S(i,j) - Soffset)
simdi_store(s_curr_it++, S); // store S to s_curr[i]
Smax = simdui8_max(Smax, S); // Smax(i,j) = max(Smax(i,j), S(i,j))

// Load the next S and Smax values
S = simdi_load(s_prev_it++);
for (int i = 0; i < W; ++i) {
#ifdef AVX512BW
S = _mm512_adds_epu8(S, *qji++);
S = _mm512_subs_epu8(S, Soffset);
_mm512_store_si512(s_curr_it++, S);
Smax = _mm512_max_epu8(Smax, S);
S = _mm512_load_si512(s_prev_it++);
#else
S = simdui8_adds(S, *qji++);
S = simdui8_subs(S, Soffset);
simdi_store(s_curr_it++, S);
Smax = simdui8_max(Smax, S);
S = simdi_load(s_prev_it++);
#endif
}
}
int score = simd_hmax((unsigned char *) &Smax, element_count);

/* return largest score */
int score = simd_hmax((unsigned char*)&Smax, element_count);


#ifdef AVX512BW
free(s_curr); free(s_prev);
#endif

return score;
#undef SWAP
}
}
19 changes: 18 additions & 1 deletion src/alignment/StripedSmithWaterman.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ class SmithWaterman{
// TODO: private or public?
struct s_profile{
simd_int* profile_byte; // 0: none
#ifdef AVX512BW
__m512i* profile_byteBW; // 0: none
#endif
simd_int* profile_word; // 0: none
simd_int* profile_rev_byte; // 0: none
simd_int* profile_rev_word; // 0: none
Expand Down Expand Up @@ -279,6 +282,10 @@ class SmithWaterman{

simd_int* vHStore;
simd_int* vHLoad;
#ifdef AVX512BW
__m512i* vHStoreBW;
__m512i* vHLoadBW;
#endif
simd_int* vE;
simd_int* vHmax;
uint8_t * maxColumn;
Expand Down Expand Up @@ -387,6 +394,11 @@ class SmithWaterman{
size_t query_id;
size_t target_id;

#ifdef AVX512BW
template <typename T, size_t Elements, const unsigned int type>
void createQueryProfile_AVX512BW(__m512i *profile, const int8_t *query_sequence, const int8_t * composition_bias, const int8_t *mat,
const int32_t query_length, const int32_t aaSize, uint8_t bias, const int32_t offset, const int32_t entryLength);
#endif

template <typename T, size_t Elements, const unsigned int type>
void createQueryProfile(simd_int *profile, const int8_t *query_sequence, const int8_t * composition_bias,
Expand All @@ -403,11 +415,16 @@ class SmithWaterman{

void createTargetProfile(simd_int *profile, const int8_t *mat, const int target_length, const int32_t aaSize, uint8_t bias);

#ifdef AVX512BW
template <typename T, size_t Elements>
void updateQueryProfile_AVX512BW(__m512i *profile, const int32_t query_length, const int32_t aaSize, uint8_t shift);
#endif

template <typename T, size_t Elements>
void updateQueryProfile(simd_int *profile, const int32_t query_length, const int32_t aaSize, uint8_t shift);

uint8_t computeBias(const int32_t target_length, const int8_t *mat, const int32_t aaSize);

void reverseMat(int8_t *rev_mat, const int8_t *mat, const int32_t aaSize, const int32_t target_length);
};
#endif /* SMITH_WATERMAN_SSE2_H */
#endif /* SMITH_WATERMAN_SSE2_H */