|  | 
| 4 | 4 |  *  @author     Ash Vardanian | 
| 5 | 5 |  *  @date       October 16, 2024 | 
| 6 | 6 |  * | 
| 7 |  | - *  Contains following element-wise operations: | 
|  | 7 | + *  Contains the following @b Unary/Binary/Ternary element-wise operations: | 
| 8 | 8 |  *  - Scale (Multiply) with Shift: R[i] = Alpha * A[i] + Beta | 
| 9 | 9 |  *  - Sum (Add): R[i] = A[i] + B[i] | 
| 10 | 10 |  *  - WSum or Weighted-Sum: R[i] = Alpha * A[i] + Beta * B[i] | 
| @@ -1211,6 +1211,110 @@ SIMSIMD_PUBLIC void simsimd_fma_u8_haswell( | 
| 1211 | 1211 |     } | 
| 1212 | 1212 | } | 
| 1213 | 1213 | 
 | 
|  | 1214 | +SIMSIMD_PUBLIC void simsimd_sum_i16_haswell(simsimd_i16_t const *a, simsimd_i16_t const *b, simsimd_size_t n, | 
|  | 1215 | +                                            simsimd_i16_t *result) { | 
|  | 1216 | +    // The main loop: | 
|  | 1217 | +    simsimd_size_t i = 0; | 
|  | 1218 | +    for (; i + 16 <= n; i += 16) { | 
|  | 1219 | +        __m256i a_vec = _mm256_lddqu_si256((__m256i *)(a + i)); | 
|  | 1220 | +        __m256i b_vec = _mm256_lddqu_si256((__m256i *)(b + i)); | 
|  | 1221 | +        __m256i sum_vec = _mm256_adds_epi16(a_vec, b_vec); | 
|  | 1222 | +        _mm256_storeu_si256((__m256i *)(result + i), sum_vec); | 
|  | 1223 | +    } | 
|  | 1224 | + | 
|  | 1225 | +    // The tail: | 
|  | 1226 | +    for (; i < n; ++i) { | 
|  | 1227 | +        simsimd_i64_t ai = a[i], bi = b[i]; | 
|  | 1228 | +        simsimd_i64_t sum = ai + bi; | 
|  | 1229 | +        _simsimd_i64_to_i16(&sum, result + i); | 
|  | 1230 | +    } | 
|  | 1231 | +} | 
|  | 1232 | + | 
|  | 1233 | +SIMSIMD_PUBLIC void simsimd_scale_i16_haswell(simsimd_i16_t const *a, simsimd_size_t n, simsimd_distance_t alpha, | 
|  | 1234 | +                                              simsimd_distance_t beta, simsimd_i16_t *result) { | 
|  | 1235 | + | 
|  | 1236 | +    simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha; | 
|  | 1237 | +    simsimd_f32_t beta_f32 = (simsimd_f32_t)beta; | 
|  | 1238 | +    __m256 alpha_vec = _mm256_set1_ps(alpha_f32); | 
|  | 1239 | +    __m256 beta_vec = _mm256_set1_ps(beta_f32); | 
|  | 1240 | + | 
|  | 1241 | +    // The main loop: | 
|  | 1242 | +    simsimd_size_t i = 0; | 
|  | 1243 | +    for (; i + 8 <= n; i += 8) { | 
|  | 1244 | +        __m256 a_vec = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i *)(a + i)))); | 
|  | 1245 | +        __m256 b_vec = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i *)(a + i)))); | 
|  | 1246 | +        __m256 sum_vec = _mm256_fmadd_ps(a_vec, alpha_vec, beta_vec); | 
|  | 1247 | +        __m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec); | 
|  | 1248 | +        sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(-32768)); | 
|  | 1249 | +        sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(32767)); | 
|  | 1250 | +        __m128i sum_i16_vec = | 
|  | 1251 | +            _mm_packs_epi32(_mm256_castsi256_si128(sum_i32_vec), _mm256_extracti128_si256(sum_i32_vec, 1)); | 
|  | 1252 | +        _mm_storeu_si128((__m128i *)(result + i), sum_i16_vec); | 
|  | 1253 | +    } | 
|  | 1254 | + | 
|  | 1255 | +    // The tail: | 
|  | 1256 | +    for (; i < n; ++i) { | 
|  | 1257 | +        simsimd_f32_t ai = a[i]; | 
|  | 1258 | +        simsimd_f32_t sum = alpha_f32 * ai + beta_f32; | 
|  | 1259 | +        _simsimd_f32_to_i16(&sum, result + i); | 
|  | 1260 | +    } | 
|  | 1261 | +} | 
|  | 1262 | + | 
|  | 1263 | +SIMSIMD_PUBLIC void simsimd_fma_i16_haswell(                                                  // | 
|  | 1264 | +    simsimd_i16_t const *a, simsimd_i16_t const *b, simsimd_i16_t const *c, simsimd_size_t n, // | 
|  | 1265 | +    simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i16_t *result) { | 
|  | 1266 | +#if 0 | 
|  | 1267 | +    simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha; | 
|  | 1268 | +    simsimd_f32_t beta_f32 = (simsimd_f32_t)beta; | 
|  | 1269 | +    __m256 alpha_vec = _mm256_set1_ps(alpha_f32); | 
|  | 1270 | +    __m256 beta_vec = _mm256_set1_ps(beta_f32); | 
|  | 1271 | +    int sum_i32s[8], a_i32s[8], b_i32s[8], c_i32s[8]; | 
|  | 1272 | + | 
|  | 1273 | +    // The main loop: | 
|  | 1274 | +    simsimd_size_t i = 0; | 
|  | 1275 | +    for (; i + 8 <= n; i += 8) { | 
|  | 1276 | +        //? Handling loads and stores with SIMD is tricky. Not because of upcasting, but the | 
|  | 1277 | +        //? downcasting at the end of the loop. In AVX2 it's a drag! Keep it for another day. | 
|  | 1278 | +        a_i32s[0] = a[i + 0], a_i32s[1] = a[i + 1], a_i32s[2] = a[i + 2], a_i32s[3] = a[i + 3], // | 
|  | 1279 | +            a_i32s[4] = a[i + 4], a_i32s[5] = a[i + 5], a_i32s[6] = a[i + 6], a_i32s[7] = a[i + 7]; | 
|  | 1280 | +        b_i32s[0] = b[i + 0], b_i32s[1] = b[i + 1], b_i32s[2] = b[i + 2], b_i32s[3] = b[i + 3], // | 
|  | 1281 | +            b_i32s[4] = b[i + 4], b_i32s[5] = b[i + 5], b_i32s[6] = b[i + 6], b_i32s[7] = b[i + 7]; | 
|  | 1282 | +        c_i32s[0] = c[i + 0], c_i32s[1] = c[i + 1], c_i32s[2] = c[i + 2], c_i32s[3] = c[i + 3], // | 
|  | 1283 | +            c_i32s[4] = c[i + 4], c_i32s[5] = c[i + 5], c_i32s[6] = c[i + 6], c_i32s[7] = c[i + 7]; | 
|  | 1284 | +        //! This can be done at least 50% faster if we convert 8-bit integers to floats instead | 
|  | 1285 | +        //! of relying on the slow `_mm256_cvtepi32_ps` instruction. | 
|  | 1286 | +        __m256 a_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)a_i32s)); | 
|  | 1287 | +        __m256 b_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)b_i32s)); | 
|  | 1288 | +        __m256 c_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)c_i32s)); | 
|  | 1289 | +        // The normal part. | 
|  | 1290 | +        __m256 ab_vec = _mm256_mul_ps(a_vec, b_vec); | 
|  | 1291 | +        __m256 ab_scaled_vec = _mm256_mul_ps(ab_vec, alpha_vec); | 
|  | 1292 | +        __m256 sum_vec = _mm256_fmadd_ps(c_vec, beta_vec, ab_scaled_vec); | 
|  | 1293 | +        // Instead of serial calls to expensive `_simsimd_f32_to_u8`, convert and clip with SIMD. | 
|  | 1294 | +        __m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec); | 
|  | 1295 | +        sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(-128)); | 
|  | 1296 | +        sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(127)); | 
|  | 1297 | +        // Export into a serial buffer. | 
|  | 1298 | +        _mm256_storeu_si256((__m256i *)sum_i32s, sum_i32_vec); | 
|  | 1299 | +        result[i + 0] = (simsimd_i16_t)sum_i32s[0]; | 
|  | 1300 | +        result[i + 1] = (simsimd_i16_t)sum_i32s[1]; | 
|  | 1301 | +        result[i + 2] = (simsimd_i16_t)sum_i32s[2]; | 
|  | 1302 | +        result[i + 3] = (simsimd_i16_t)sum_i32s[3]; | 
|  | 1303 | +        result[i + 4] = (simsimd_i16_t)sum_i32s[4]; | 
|  | 1304 | +        result[i + 5] = (simsimd_i16_t)sum_i32s[5]; | 
|  | 1305 | +        result[i + 6] = (simsimd_i16_t)sum_i32s[6]; | 
|  | 1306 | +        result[i + 7] = (simsimd_i16_t)sum_i32s[7]; | 
|  | 1307 | +    } | 
|  | 1308 | + | 
|  | 1309 | +    // The tail: | 
|  | 1310 | +    for (; i < n; ++i) { | 
|  | 1311 | +        simsimd_f32_t ai = a[i], bi = b[i], ci = c[i]; | 
|  | 1312 | +        simsimd_f32_t sum = alpha_f32 * ai * bi + beta_f32 * ci; | 
|  | 1313 | +        _simsimd_f32_to_i16(&sum, result + i); | 
|  | 1314 | +    } | 
|  | 1315 | +#endif | 
|  | 1316 | +} | 
|  | 1317 | + | 
| 1214 | 1318 | #pragma clang attribute pop | 
| 1215 | 1319 | #pragma GCC pop_options | 
| 1216 | 1320 | #endif // SIMSIMD_TARGET_HASWELL | 
|  | 
0 commit comments