Skip to content

Commit 9e617ee

Browse files
committed
implement requested changes
1 parent bc5d34b commit 9e617ee

File tree

6 files changed

+296
-211
lines changed

6 files changed

+296
-211
lines changed

src/layer/x86/avx_usability.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@
2020

2121
static inline __m256 loadfp16(const unsigned short* ptr)
2222
{
23-
return _mm256_cvtph_ps(_mm_load_si128((__m128i*)(ptr)));
23+
return _mm256_cvtph_ps(_mm_lddqu_si128((__m128i*)(ptr)));
2424
}
2525
static inline __m256 _mm256_fmadd_1_ps(__m256 a, __m256 b, float c)
2626
{

src/layer/x86/convolution_3x3_pack8.h

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -65,8 +65,10 @@ static void conv3x3s1_winograd64_transform_kernel_pack8_avx(const Mat& kernel, M
6565
}
6666
// interleave
6767
// src = 64-inch-outch
68-
// dst = 4b-4a-inch/4a-64-outch/4b;
69-
kernel_tm_pack8.create(inch / 8, 64, (outch / 4) / 2 + (outch / 4) % 2, (size_t)4u * 64, 64);
68+
// dst = 8b-8a-inch/8a-64-outch/8b;
69+
kernel_tm_pack8.create(inch / 8, 64, outch / 8, (size_t)4u * 64, 64);
70+
71+
// kernel_tm_pack8.create(inch / 8, 64, (outch / 4) / 2 + (outch / 4) % 2, (size_t)4u * 64, 64);
7072
int q = 0;
7173
for (; q + 7 < outch; q += 8)
7274
{

src/layer/x86/convolution_3x3_pack8_fp16.h

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -65,8 +65,8 @@ static void conv3x3s1_winograd64_transform_kernel_fp16_pack8_avx(const Mat& kern
6565
}
6666
// interleave
6767
// src = 64-inch-outch
68-
// dst = 8b-8a-inch/8a-84-outch/8b;
69-
kernel_tm_pack8.create(inch / 8, 64, (outch / 4) / 2 + (outch / 4) % 2, (size_t)2u * 64, 64);
68+
// dst = 8b-8a-inch/8a-64-outch/8b;
69+
kernel_tm_pack8.create(inch / 8, 64, outch / 8, (size_t)2u * 64, 64);
7070
int q = 0;
7171
for (; q + 7 < outch; q += 8)
7272
{

0 commit comments

Comments
 (0)