Skip to content

Commit 4b97730

Browse files
authored
x86 packed convolution transform kernel avx2/avx512 optimization (#4819)
* fix non-sse non-neon weight pack
1 parent 6c21b08 commit 4b97730

File tree

6 files changed

+1101 additions, -904 deletions (lines changed)

6 files changed

+1101 additions, -904 deletions (lines changed)

src/layer/arm/convolution1d_packed.h

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -60,10 +60,11 @@ static void convolution1d_transform_kernel_packed(const Mat& kernel, Mat& kernel
6060
#endif // __aarch64__
6161
if (inh >= 4)
6262
kernel_tm.create(2 * 4 * kernel_w, inh / 4 + (inh % 4) / 2 + inh % 2, outh / 2 + outh % 2);
63-
else if (inh >= 2)
64-
kernel_tm.create(2 * 2 * kernel_w, inh / 2 + inh % 2, outh / 2 + outh % 2);
6563
else
6664
#endif // __ARM_NEON
65+
if (inh >= 2)
66+
kernel_tm.create(2 * 2 * kernel_w, inh / 2 + inh % 2, outh / 2 + outh % 2);
67+
else
6768
kernel_tm.create(2 * kernel_w, inh, outh / 2 + outh % 2);
6869
}
6970
else
@@ -76,10 +77,11 @@ static void convolution1d_transform_kernel_packed(const Mat& kernel, Mat& kernel
7677
#endif // __aarch64__
7778
if (inh >= 4)
7879
kernel_tm.create(4 * kernel_w, inh / 4 + (inh % 4) / 2 + inh % 2, outh);
79-
else if (inh >= 2)
80-
kernel_tm.create(2 * kernel_w, inh / 2 + inh % 2, outh);
8180
else
8281
#endif // __ARM_NEON
82+
if (inh >= 2)
83+
kernel_tm.create(2 * kernel_w, inh / 2 + inh % 2, outh);
84+
else
8385
kernel_tm.create(kernel_w, inh, outh);
8486
}
8587
// *INDENT-ON*

src/layer/arm/convolution1d_packed_bf16s.h

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -60,10 +60,11 @@ static void convolution1d_transform_kernel_packed_bf16s(const Mat& kernel, Mat&
6060
#endif // __aarch64__
6161
if (inh >= 4)
6262
kernel_tm.create(2 * 4 * kernel_w, inh / 4 + (inh % 4) / 2 + inh % 2, outh / 2 + outh % 2, (size_t)2u);
63-
else if (inh >= 2)
64-
kernel_tm.create(2 * 2 * kernel_w, inh / 2 + inh % 2, outh / 2 + outh % 2, (size_t)2u);
6563
else
6664
#endif // __ARM_NEON
65+
if (inh >= 2)
66+
kernel_tm.create(2 * 2 * kernel_w, inh / 2 + inh % 2, outh / 2 + outh % 2, (size_t)2u);
67+
else
6768
kernel_tm.create(2 * kernel_w, inh, outh / 2 + outh % 2, (size_t)2u);
6869
}
6970
else
@@ -76,10 +77,11 @@ static void convolution1d_transform_kernel_packed_bf16s(const Mat& kernel, Mat&
7677
#endif // __aarch64__
7778
if (inh >= 4)
7879
kernel_tm.create(4 * kernel_w, inh / 4 + (inh % 4) / 2 + inh % 2, outh, (size_t)2u);
79-
else if (inh >= 2)
80-
kernel_tm.create(2 * kernel_w, inh / 2 + inh % 2, outh, (size_t)2u);
8180
else
8281
#endif // __ARM_NEON
82+
if (inh >= 2)
83+
kernel_tm.create(2 * kernel_w, inh / 2 + inh % 2, outh, (size_t)2u);
84+
else
8385
kernel_tm.create(kernel_w, inh, outh, (size_t)2u);
8486
}
8587
// *INDENT-ON*

src/layer/arm/convolution_packed.h

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -62,10 +62,11 @@ static void convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_t
6262
#endif // __aarch64__
6363
if (inch >= 4)
6464
kernel_tm.create(2 * 4 * maxk, inch / 4 + (inch % 4) / 2 + inch % 2, outch / 2 + outch % 2);
65-
else if (inch >= 2)
66-
kernel_tm.create(2 * 2 * maxk, inch / 2 + inch % 2, outch / 2 + outch % 2);
6765
else
6866
#endif // __ARM_NEON
67+
if (inch >= 2)
68+
kernel_tm.create(2 * 2 * maxk, inch / 2 + inch % 2, outch / 2 + outch % 2);
69+
else
6970
kernel_tm.create(2 * maxk, inch, outch / 2 + outch % 2);
7071
}
7172
else
@@ -78,10 +79,11 @@ static void convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_t
7879
#endif // __aarch64__
7980
if (inch >= 4)
8081
kernel_tm.create(4 * maxk, inch / 4 + (inch % 4) / 2 + inch % 2, outch);
81-
else if (inch >= 2)
82-
kernel_tm.create(2 * maxk, inch / 2 + inch % 2, outch);
8382
else
8483
#endif // __ARM_NEON
84+
if (inch >= 2)
85+
kernel_tm.create(2 * maxk, inch / 2 + inch % 2, outch);
86+
else
8587
kernel_tm.create(maxk, inch, outch);
8688
}
8789
// *INDENT-ON*

src/layer/arm/convolution_packed_bf16s.h

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -62,10 +62,11 @@ static void convolution_transform_kernel_packed_bf16s(const Mat& kernel, Mat& ke
6262
#endif // __aarch64__
6363
if (inch >= 4)
6464
kernel_tm.create(2 * 4 * maxk, inch / 4 + (inch % 4) / 2 + inch % 2, outch / 2 + outch % 2, (size_t)2u);
65-
else if (inch >= 2)
66-
kernel_tm.create(2 * 2 * maxk, inch / 2 + inch % 2, outch / 2 + outch % 2, (size_t)2u);
6765
else
6866
#endif // __ARM_NEON
67+
if (inch >= 2)
68+
kernel_tm.create(2 * 2 * maxk, inch / 2 + inch % 2, outch / 2 + outch % 2, (size_t)2u);
69+
else
6970
kernel_tm.create(2 * maxk, inch, outch / 2 + outch % 2, (size_t)2u);
7071
}
7172
else
@@ -78,10 +79,11 @@ static void convolution_transform_kernel_packed_bf16s(const Mat& kernel, Mat& ke
7879
#endif // __aarch64__
7980
if (inch >= 4)
8081
kernel_tm.create(4 * maxk, inch / 4 + (inch % 4) / 2 + inch % 2, outch, (size_t)2u);
81-
else if (inch >= 2)
82-
kernel_tm.create(2 * maxk, inch / 2 + inch % 2, outch, (size_t)2u);
8382
else
8483
#endif // __ARM_NEON
84+
if (inch >= 2)
85+
kernel_tm.create(2 * maxk, inch / 2 + inch % 2, outch, (size_t)2u);
86+
else
8587
kernel_tm.create(maxk, inch, outch, (size_t)2u);
8688
}
8789
// *INDENT-ON*

0 commit comments

Comments
 (0)