Skip to content

Commit 37c8f48

Browse files
committed
Add 64-core-plus CPU support for Windows systems
1 parent 075d07e commit 37c8f48

File tree

2 files changed

+179
-29
lines changed

2 files changed

+179
-29
lines changed

src/cpu.cpp

Lines changed: 167 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
#endif
5454

5555
#if defined __ANDROID__ || defined __OHOS__ || __linux__
56+
#include <cstring>
5657
#if defined __ANDROID__
5758
#if __ANDROID_API__ >= 18
5859
#include <sys/auxv.h> // getauxval()
@@ -878,9 +879,43 @@ static int get_cpucount()
878879
else
879880
count = 1;
880881
#elif defined _WIN32
881-
SYSTEM_INFO system_info;
882-
GetSystemInfo(&system_info);
883-
count = system_info.dwNumberOfProcessors;
882+
typedef BOOL(WINAPI *LPFN_GLPIEX)(LOGICAL_PROCESSOR_RELATIONSHIP, PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD);
883+
LPFN_GLPIEX glpiex = (LPFN_GLPIEX)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformationEx");
884+
if (glpiex != NULL) {
885+
DWORD length = 0;
886+
glpiex(RelationAll, NULL, &length);
887+
888+
if (length > 0) {
889+
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer =
890+
(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(length);
891+
892+
if (buffer && glpiex(RelationAll, buffer, &length)) {
893+
count = 0;
894+
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX ptr = buffer;
895+
DWORD offset = 0;
896+
897+
while (offset < length) {
898+
if (ptr->Relationship == RelationProcessorCore) {
899+
for (WORD i = 0; i < ptr->Processor.GroupCount; i++) {
900+
count += __popcnt64(ptr->Processor.GroupMask[i].Mask);
901+
}
902+
}
903+
offset += ptr->Size;
904+
ptr = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)((char*)ptr + ptr->Size);
905+
}
906+
}
907+
908+
if (buffer) {
909+
free(buffer);
910+
}
911+
}
912+
}
913+
// Fall back to the previous GetSystemInfo path when GetLogicalProcessorInformationEx is unavailable or did not yield a count.
914+
if (count == 0) {
915+
SYSTEM_INFO system_info;
916+
GetSystemInfo(&system_info);
917+
count = system_info.dwNumberOfProcessors;
918+
}
884919
#elif defined __ANDROID__ || defined __linux__
885920
// get cpu count from /proc/cpuinfo
886921
FILE* fp = fopen("/proc/cpuinfo", "rb");
@@ -1355,6 +1390,57 @@ static ncnn::CpuSet get_smt_cpu_mask()
13551390
{
13561391
ncnn::CpuSet smt_cpu_mask;
13571392

1393+
typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
1394+
LPFN_GLPI glpiex = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformationEx");
1395+
if (glpiex != NULL) //CPU core > 64
1396+
{
1397+
DWORD length = 0;
1398+
glpiex(RelationProcessorCore, NULL, &length);
1399+
1400+
if (length > 0)
1401+
{
1402+
std::vector<char> buffer(length);
1403+
if (glpiex(RelationProcessorCore, (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)buffer.data(), &length))
1404+
{
1405+
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX current = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)buffer.data();
1406+
1407+
while ((char*)current < buffer.data() + length)
1408+
{
1409+
if (current->Relationship == RelationProcessorCore)
1410+
{
1411+
int total_logical_count = 0;
1412+
for (WORD group = 0; group < current->Processor.GroupCount; group++)
1413+
{
1414+
total_logical_count += __popcnt64(current->Processor.GroupMask[group].Mask);
1415+
}
1416+
1417+
if (total_logical_count > 1)
1418+
{
1419+
for (WORD group = 0; group < current->Processor.GroupCount; group++)
1420+
{
1421+
KAFFINITY mask = current->Processor.GroupMask[group].Mask;
1422+
for (int cpu = 0; cpu < 64 && mask; cpu++)
1423+
{
1424+
if (mask & (1ULL << cpu))
1425+
{
1426+
int global_cpu = group * 64 + cpu;
1427+
smt_cpu_mask.enable(global_cpu);
1428+
mask &= ~(1ULL << cpu);
1429+
}
1430+
}
1431+
}
1432+
}
1433+
}
1434+
1435+
current = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)((char*)current + current->Size);
1436+
}
1437+
1438+
return smt_cpu_mask;
1439+
}
1440+
}
1441+
}
1442+
1443+
// Fall back to the legacy GetLogicalProcessorInformation API when the Ex variant is unavailable or fails
13581444
typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
13591445
LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
13601446
if (glpi == NULL)
@@ -1375,12 +1461,16 @@ static ncnn::CpuSet get_smt_cpu_mask()
13751461
{
13761462
if (ptr->Relationship == RelationProcessorCore)
13771463
{
1378-
ncnn::CpuSet smt_set;
1379-
smt_set.mask = ptr->ProcessorMask;
1380-
if (smt_set.num_enabled() > 1)
1464+
int logical_count = __popcnt64(ptr->ProcessorMask);
1465+
if (logical_count > 1)
13811466
{
1382-
// this core is smt
1383-
smt_cpu_mask.mask |= smt_set.mask;
1467+
ULONG_PTR mask = ptr->ProcessorMask;
1468+
for (int cpu = 0; cpu < 64 && mask; cpu++) {
1469+
if (mask & (1ULL << cpu)) {
1470+
smt_cpu_mask.enable(cpu);
1471+
mask &= ~(1ULL << cpu);
1472+
}
1473+
}
13841474
}
13851475
}
13861476

@@ -1389,7 +1479,6 @@ static ncnn::CpuSet get_smt_cpu_mask()
13891479
}
13901480

13911481
free(buffer);
1392-
13931482
return smt_cpu_mask;
13941483
}
13951484

@@ -1435,13 +1524,25 @@ static std::vector<int> get_max_freq_mhz()
14351524

14361525
static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
14371526
{
1438-
DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), thread_affinity_mask.mask);
1439-
if (prev_mask == 0)
1527+
for (int group = 0; group < thread_affinity_mask.active_groups; group++)
14401528
{
1441-
NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError());
1442-
return -1;
1529+
if (thread_affinity_mask.masks[group] != 0)
1530+
{
1531+
GROUP_AFFINITY groupAffinity;
1532+
groupAffinity.Mask = thread_affinity_mask.masks[group];
1533+
groupAffinity.Group = (WORD)group;
1534+
groupAffinity.Reserved[0] = 0;
1535+
groupAffinity.Reserved[1] = 0;
1536+
groupAffinity.Reserved[2] = 0;
1537+
1538+
if (!SetThreadGroupAffinity(GetCurrentThread(), &groupAffinity, NULL))
1539+
{
1540+
NCNN_LOGE("SetThreadGroupAffinity failed %d", GetLastError());
1541+
return -1;
1542+
}
1543+
break;
1544+
}
14431545
}
1444-
14451546
return 0;
14461547
}
14471548
#endif // defined _WIN32
@@ -1610,6 +1711,7 @@ static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
16101711
}
16111712
#endif // __APPLE__
16121713

1714+
16131715
static void initialize_cpu_thread_affinity_mask(ncnn::CpuSet& mask_all, ncnn::CpuSet& mask_little, ncnn::CpuSet& mask_big)
16141716
{
16151717
mask_all.disable_all();
@@ -2152,7 +2254,7 @@ static void initialize_global_cpu_info()
21522254

21532255
g_cpucount = get_cpucount();
21542256
g_physical_cpucount = get_physical_cpucount();
2155-
g_powersave = 0;
2257+
g_powersave = 0;
21562258
initialize_cpu_thread_affinity_mask(g_cpu_affinity_mask_all, g_cpu_affinity_mask_little, g_cpu_affinity_mask_big);
21572259

21582260
#if (defined _WIN32 && (__aarch64__ || __arm__)) || ((defined __ANDROID__ || defined __linux__) && __riscv)
@@ -2277,34 +2379,74 @@ CpuSet::CpuSet()
22772379

22782380
void CpuSet::enable(int cpu)
22792381
{
2280-
mask |= ((ULONG_PTR)1 << cpu);
2382+
if (cpu < 0 || cpu >= max_cpus) return;
2383+
2384+
int group = cpu / 64;
2385+
int bit = cpu % 64;
2386+
2387+
if (group < MAX_CPU_GROUPS) {
2388+
masks[group] |= (1ULL << bit);
2389+
}
22812390
}
22822391

22832392
void CpuSet::disable(int cpu)
22842393
{
2285-
mask &= ~((ULONG_PTR)1 << cpu);
2394+
if (cpu < 0 || cpu >= max_cpus) return;
2395+
2396+
int group = cpu / 64;
2397+
int bit = cpu % 64;
2398+
2399+
if (group < MAX_CPU_GROUPS) {
2400+
masks[group] &= ~(1ULL << bit);
2401+
}
22862402
}
22872403

22882404
void CpuSet::disable_all()
22892405
{
2290-
mask = 0;
2406+
for (int i = 0; i < MAX_CPU_GROUPS; i++) {
2407+
masks[i] = 0;
2408+
}
22912409
}
22922410

22932411
bool CpuSet::is_enabled(int cpu) const
22942412
{
2295-
return mask & ((ULONG_PTR)1 << cpu);
2413+
if (cpu < 0 || cpu >= max_cpus) return false;
2414+
2415+
int group = cpu / 64;
2416+
int bit = cpu % 64;
2417+
2418+
if (group < MAX_CPU_GROUPS) {
2419+
return (masks[group] & (1ULL << bit)) != 0;
2420+
}
2421+
return false;
22962422
}
22972423

22982424
int CpuSet::num_enabled() const
22992425
{
2300-
int num_enabled = 0;
2301-
for (int i = 0; i < (int)sizeof(mask) * 8; i++)
2302-
{
2303-
if (is_enabled(i))
2304-
num_enabled++;
2426+
int count = 0;
2427+
for (int i = 0; i < MAX_CPU_GROUPS; i++) {
2428+
count += __builtin_popcountll(masks[i]);
23052429
}
2430+
return count;
2431+
}
23062432

2307-
return num_enabled;
2433+
ULONG_PTR CpuSet::get_group_mask(int group) const
2434+
{
2435+
if (group < 0 || group >= MAX_CPU_GROUPS) {
2436+
return 0;
2437+
}
2438+
return masks[group];
2439+
}
2440+
2441+
int CpuSet::get_active_group_count() const
2442+
{
2443+
int count = 0;
2444+
for (int i = 0; i < MAX_CPU_GROUPS; i++) {
2445+
if (masks[i] != 0) {
2446+
count++;
2447+
}
2448+
}
2449+
return count;
23082450
}
23092451
#elif defined __ANDROID__ || defined __linux__
23102452
CpuSet::CpuSet()

src/cpu.h

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,19 @@ class NCNN_EXPORT CpuSet
3838
void disable_all();
3939
bool is_enabled(int cpu) const;
4040
int num_enabled() const;
41+
42+
#if defined _WIN32
43+
int get_max_cpus() const { return max_cpus; }
44+
ULONG_PTR get_group_mask(int group) const;
45+
int get_active_group_count() const;
46+
#endif
4147

4248
public:
4349
#if defined _WIN32
44-
ULONG_PTR mask;
50+
static const int MAX_CPU_GROUPS = 20;
51+
ULONG_PTR masks[MAX_CPU_GROUPS];
52+
int max_cpus;
53+
int active_groups;
4554
#endif
4655
#if defined __ANDROID__ || defined __linux__
4756
cpu_set_t cpu_set;
@@ -129,7 +138,6 @@ NCNN_EXPORT int cpu_support_riscv_xtheadvector();
129138
// vlenb = riscv vector length in bytes
130139
NCNN_EXPORT int cpu_riscv_vlenb();
131140

132-
// cpu info
133141
NCNN_EXPORT int get_cpu_count();
134142
NCNN_EXPORT int get_little_cpu_count();
135143
NCNN_EXPORT int get_big_cpu_count();
@@ -138,7 +146,7 @@ NCNN_EXPORT int get_physical_cpu_count();
138146
NCNN_EXPORT int get_physical_little_cpu_count();
139147
NCNN_EXPORT int get_physical_big_cpu_count();
140148

141-
// cpu l2 varies from 64k to 1M, but l3 can be zero
149+
// cpu l2 varies from 64k to 1M, but l3 can be zero
142150
NCNN_EXPORT int get_cpu_level2_cache_size();
143151
NCNN_EXPORT int get_cpu_level3_cache_size();
144152

@@ -153,7 +161,7 @@ NCNN_EXPORT int get_cpu_level3_cache_size();
153161
NCNN_EXPORT int get_cpu_powersave();
154162
NCNN_EXPORT int set_cpu_powersave(int powersave);
155163

156-
// convenient wrapper
164+
// convenient wrapper
157165
NCNN_EXPORT const CpuSet& get_cpu_thread_affinity_mask(int powersave);
158166

159167
// set explicit thread affinity

0 commit comments

Comments
 (0)