53
53
#endif
54
54
55
55
#if defined __ANDROID__ || defined __OHOS__ || __linux__
56
+ #include < cstring>
56
57
#if defined __ANDROID__
57
58
#if __ANDROID_API__ >= 18
58
59
#include < sys/auxv.h> // getauxval()
@@ -878,9 +879,43 @@ static int get_cpucount()
878
879
else
879
880
count = 1 ;
880
881
#elif defined _WIN32
881
- SYSTEM_INFO system_info;
882
- GetSystemInfo (&system_info);
883
- count = system_info.dwNumberOfProcessors ;
882
+ typedef BOOL (WINAPI *LPFN_GLPIEX)(LOGICAL_PROCESSOR_RELATIONSHIP, PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD);
883
+ LPFN_GLPIEX glpiex = (LPFN_GLPIEX)GetProcAddress (GetModuleHandle (TEXT (" kernel32" )), " GetLogicalProcessorInformationEx" );
884
+ if (glpiex != NULL ) {
885
+ DWORD length = 0 ;
886
+ glpiex (RelationAll, NULL , &length);
887
+
888
+ if (length > 0 ) {
889
+ PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer =
890
+ (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc (length);
891
+
892
+ if (buffer && glpiex (RelationAll, buffer, &length)) {
893
+ count = 0 ;
894
+ PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX ptr = buffer;
895
+ DWORD offset = 0 ;
896
+
897
+ while (offset < length) {
898
+ if (ptr->Relationship == RelationProcessorCore) {
899
+ for (WORD i = 0 ; i < ptr->Processor .GroupCount ; i++) {
900
+ count += __popcnt64 (ptr->Processor .GroupMask [i].Mask );
901
+ }
902
+ }
903
+ offset += ptr->Size ;
904
+ ptr = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)((char *)ptr + ptr->Size );
905
+ }
906
+ }
907
+
908
+ if (buffer) {
909
+ free (buffer);
910
+ }
911
+ }
912
+ }
913
+ // If the query above did not yield a count, fall back to the previous version.
914
+ if (count == 0 ) {
915
+ SYSTEM_INFO system_info;
916
+ GetSystemInfo (&system_info);
917
+ count = system_info.dwNumberOfProcessors ;
918
+ }
884
919
#elif defined __ANDROID__ || defined __linux__
885
920
// get cpu count from /proc/cpuinfo
886
921
FILE* fp = fopen (" /proc/cpuinfo" , " rb" );
@@ -1355,6 +1390,57 @@ static ncnn::CpuSet get_smt_cpu_mask()
1355
1390
{
1356
1391
ncnn::CpuSet smt_cpu_mask;
1357
1392
1393
+ typedef BOOL (WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
1394
+ LPFN_GLPI glpiex = (LPFN_GLPI)GetProcAddress (GetModuleHandle (TEXT (" kernel32" )), " GetLogicalProcessorInformationEx" );
1395
+ if (glpiex != NULL ) // CPU core > 64
1396
+ {
1397
+ DWORD length = 0 ;
1398
+ glpiex (RelationProcessorCore, NULL , &length);
1399
+
1400
+ if (length > 0 )
1401
+ {
1402
+ std::vector<char > buffer (length);
1403
+ if (glpiex (RelationProcessorCore, (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)buffer.data (), &length))
1404
+ {
1405
+ PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX current = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)buffer.data ();
1406
+
1407
+ while ((char *)current < buffer.data () + length)
1408
+ {
1409
+ if (current->Relationship == RelationProcessorCore)
1410
+ {
1411
+ int total_logical_count = 0 ;
1412
+ for (WORD group = 0 ; group < current->Processor .GroupCount ; group++)
1413
+ {
1414
+ total_logical_count += __popcnt64 (current->Processor .GroupMask [group].Mask );
1415
+ }
1416
+
1417
+ if (total_logical_count > 1 )
1418
+ {
1419
+ for (WORD group = 0 ; group < current->Processor .GroupCount ; group++)
1420
+ {
1421
+ KAFFINITY mask = current->Processor .GroupMask [group].Mask ;
1422
+ for (int cpu = 0 ; cpu < 64 && mask; cpu++)
1423
+ {
1424
+ if (mask & (1ULL << cpu))
1425
+ {
1426
+ int global_cpu = group * 64 + cpu;
1427
+ smt_cpu_mask.enable (global_cpu);
1428
+ mask &= ~(1ULL << cpu);
1429
+ }
1430
+ }
1431
+ }
1432
+ }
1433
+ }
1434
+
1435
+ current = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)((char *)current + current->Size );
1436
+ }
1437
+
1438
+ return smt_cpu_mask;
1439
+ }
1440
+ }
1441
+ }
1442
+
1443
+ // Under 64, use the old API
1358
1444
typedef BOOL (WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
1359
1445
LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress (GetModuleHandle (TEXT (" kernel32" )), " GetLogicalProcessorInformation" );
1360
1446
if (glpi == NULL )
@@ -1375,12 +1461,16 @@ static ncnn::CpuSet get_smt_cpu_mask()
1375
1461
{
1376
1462
if (ptr->Relationship == RelationProcessorCore)
1377
1463
{
1378
- ncnn::CpuSet smt_set;
1379
- smt_set.mask = ptr->ProcessorMask ;
1380
- if (smt_set.num_enabled () > 1 )
1464
+ int logical_count = __popcnt64 (ptr->ProcessorMask );
1465
+ if (logical_count > 1 )
1381
1466
{
1382
- // this core is smt
1383
- smt_cpu_mask.mask |= smt_set.mask ;
1467
+ ULONG_PTR mask = ptr->ProcessorMask ;
1468
+ for (int cpu = 0 ; cpu < 64 && mask; cpu++) {
1469
+ if (mask & (1ULL << cpu)) {
1470
+ smt_cpu_mask.enable (cpu);
1471
+ mask &= ~(1ULL << cpu);
1472
+ }
1473
+ }
1384
1474
}
1385
1475
}
1386
1476
@@ -1389,7 +1479,6 @@ static ncnn::CpuSet get_smt_cpu_mask()
1389
1479
}
1390
1480
1391
1481
free (buffer);
1392
-
1393
1482
return smt_cpu_mask;
1394
1483
}
1395
1484
@@ -1435,13 +1524,25 @@ static std::vector<int> get_max_freq_mhz()
1435
1524
1436
1525
// Bind the calling thread to the first processor group whose mask in
// thread_affinity_mask is non-zero.
// Returns 0 on success, or when no group within active_groups has any bit set;
// returns -1 when SetThreadGroupAffinity reports failure.
static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
{
    // skip leading groups that have no cpu enabled
    int target_group = 0;
    while (target_group < thread_affinity_mask.active_groups && thread_affinity_mask.masks[target_group] == 0)
    {
        target_group++;
    }

    if (target_group >= thread_affinity_mask.active_groups)
    {
        // nothing to apply
        return 0;
    }

    GROUP_AFFINITY affinity;
    affinity.Mask = thread_affinity_mask.masks[target_group];
    affinity.Group = (WORD)target_group;
    affinity.Reserved[0] = 0;
    affinity.Reserved[1] = 0;
    affinity.Reserved[2] = 0;

    // only this one group is applied, later non-empty groups are not visited
    if (!SetThreadGroupAffinity(GetCurrentThread(), &affinity, NULL))
    {
        NCNN_LOGE("SetThreadGroupAffinity failed %d", GetLastError());
        return -1;
    }

    return 0;
}
1447
1548
#endif // defined _WIN32
@@ -1610,6 +1711,7 @@ static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
1610
1711
}
1611
1712
#endif // __APPLE__
1612
1713
1714
+
1613
1715
static void initialize_cpu_thread_affinity_mask (ncnn::CpuSet& mask_all, ncnn::CpuSet& mask_little, ncnn::CpuSet& mask_big)
1614
1716
{
1615
1717
mask_all.disable_all ();
@@ -2152,7 +2254,7 @@ static void initialize_global_cpu_info()
2152
2254
2153
2255
g_cpucount = get_cpucount ();
2154
2256
g_physical_cpucount = get_physical_cpucount ();
2155
- g_powersave = 0 ;
2257
+ g_powersave = 0 ;
2156
2258
initialize_cpu_thread_affinity_mask (g_cpu_affinity_mask_all, g_cpu_affinity_mask_little, g_cpu_affinity_mask_big);
2157
2259
2158
2260
#if (defined _WIN32 && (__aarch64__ || __arm__)) || ((defined __ANDROID__ || defined __linux__) && __riscv)
@@ -2277,34 +2379,74 @@ CpuSet::CpuSet()
2277
2379
2278
2380
void CpuSet::enable (int cpu)
2279
2381
{
2280
- mask |= ((ULONG_PTR)1 << cpu);
2382
+ if (cpu < 0 || cpu >= max_cpus) return ;
2383
+
2384
+ int group = cpu / 64 ;
2385
+ int bit = cpu % 64 ;
2386
+
2387
+ if (group < MAX_CPU_GROUPS) {
2388
+ masks[group] |= (1ULL << bit);
2389
+ }
2281
2390
}
2282
2391
2283
2392
void CpuSet::disable (int cpu)
2284
2393
{
2285
- mask &= ~((ULONG_PTR)1 << cpu);
2394
+ if (cpu < 0 || cpu >= max_cpus) return ;
2395
+
2396
+ int group = cpu / 64 ;
2397
+ int bit = cpu % 64 ;
2398
+
2399
+ if (group < MAX_CPU_GROUPS) {
2400
+ masks[group] &= ~(1ULL << bit);
2401
+ }
2286
2402
}
2287
2403
2288
2404
void CpuSet::disable_all ()
2289
2405
{
2290
- mask = 0 ;
2406
+ for (int i = 0 ; i < MAX_CPU_GROUPS; i++) {
2407
+ masks[i] = 0 ;
2408
+ }
2291
2409
}
2292
2410
2293
2411
bool CpuSet::is_enabled (int cpu) const
2294
2412
{
2295
- return mask & ((ULONG_PTR)1 << cpu);
2413
+ if (cpu < 0 || cpu >= max_cpus) return false ;
2414
+
2415
+ int group = cpu / 64 ;
2416
+ int bit = cpu % 64 ;
2417
+
2418
+ if (group < MAX_CPU_GROUPS) {
2419
+ return (masks[group] & (1ULL << bit)) != 0 ;
2420
+ }
2421
+ return false ;
2296
2422
}
2297
2423
2298
2424
int CpuSet::num_enabled () const
2299
2425
{
2300
- int num_enabled = 0 ;
2301
- for (int i = 0 ; i < (int )sizeof (mask) * 8 ; i++)
2302
- {
2303
- if (is_enabled (i))
2304
- num_enabled++;
2426
+ int count = 0 ;
2427
+ for (int i = 0 ; i < MAX_CPU_GROUPS; i++) {
2428
+ count += __builtin_popcountll (masks[i]);
2305
2429
}
2430
+ return count;
2431
+ }
2306
2432
2307
- return num_enabled;
2433
+ ULONG_PTR CpuSet::get_group_mask (int group) const
2434
+ {
2435
+ if (group < 0 || group >= MAX_CPU_GROUPS) {
2436
+ return 0 ;
2437
+ }
2438
+ return masks[group];
2439
+ }
2440
+
2441
+ int CpuSet::get_active_group_count () const
2442
+ {
2443
+ int count = 0 ;
2444
+ for (int i = 0 ; i < MAX_CPU_GROUPS; i++) {
2445
+ if (masks[i] != 0 ) {
2446
+ count++;
2447
+ }
2448
+ }
2449
+ return count;
2308
2450
}
2309
2451
#elif defined __ANDROID__ || defined __linux__
2310
2452
CpuSet::CpuSet ()
0 commit comments