Skip to content

Cannot access PMU counters on c8g.metal-24x instance #446

@ashtonsix

Description

@ashtonsix

I created a c8g.metal-24x instance, but access to the PMU counters is not working:

Image

The documentation says that c8g.metal-24x has PMU access:

## How to Collect PMU counters
Not all instance sizes support PMU event collection. Generally
instance sizes which have an entire dedicated socket have full access
to all PMU events, and smaller instance sizes of the newer generations
have a reduced set of events suitable for most profiling needs. For
older generations smaller instance sizes may not support any PMU event
collection. The table below captures these details:
|Instance Family | Minimum Size for Full PMU Event Support | Basic Support at Smaller Sizes |
|------|------------|------|
|*8g | 24xlarge | yes |
|*7a | 24xlarge | yes |
|*7g | 16xlarge | yes |
|*7i | 24xlarge | yes |
|*6a | 24xlarge | no |
|*6g | 16xlarge | yes |
|*6i | 16xlarge | no |
|*5 | c5.9xlarge, *5.12xlarge | no |

I do not see "pmu" in the Features list of /proc/cpuinfo:

$ cat /proc/cpuinfo
processor       : 95
BogoMIPS        : 2000.00
Features        : fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm jscvt fcma lrcpc dcpop sha3 asimddp sha512 sve asimdfhm dit uscat ilrcpc flagm ssbs sb paca pacg dcpodp sve2 sveaes svepmull svebitperm svesha3 flagm2 frint svei8mm svebf16 i8mm bf16 dgh rng bti
CPU implementer : 0x41
CPU architecture: 8
CPU variant     : 0x0
CPU part        : 0xd4f
CPU revision    : 1

I need to identify which method of zero-extending 8-bit values to 32-bit values is fastest on Neoverse V2 cores (such as Graviton4).

My code fails with "Illegal instruction" when built with PMU_ENABLED. (The first run below is built without PMU_ENABLED; it is a dummy run to confirm that the benchmark does not crash without the flag.)

➜  /benchmark make clean && make && make run 
rm -rf build
cc -O3 -march=armv8.2-a -c src/main.cpp -o build/main.o
cc -O3 -march=armv8.2-a -c src/impl.S -o build/impl.o
cc -O3 -march=armv8.2-a -o build/benchmark build/main.o build/impl.o -static
./build/benchmark
Warming up option A...
Warming up option B...
Warming up option C...
Warming up option D...
Warming up option E...
Running 1000000 iterations per option...

Average Cycles Per Call:
Option A (st4):     0.00 cycles (theoretical: 6.0)
Option B (uxtl):    0.00 cycles (theoretical: 3.0)
Option C (zip):     0.00 cycles (theoretical: 1.5)
Option D (tbl):     0.00 cycles (theoretical: 2.0)
Option E (hybrid):  0.00 cycles (theoretical: 1.25)

Performance Ranking (best to worst):
1. Option A (st4): 0.00 cycles (0.00× theoretical)
2. Option B (uxtl): 0.00 cycles (0.00× theoretical)
3. Option C (zip): 0.00 cycles (0.00× theoretical)
4. Option D (tbl): 0.00 cycles (0.00× theoretical)
5. Option E (hybrid): 0.00 cycles (0.00× theoretical)
➜  /benchmark make clean && make pmu && make run
rm -rf build
cc -O3 -march=armv8.2-a -DPMU_ENABLED -c src/main.cpp -o build/main.o
cc -O3 -march=armv8.2-a -DPMU_ENABLED -c src/impl.S -o build/impl.o
cc -O3 -march=armv8.2-a -DPMU_ENABLED -o build/benchmark build/main.o build/impl.o -static
./build/benchmark
make: *** [Makefile:31: run] Illegal instruction (core dumped)

I initialize the PMU like this:

// Initialize performance counter
//
// Writes three PMU system registers, in order: PMUSERENR_EL0 (user-mode
// access enable), PMCNTENSET_EL0 (counter enable set) and PMOVSCLR_EL0
// (overflow flag clear). Takes no arguments and returns nothing.
//
// NOTE(review): per the Arm Architecture Reference Manual, PMUSERENR_EL0 is
// writable only from EL1 or higher, so the first `msr` below is expected to
// raise an undefined-instruction exception (SIGILL on Linux) when this
// function runs in an ordinary user-space process. User-space counter access
// normally has to be granted by the kernel instead — confirm against the
// target platform before relying on this function.
static void init_perfcounter(void) {
  // Bit 31 of PMCNTENSET_EL0 / PMOVSCLR_EL0 corresponds to the cycle counter.
  // Build the mask in 64-bit unsigned arithmetic: `1 << 31` shifts into the
  // sign bit of a 32-bit int, and the subsequent widening cast sign-extends
  // it to 0xFFFFFFFF80000000, setting bits 63:32 that were never intended.
  const uint64_t cycle_counter_bit = (uint64_t)1 << 31;

  // Enable user-mode access to performance counters
  asm volatile("msr pmuserenr_el0, %0" : : "r"((uint64_t)1));
  // Enable performance counter
  asm volatile("msr pmcntenset_el0, %0" : : "r"(cycle_counter_bit));
  // Clear overflow flags
  asm volatile("msr pmovsclr_el0, %0" : : "r"((uint64_t)0xffffffff));
}

and read the cycle counter like this:

// Read-only TBL index table used to zero-extend sixteen 8-bit values to
// sixteen 32-bit values.
//
// Layout: four consecutive 16-byte index vectors. Within every 4-byte group
// the first byte is a source-byte index (0..15, ascending across the table)
// and the remaining three bytes are 16. The lookups that consume this table
// use a single 16-byte table register, for which an index of 16 is out of
// range and TBL writes 0; each group therefore produces the selected source
// byte followed by three zero bytes.
.section .rodata
// On AArch64 GNU as, `.align 4` requests 2^4 = 16-byte alignment.
.align 4
.Ltbl_zext:
    // Vector 0: source bytes 0..3
    .byte 0, 16, 16, 16, 1, 16, 16, 16, 2, 16, 16, 16, 3, 16, 16, 16
    // Vector 1: source bytes 4..7
    .byte 4, 16, 16, 16, 5, 16, 16, 16, 6, 16, 16, 16, 7, 16, 16, 16
    // Vector 2: source bytes 8..11
    .byte 8, 16, 16, 16, 9, 16, 16, 16, 10, 16, 16, 16, 11, 16, 16, 16
    // Vector 3: source bytes 12..15
    .byte 12, 16, 16, 16, 13, 16, 16, 16, 14, 16, 16, 16, 15, 16, 16, 16

.text

//-----------------------------------------------------------------------------
// Option D: Using tbl (2 cycles theoretical)
//
// Symbol demangles to: benchmark_option_d(__Uint8x16_t, void*)
// In:    v0 (q0) = 16 input bytes
//        x0      = destination buffer; 64 bytes are written at [x0]..[x0+63]
// Out:   x0      = PMCCNTR_EL0 delta across the measured region when built
//                  with PMU_ENABLED, otherwise 0
// Clobb: x2, x3, v1-v4, v20-v23
//
// The measured region is 12 unrolled copies of four TBL lookups plus two
// paired stores; every copy writes the same 64 bytes at [x0].
//
// NOTE(review): no .global/.type directive for this symbol is visible in
// this span — presumably it is declared elsewhere in the file; confirm.
// NOTE(review): reading PMCCNTR_EL0 from user space presumably requires that
// EL0 access to the PMU has already been enabled by privileged software;
// otherwise the `mrs` is expected to trap — confirm on the target platform.
//-----------------------------------------------------------------------------
.align 4
_Z18benchmark_option_d12__Uint8x16_tPv:
    // Input bytes are already in q0

    // Load lookup tables into private registers (v20-v23)
    // x3 = address of .Ltbl_zext (PC-relative page + low 12 bits)
    adrp x3, .Ltbl_zext
    add x3, x3, :lo12:.Ltbl_zext
    // v20..v23 receive the four 16-byte index vectors in table order
    ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x3]

    // DSB to ensure table load is complete before starting measurement
    dsb sy
    
    // Start cycle counter
    // x2 = cycle count at the start of the measured region
#ifdef PMU_ENABLED
    mrs x2, pmccntr_el0
#endif
    
.rept 12
    // Use table lookups to zero-extend
    // v1 <- source bytes 4..7, v2 <- 8..11, v3 <- 12..15, v4 <- 0..3,
    // each byte widened to a 32-bit lane
    tbl v1.16b, {v0.16b}, v21.16b
    tbl v2.16b, {v0.16b}, v22.16b
    tbl v3.16b, {v0.16b}, v23.16b
    tbl v4.16b, {v0.16b}, v20.16b
    
    // Store results
    // Stored as q4, q1, q2, q3 so memory holds elements 0..15 in order
    stp q4, q1, [x0]
    stp q2, q3, [x0, #32]
.endr
    
    // End cycle counter
    // NOTE(review): there is no barrier between the last store and this
    // counter read, so the read may not be ordered after the measured
    // instructions — confirm whether an ISB/DSB is wanted here.
#ifdef PMU_ENABLED
    mrs x0, pmccntr_el0
    sub x0, x0, x2
#else
    // Built without PMU_ENABLED: report 0 cycles
    mov x0, #0
#endif
    ret

Please help. Thank you.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions