Stream: git-cranelift

Topic: cranelift / Issue #983 Change in register allocation pref...


view this post on Zulip GitHub (Feb 28 2020 at 23:26):

alexcrichton transferred Issue #983:

To start, some clif:

; guest_func_10
function u0:10(i64 vmctx, i32, i32) -> i32 system_v {
    sig0 = (i64 vmctx, i32, i32) -> i32 system_v
    fn0 = colocated u0:10 sig0

                                ebb0(v0: i64, v1: i32, v2: i32):
@09ca                               v4 = iconst.i32 0
@09d0                               v5 = icmp_imm eq v1, 0
@09d0                               v6 = bint.i32 v5
@09d1                               brnz v6, ebb2(v2)
@09d3                               jump ebb3(v1, v2)

                                ebb3(v7: i32, v10: i32):
@09d7                               v8 = iconst.i32 -1
@09d9                               v9 = iadd v7, v8
@09de                               brz v10, ebb5
@09e4                               v11 = iconst.i32 -1
@09e6                               v12 = iadd v10, v11
@09e7                               v13 = call fn0(v0, v7, v12)
@09ef                               brnz v9, ebb3(v9, v13)
@09f1                               jump ebb2(v13)

                                ebb5:
@09f4                               v14 = iconst.i32 1
@09fc                               brnz.i32 v9, ebb3(v9, v14)
@09fe                               jump ebb4

                                ebb4:
@09ff                               jump ebb2(v14)

                                ebb2(v15: i32):
@0a02                               v16 = iconst.i32 1
@0a04                               v17 = iadd v15, v16
@0a05                               jump ebb1(v17)

                                ebb1(v3: i32):
@0a05                               return v3

this had been producing

sym.guest_func_10 ();
           0x00003ac0 push rbp
           0x00003ac1 mov rbp, rsp
           0x00003ac4 sub rsp, 0x10
           0x00003ac8 mov qword [rsp], rdi
           0x00003ad0 mov dword [local_ch], edx
           0x00003ad7 test esi, esi
       ,=< 0x00003ad9 je 0x3b56
       |   0x00003adb mov eax, 1
       |   0x00003ae0 mov dword [local_ch], eax
     ..--> 0x00003ae7 mov eax, esi
     ::|   0x00003ae9 add eax, -1
     ::|   0x00003aec mov dword [local_8h], eax
     ::|   0x00003af3 test edx, edx
    ,====< 0x00003af5 je 0x3b31
    |::|   0x00003af7 add edx, -1
    |::|   0x00003afa mov rax, qword [rsp]
    |::|   0x00003b02 mov rdi, rax
    |::|   0x00003b05 call sym.guest_func_10
    |::|   0x00003b0a mov ecx, dword [local_8h]
    |::|   0x00003b11 mov edx, dword [local_8h]
    |::|   0x00003b18 mov esi, ecx
    |::|   0x00003b1b mov ecx, edx
    |::|   0x00003b1e mov edx, eax
    |::|   0x00003b21 mov eax, ecx
    |::|   0x00003b24 test eax, eax
    |`===< 0x00003b26 jne 0x3ae7
    | :|   0x00003b28 mov dword [local_ch], edx
    |,===< 0x00003b2f jmp 0x3b56
    `----> 0x00003b31 mov eax, dword [local_ch]
     |:|   0x00003b38 mov ecx, dword [local_8h]
     |:|   0x00003b3f mov edx, dword [local_8h]
     |:|   0x00003b46 mov esi, ecx
     |:|   0x00003b49 mov ecx, edx
     |:|   0x00003b4c mov edx, eax
     |:|   0x00003b4f mov eax, ecx
     |:|   0x00003b52 test eax, eax
     |`==< 0x00003b54 jne 0x3ae7
     `-`-> 0x00003b56 mov eax, dword [local_ch]
           0x00003b5d add eax, 1
           0x00003b60 add rsp, 0x10
           0x00003b64 pop rbp
           0x00003b65 ret

with redundant reloads

In 164f91a the redundant reloads are removed (:tada:), but the register allocator ends up picking (on x86-64) callee-saved registers to load into, which then produces

sym.guest_func_10 ();
           0x00003ac0 push rbp
           0x00003ac1 mov rbp, rsp
           0x00003ac4 push r12
           0x00003ac6 push r13
           0x00003ac8 push r14
           0x00003aca sub rsp, 0x18
           0x00003ace mov qword [local_8h], rdi
           0x00003ad6 mov dword [local_14h], edx
           0x00003add test esi, esi
       ,=< 0x00003adf je 0x3b48
       |   0x00003ae1 mov eax, 1
       |   0x00003ae6 mov dword [local_14h], eax
     ..--> 0x00003aed mov eax, esi
     ::|   0x00003aef add eax, -1
     ::|   0x00003af2 mov dword [local_10h], eax
     ::|   0x00003af9 test edx, edx
    ,====< 0x00003afb je 0x3b2f
    |::|   0x00003afd add edx, -1
    |::|   0x00003b00 mov r14, qword [local_8h]
    |::|   0x00003b08 mov rdi, r14
    |::|   0x00003b0b call sym.guest_func_10
    |::|   0x00003b10 mov r14d, dword [local_10h]
    |::|   0x00003b18 mov r13d, r14d
    |::|   0x00003b1b mov esi, r14d
    |::|   0x00003b1e mov edx, eax
    |::|   0x00003b21 test r13d, r13d
    |`===< 0x00003b24 jne 0x3aed
    | :|   0x00003b26 mov dword [local_14h], edx
    |,===< 0x00003b2d jmp 0x3b48
    `----> 0x00003b2f mov r14d, dword [local_14h]
     |:|   0x00003b37 mov r13d, eax
     |:|   0x00003b3a mov r12d, eax
     |:|   0x00003b3d mov esi, r13d
     |:|   0x00003b40 mov edx, r14d
     |:|   0x00003b43 test r12d, r12d
     |`==< 0x00003b46 jne 0x3aed
     `-`-> 0x00003b48 mov r14d, dword [local_14h]
           0x00003b50 add r14d, 1
           0x00003b54 mov eax, r14d
           0x00003b57 add rsp, 0x18
           0x00003b5b pop r14
           0x00003b5d pop r13
           0x00003b5f pop r12
           0x00003b61 pop rbp
           0x00003b62 ret

Because callee-saved registers are used here, we end up with a net +3 memory accesses from the added push/pops (three pushes plus three pops), even though three redundant loads were removed.

In a particularly contorted benchmark (Lucet's ackermann benchmark) this ends up dropping runtime by ~30%, though I think in most cases it's probably about the same before and after.

edit: didn't include addresses initially, which didn't read well


Last updated: Oct 23 2024 at 20:03 UTC