Stream: cranelift

Topic: branch optimization, is there a flag for that?


Adel Prokurov (Aug 30 2025 at 01:56):

Hello! I'm compiling a Scheme, and there are cases where primitive ops like #%heap-object? are emitted separately from the branch during compilation, so Cranelift produces code like this:

 47d:   41 80 fa 12             cmp    $0x12,%r10b
 481:   0f 94 c1                sete   %cl
 484:   84 c9                   test   %cl,%cl
 486:   0f 84 84 00 00 00       je     510 <fn0:false+0x220>

Is there any way to avoid that and get something more decent, i.e. a cmp followed directly by a je? Or is my only option to make changes to my codegen?

Alex Crichton (Aug 30 2025 at 02:26):

Would it be possible to extract the CLIF that leads to this codegen? This might be a missing pattern in optimizations or a missing pattern in lowering, and the CLIF can help narrow it down as to which

Adel Prokurov (Aug 30 2025 at 02:34):

Alex Crichton said:

Would it be possible to extract the CLIF that leads to this codegen? This might be a missing pattern in optimizations or a missing pattern in lowering, and the CLIF can help narrow it down as to which

I would not be able to extract it at the moment, but it looks like this during codegen:

fn is_false(ssa: &mut SSABuilder, val: ir::Value) -> ir::Value {
    // Compare against the tagged #f constant, then materialize the
    // result as a tagged Scheme boolean via select.
    let is_false = ssa.builder.ins().icmp_imm(IntCC::Equal, val, Value::VALUE_FALSE);
    let false_ = ssa.builder.ins().iconst(types::I64, Value::VALUE_FALSE);
    let true_ = ssa.builder.ins().iconst(types::I64, Value::VALUE_TRUE);
    ssa.builder.ins().select(is_false, false_, true_)
}

fn emit_if(ssa: &mut SSABuilder, var: LVar, kcons: LVar, kalt: LVar) {
    let check = is_false(ssa, ssa.var(var));
    // Re-compare the materialized boolean against #f to get a flag for brif.
    let cond = ssa.builder.ins().icmp_imm(IntCC::Equal, check, Value::VALUE_FALSE);
    ssa.builder.ins().brif(cond, ...)
}

Adel Prokurov (Aug 30 2025 at 02:35):

so it seems there's no rewrite for select + icmp + brif

Adel Prokurov (Aug 30 2025 at 02:36):

I have a half-working solution at the moment, but I was expecting Cranelift to be able to rewrite such a combination of select + brif.
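
A codegen-side workaround along those lines is to branch on the icmp flag directly, skipping the select and the second compare. A minimal sketch, reusing the SSABuilder wrapper from above and assuming (hypothetically) that kcons/kalt resolve to argument-less Blocks:

fn emit_if_direct(ssa: &mut SSABuilder, var: LVar, kcons: Block, kalt: Block) {
    let val = ssa.var(var);
    // One comparison: "is the value #f?" yields an i8 flag...
    let is_false = ssa.builder.ins().icmp_imm(IntCC::Equal, val, Value::VALUE_FALSE);
    // ...which brif consumes directly, so lowering can fuse cmp + je.
    // Take kalt when the value is #f, otherwise kcons.
    ssa.builder.ins().brif(is_false, kalt, &[], kcons, &[]);
}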

Alex Crichton (Aug 30 2025 at 03:26):

A naive attempt:

function %a(i32) -> i64 {
block0(v0: i32):
  v1 = icmp_imm eq v0, 0
  v2 = iconst.i64 0
  v3 = iconst.i64 1
  v4 = select v1, v2, v3
  v5 = icmp_imm eq v4, 0
  brif v5, block1, block2

block1:
  v6 = iconst.i64 1
  return v6

block2:
  v7 = iconst.i64 2
  return v7
}

shows what I would expect with cargo run -p cranelift-tools compile foo.clif --target x86_64 -D --set opt_level=speed:

Disassembly of 32 bytes <%a>:
   0:   55                      pushq   %rbp
   1:   48 89 e5                movq    %rsp, %rbp
   4:   85 ff                   testl   %edi, %edi
   6:   0f 84 0a 00 00 00       je      0x16
   c:   b8 02 00 00 00          movl    $2, %eax
  11:   48 89 ec                movq    %rbp, %rsp
  14:   5d                      popq    %rbp
  15:   c3                      retq
  16:   b8 01 00 00 00          movl    $1, %eax
  1b:   48 89 ec                movq    %rbp, %rsp
  1e:   5d                      popq    %rbp
  1f:   c3                      retq

To confirm, are you enabling optimizations in Cranelift?
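
(For context: opt_level defaults to "none", so Cranelift's mid-end rewrites don't run unless it's set. A minimal sketch of enabling it programmatically, assuming cranelift-codegen's settings API and the cranelift-native crate:)

use cranelift_codegen::settings::{self, Configurable};

// Build shared flags with optimizations turned on (the default is "none").
let mut builder = settings::builder();
builder.set("opt_level", "speed").unwrap();
let flags = settings::Flags::new(builder);
// Host-targeted ISA; hand this to JITBuilder / ObjectModule as usual.
let isa = cranelift_native::builder().unwrap().finish(flags).unwrap();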

Adel Prokurov (Aug 30 2025 at 03:27):

yes, I do. Let me actually replicate this behavior again

Adel Prokurov (Aug 30 2025 at 03:32):

Okay so here's the IR:

block2:
    v21 = get_pinned_reg.i64
    v50 = iconst.i64 -562949953421310
    v24 = band.i64 v9, v50  ; v50 = -562949953421310
    v49 = iconst.i64 0
    v25 = icmp ne v24, v49  ; v49 = 0
    v23 = iconst.i8 0
    brif v25, block7(v23), block6  ; v23 = 0

and the corresponding assembly:

 844:   4d 89 ee                mov    %r13,%r14
 847:   4c 89 ff                mov    %r15,%rdi
 84a:   45 33 c9                xor    %r9d,%r9d
 84d:   48 85 35 d4 00 00 00    test   %rsi,0xd4(%rip)        # 928 <cont1:after_call17+0x110>
 854:   0f 85 0a 00 00 00       jne    864 <cont1:after_call17+0x4c>
 85a:   48 0f b6 06             movzbq (%rsi),%rax
 85e:   3c 12                   cmp    $0x12,%al
 860:   41 0f 94 c1             sete   %r9b
 864:   45 84 c9                test   %r9b,%r9b
 867:   0f 84 84 00 00 00       je     8f1 <cont1:after_call17+0xd9>

Adel Prokurov (Aug 30 2025 at 03:33):

oh wait, I'm having trouble mapping the assembly to the IR; let me simplify it further

Adel Prokurov (Aug 30 2025 at 03:38):

Okay, I can't really reduce it; this is what I can get, though:

block2:
    v21 = get_pinned_reg.i64
    v22 = call fn5(v21)
    v23 = symbol_value.i64 gv0
    store v22, v23
    v28 = get_pinned_reg.i64
    v27 = func_addr.i64 fn10
    v81 = iconst.i64 0
    v20 = iconst.i64 1
    v29 = call fn9(v28, v27, v81, v20)  ; v81 = 0, v20 = 1
    v30 = load.i64 v29+16
    v31 = symbol_value.i64 gv1
    v32 = load.i64 v31
    v33 = get_pinned_reg.i64
    v34 = call fn6(v33, v32, v29)
    v35 = symbol_value.i64 gv2
    v36 = load.i64 v35
    v79 = iconst.i64 -562949953421310
    v37 = band v36, v79  ; v79 = -562949953421310
    v38 = icmp eq v37, v81  ; v81 = 0
    v40 = iconst.i64 7
    v39 = iconst.i64 6
    v41 = select v38, v40, v39  ; v40 = 7, v39 = 6
    brif v41, block6, block7

and the corresponding assembly:

 371:   4c 8b 1d 50 2c 00 00    mov    0x2c50(%rip),%r11        # 2fc8 <capy_define@Base>
 378:   48 89 c2                mov    %rax,%rdx
 37b:   41 ff d3                call   *%r11
 37e:   48 8d 1d 8b 2c 00 00    lea    0x2c8b(%rip),%rbx        # 3010 <cache_cell1>
 385:   4c 8b 1b                mov    (%rbx),%r11
 388:   41 ba 06 00 00 00       mov    $0x6,%r10d
 38e:   4c 85 1d 0b 01 00 00    test   %r11,0x10b(%rip)        # 4a0 <fn0:false+0x1b0>
 395:   4c 0f 44 15 0b 01 00    cmove  0x10b(%rip),%r10        # 4a8 <fn0:false+0x1b8>
 39c:   00
 39d:   4d 85 d2                test   %r10,%r10
 3a0:   0f 85 4b 00 00 00       jne    3f1 <fn0:false+0x101>

Alex Crichton (Aug 30 2025 at 03:40):

Would you be able to further extract that to a standalone function?

Adel Prokurov (Aug 30 2025 at 03:41):

yea, let me try that

Adel Prokurov (Aug 30 2025 at 03:43):

how can I dump the assembly with cranelift-jit though? Or is my only option to use cranelift-object?

Alex Crichton (Aug 30 2025 at 03:47):

ah, that I'm not sure about, sorry
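
(One option that may work here: cranelift-codegen's Context has a set_disasm switch which, as I understand it, records Cranelift's own VCode disassembly during compilation; that's Cranelift's pseudo-asm, not bytes from an external disassembler. A sketch, assuming the usual cranelift-jit Module setup:)

let mut ctx = module.make_context();
// ... build ctx.func with cranelift-frontend as usual ...
ctx.set_disasm(true); // record disassembly while compiling
module.define_function(func_id, &mut ctx).unwrap();
// The captured text, when present, lives on the compiled code.
if let Some(vcode) = &ctx.compiled_code().unwrap().vcode {
    println!("{vcode}");
}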

Adel Prokurov (Aug 30 2025 at 03:53):

function u0:0(i64, i64, i64) -> i64 tail {
    sig0 = (i64, i64) -> i64 tail
    sig1 = (i64) -> i64 tail

block0(v0: i64, v1: i64, v2: i64):
    v3 = band_imm v2, -562949953421310
    v4 = icmp_imm eq v3, 0
    v5 = iconst.i64 6
    v6 = iconst.i64 7
    v7 = select v4, v6, v5  ; v6 = 7, v5 = 6
    v8 = icmp_imm eq v7, 6
    brif v8, block2, block1

block1:
    return_call_indirect.i64 sig0, v0(v1, v2)

block2:
    return_call_indirect.i64 sig1, v0(v2)
}

and the disassembly:

0000000000000000 <.Lfn0>:
   0:   55                      push   %rbp
   1:   48 89 e5                mov    %rsp,%rbp
   4:   49 89 fa                mov    %rdi,%r10
   7:   41 b9 06 00 00 00       mov    $0x6,%r9d
   d:   48 85 15 2c 00 00 00    test   %rdx,0x2c(%rip)        # 40 <.Lfn0+0x40>
  14:   49 89 d3                mov    %rdx,%r11
  17:   4c 0f 44 0d 29 00 00    cmove  0x29(%rip),%r9        # 48 <.Lfn0+0x48>
  1e:   00
  1f:   49 83 f9 06             cmp    $0x6,%r9
  23:   0f 84 0d 00 00 00       je     36 <.Lfn0+0x36>
  29:   48 89 f7                mov    %rsi,%rdi
  2c:   4c 89 de                mov    %r11,%rsi
  2f:   48 89 ec                mov    %rbp,%rsp
  32:   5d                      pop    %rbp
  33:   41 ff e2                jmp    *%r10
  36:   4c 89 df                mov    %r11,%rdi
  39:   48 89 ec                mov    %rbp,%rsp
  3c:   5d                      pop    %rbp
  3d:   41 ff e2                jmp    *%r10

Alex Crichton (Aug 30 2025 at 04:07):

thanks! Simplified a tiny bit more to:

function %a(i64) -> i64 {
block0(v2: i64):
    v3 = band_imm v2, -562949953421310
    v4 = icmp_imm eq v3, 0
    v5 = iconst.i64 6
    v6 = iconst.i64 7
    v7 = select v4, v6, v5  ; v6 = 7, v5 = 6
    v8 = icmp_imm eq v7, 6
    brif v8, block2, block1

block1:
    v9 = iconst.i64 100
    jump block3(v9)

block2:
    v10 = iconst.i64 101
    jump block3(v10)

block3(v11: i64):
    return v11
}

Looks like we don't have an optimization rule for this, no, but it shouldn't be too hard to add! Mind filing an issue for this?
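
(For reference, the requested rewrite would fold the select and the second icmp into the branch condition; hand-deriving it on the reduced case above gives roughly this, as expected output rather than anything Cranelift produces today:)

block0(v2: i64):
    v3 = band_imm v2, -562949953421310
    v4 = icmp_imm eq v3, 0
    brif v4, block1, block2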

Adel Prokurov (Aug 30 2025 at 04:07):

sure, will do

Adel Prokurov (Aug 30 2025 at 04:12):

https://github.com/bytecodealliance/wasmtime/issues/11578

