wasmtime / issue #5957 Invariant loads are not hoisted ou... · git-wasmtime

Stream: git-wasmtime

Topic: wasmtime / issue #5957 Invariant loads are not hoisted ou...

Wasmtime GitHub notifications bot (Mar 07 2023 at 22:13):

alexcrichton opened issue #5957:

Currently Cranelift will not hoist invariant loads outside of loops, meaning that an invariant load is reloaded on each iteration of the loop. For example with this code:

(module
  (memory 0)
  (func $add_all (param $ptr i32) (param $len i32) (result i32)
    (local $result i32)
    loop $l
      (local.set $result
        (i32.add
          (local.get $result)
          (i32.load (local.get $ptr))
        )
      )

      (local.set $ptr
        (i32.add
          (local.get $ptr)
          (i32.const 4)
        )
      )

      (br_if $l
        (local.tee $len
          (i32.sub
            (local.get $len)
            (i32.const 1)
          )
        )
      )
    end
    (local.get $result)
  )
)

this machine code is generated on x86_64:

$ wasmtime compile foo.wat && objdump -S foo.cwasm

foo.cwasm:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <_wasm_function_0>:
       0:       55                      push   %rbp
       1:       48 89 e5                mov    %rsp,%rbp
       4:       31 c0                   xor    %eax,%eax
       6:       49 89 c9                mov    %rcx,%r9
       9:       48 8b 4f 50             mov    0x50(%rdi),%rcx
       d:       44 8b c2                mov    %edx,%r8d
      10:       42 8b 4c 01 00          mov    0x0(%rcx,%r8,1),%ecx
      15:       41 83 e9 01             sub    $0x1,%r9d
      19:       01 c8                   add    %ecx,%eax
      1b:       83 c2 04                add    $0x4,%edx
      1e:       45 85 c9                test   %r9d,%r9d
      21:       0f 85 e2 ff ff ff       jne    9 <_wasm_function_0+0x9>
      27:       48 89 ec                mov    %rbp,%rsp
      2a:       5d                      pop    %rbp
      2b:       c3                      retq
        ...

The instruction at 9 is loading the base pointer of memory from the VMContext, in %rdi, into %rcx. This load happens on all iterations of the loop, however.

The Cranelift function being lowered here is (according to RUST_LOG):

function u0:0(i64 vmctx, i64, i32, i32) -> i32 fast {
    gv0 = vmctx
    gv1 = load.i64 notrap aligned readonly gv0+8
    gv2 = load.i64 notrap aligned gv1
    gv3 = vmctx
    gv4 = load.i64 notrap aligned readonly gv3+80
    stack_limit = gv2

                                block0(v0: i64, v1: i64, v2: i32, v3: i32):
@001f                               v5 = iconst.i32 0
@0021                               jump block2(v5, v2, v3)  ; v5 = 0

                                block2(v6: i32, v7: i32, v15: i32):
@0027                               v9 = load.i64 notrap aligned readonly v0+80
@0027                               v8 = uextend.i64 v7
@0027                               v10 = iadd v9, v8
@0027                               v11 = load.i32 little heap v10
@0036                               v16 = iconst.i32 1
@0038                               v17 = isub v15, v16  ; v16 = 1
@002a                               v12 = iadd v6, v11
                                    v4 -> v12
@002f                               v13 = iconst.i32 4
@0031                               v14 = iadd v7, v13  ; v13 = 4
@003b                               brif v17, block2(v12, v14, v17), block4

                                block4:
@003d                               jump block3

                                block3:
@0040                               jump block1

                                block1:
@0040                               return v12
}

Specifically the instruction v9 = load.i64 notrap aligned readonly v0+80 is not hoisted outside of the loop, despite it being readonly, notrap, and v0 not being dependent on any loop value.

Wasmtime GitHub notifications bot (Mar 07 2023 at 22:13):

alexcrichton labeled issue #5957:

Currently Cranelift will not hoist invariant loads outside of loops, meaning that an invariant load is reloaded on each iteration of the loop. For example with this code:

(module
  (memory 0)
  (func $add_all (param $ptr i32) (param $len i32) (result i32)
    (local $result i32)
    loop $l
      (local.set $result
        (i32.add
          (local.get $result)
          (i32.load (local.get $ptr))
        )
      )

      (local.set $ptr
        (i32.add
          (local.get $ptr)
          (i32.const 4)
        )
      )

      (br_if $l
        (local.tee $len
          (i32.sub
            (local.get $len)
            (i32.const 1)
          )
        )
      )
    end
    (local.get $result)
  )
)

this machine code is generated on x86_64:

$ wasmtime compile foo.wat && objdump -S foo.cwasm

foo.cwasm:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <_wasm_function_0>:
       0:       55                      push   %rbp
       1:       48 89 e5                mov    %rsp,%rbp
       4:       31 c0                   xor    %eax,%eax
       6:       49 89 c9                mov    %rcx,%r9
       9:       48 8b 4f 50             mov    0x50(%rdi),%rcx
       d:       44 8b c2                mov    %edx,%r8d
      10:       42 8b 4c 01 00          mov    0x0(%rcx,%r8,1),%ecx
      15:       41 83 e9 01             sub    $0x1,%r9d
      19:       01 c8                   add    %ecx,%eax
      1b:       83 c2 04                add    $0x4,%edx
      1e:       45 85 c9                test   %r9d,%r9d
      21:       0f 85 e2 ff ff ff       jne    9 <_wasm_function_0+0x9>
      27:       48 89 ec                mov    %rbp,%rsp
      2a:       5d                      pop    %rbp
      2b:       c3                      retq
        ...

The instruction at 9 is loading the base pointer of memory from the VMContext, in %rdi, into %rcx. This load happens on all iterations of the loop, however.

The Cranelift function being lowered here is (according to RUST_LOG):

function u0:0(i64 vmctx, i64, i32, i32) -> i32 fast {
    gv0 = vmctx
    gv1 = load.i64 notrap aligned readonly gv0+8
    gv2 = load.i64 notrap aligned gv1
    gv3 = vmctx
    gv4 = load.i64 notrap aligned readonly gv3+80
    stack_limit = gv2

                                block0(v0: i64, v1: i64, v2: i32, v3: i32):
@001f                               v5 = iconst.i32 0
@0021                               jump block2(v5, v2, v3)  ; v5 = 0

                                block2(v6: i32, v7: i32, v15: i32):
@0027                               v9 = load.i64 notrap aligned readonly v0+80
@0027                               v8 = uextend.i64 v7
@0027                               v10 = iadd v9, v8
@0027                               v11 = load.i32 little heap v10
@0036                               v16 = iconst.i32 1
@0038                               v17 = isub v15, v16  ; v16 = 1
@002a                               v12 = iadd v6, v11
                                    v4 -> v12
@002f                               v13 = iconst.i32 4
@0031                               v14 = iadd v7, v13  ; v13 = 4
@003b                               brif v17, block2(v12, v14, v17), block4

                                block4:
@003d                               jump block3

                                block3:
@0040                               jump block1

                                block1:
@0040                               return v12
}

Specifically the instruction v9 = load.i64 notrap aligned readonly v0+80 is not hoisted outside of the loop, despite it being readonly, notrap, and v0 not being dependent on any loop value.

Wasmtime GitHub notifications bot (Mar 07 2023 at 22:59):

cfallin commented on issue #5957:

Ideally this line should cause loads with notrap and readonly to be considered pure, and thus hoistable. I can look into this eventually if someone else doesn't get to it first (just wanted to point to that line as a starting point).

Wasmtime GitHub notifications bot (Mar 07 2023 at 23:29):

alexcrichton commented on issue #5957:

Brief debugging on my part reveals that line is working as intended, but here, during loop hoisting, the argument to the load instruction, v0, is listed as:
 629 [cranelift/codegen/src/egraph/elaborate.rs:408] &value = ElaboratedValue {
 630     in_block: block2,
 631     value: v0,
 632 }
where I think in_block: block2 is preventing the hoist?

Wasmtime GitHub notifications bot (Mar 07 2023 at 23:36):

alexcrichton commented on issue #5957:

While I'm not sure that https://github.com/bytecodealliance/wasmtime/pull/5960 is 100% correct, it fixes the issue here at least.

Wasmtime GitHub notifications bot (Mar 08 2023 at 00:35):

alexcrichton closed issue #5957:

Currently Cranelift will not hoist invariant loads outside of loops, meaning that an invariant load is reloaded on each iteration of the loop. For example with this code:

(module
  (memory 0)
  (func $add_all (param $ptr i32) (param $len i32) (result i32)
    (local $result i32)
    loop $l
      (local.set $result
        (i32.add
          (local.get $result)
          (i32.load (local.get $ptr))
        )
      )

      (local.set $ptr
        (i32.add
          (local.get $ptr)
          (i32.const 4)
        )
      )

      (br_if $l
        (local.tee $len
          (i32.sub
            (local.get $len)
            (i32.const 1)
          )
        )
      )
    end
    (local.get $result)
  )
)

this machine code is generated on x86_64:

$ wasmtime compile foo.wat && objdump -S foo.cwasm

foo.cwasm:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <_wasm_function_0>:
       0:       55                      push   %rbp
       1:       48 89 e5                mov    %rsp,%rbp
       4:       31 c0                   xor    %eax,%eax
       6:       49 89 c9                mov    %rcx,%r9
       9:       48 8b 4f 50             mov    0x50(%rdi),%rcx
       d:       44 8b c2                mov    %edx,%r8d
      10:       42 8b 4c 01 00          mov    0x0(%rcx,%r8,1),%ecx
      15:       41 83 e9 01             sub    $0x1,%r9d
      19:       01 c8                   add    %ecx,%eax
      1b:       83 c2 04                add    $0x4,%edx
      1e:       45 85 c9                test   %r9d,%r9d
      21:       0f 85 e2 ff ff ff       jne    9 <_wasm_function_0+0x9>
      27:       48 89 ec                mov    %rbp,%rsp
      2a:       5d                      pop    %rbp
      2b:       c3                      retq
        ...

The instruction at 9 is loading the base pointer of memory from the VMContext, in %rdi, into %rcx. This load happens on all iterations of the loop, however.

The Cranelift function being lowered here is (according to RUST_LOG):

function u0:0(i64 vmctx, i64, i32, i32) -> i32 fast {
    gv0 = vmctx
    gv1 = load.i64 notrap aligned readonly gv0+8
    gv2 = load.i64 notrap aligned gv1
    gv3 = vmctx
    gv4 = load.i64 notrap aligned readonly gv3+80
    stack_limit = gv2

                                block0(v0: i64, v1: i64, v2: i32, v3: i32):
@001f                               v5 = iconst.i32 0
@0021                               jump block2(v5, v2, v3)  ; v5 = 0

                                block2(v6: i32, v7: i32, v15: i32):
@0027                               v9 = load.i64 notrap aligned readonly v0+80
@0027                               v8 = uextend.i64 v7
@0027                               v10 = iadd v9, v8
@0027                               v11 = load.i32 little heap v10
@0036                               v16 = iconst.i32 1
@0038                               v17 = isub v15, v16  ; v16 = 1
@002a                               v12 = iadd v6, v11
                                    v4 -> v12
@002f                               v13 = iconst.i32 4
@0031                               v14 = iadd v7, v13  ; v13 = 4
@003b                               brif v17, block2(v12, v14, v17), block4

                                block4:
@003d                               jump block3

                                block3:
@0040                               jump block1

                                block1:
@0040                               return v12
}

Specifically the instruction v9 = load.i64 notrap aligned readonly v0+80 is not hoisted outside of the loop, despite it being readonly, notrap, and v0 not being dependent on any loop value.

Last updated: Apr 18 2025 at 13:08 UTC