Stream: git-wasmtime

Topic: wasmtime / issue #5957 Invariant loads are not hoisted ou...


view this post on Zulip Wasmtime GitHub notifications bot (Mar 07 2023 at 22:13):

alexcrichton opened issue #5957:

Currently Cranelift will not hoist invariant loads outside of loops, meaning that an invariant load is reloaded on each iteration of the loop. For example with this code:

(module
  (memory 0)
  (func $add_all (param $ptr i32) (param $len i32) (result i32)
    (local $result i32)
    loop $l
      (local.set $result
        (i32.add
          (local.get $result)
          (i32.load (local.get $ptr))
        )
      )

      (local.set $ptr
        (i32.add
          (local.get $ptr)
          (i32.const 4)
        )
      )

      (br_if $l
        (local.tee $len
          (i32.sub
            (local.get $len)
            (i32.const 1)
          )
        )
      )
    end
    (local.get $result)
  )
)

this machine code is generated on x86_64:

$ wasmtime compile foo.wat && objdump -S foo.cwasm

foo.cwasm:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <_wasm_function_0>:
       0:       55                      push   %rbp
       1:       48 89 e5                mov    %rsp,%rbp
       4:       31 c0                   xor    %eax,%eax
       6:       49 89 c9                mov    %rcx,%r9
       9:       48 8b 4f 50             mov    0x50(%rdi),%rcx
       d:       44 8b c2                mov    %edx,%r8d
      10:       42 8b 4c 01 00          mov    0x0(%rcx,%r8,1),%ecx
      15:       41 83 e9 01             sub    $0x1,%r9d
      19:       01 c8                   add    %ecx,%eax
      1b:       83 c2 04                add    $0x4,%edx
      1e:       45 85 c9                test   %r9d,%r9d
      21:       0f 85 e2 ff ff ff       jne    9 <_wasm_function_0+0x9>
      27:       48 89 ec                mov    %rbp,%rsp
      2a:       5d                      pop    %rbp
      2b:       c3                      retq
        ...

The instruction at 9 is loading the base pointer of memory from the VMContext, in %rdi, into %rcx. This load happens on all iterations of the loop, however.

The Cranelift function being lowered here is (according to RUST_LOG):

function u0:0(i64 vmctx, i64, i32, i32) -> i32 fast {
    gv0 = vmctx
    gv1 = load.i64 notrap aligned readonly gv0+8
    gv2 = load.i64 notrap aligned gv1
    gv3 = vmctx
    gv4 = load.i64 notrap aligned readonly gv3+80
    stack_limit = gv2

                                block0(v0: i64, v1: i64, v2: i32, v3: i32):
@001f                               v5 = iconst.i32 0
@0021                               jump block2(v5, v2, v3)  ; v5 = 0

                                block2(v6: i32, v7: i32, v15: i32):
@0027                               v9 = load.i64 notrap aligned readonly v0+80
@0027                               v8 = uextend.i64 v7
@0027                               v10 = iadd v9, v8
@0027                               v11 = load.i32 little heap v10
@0036                               v16 = iconst.i32 1
@0038                               v17 = isub v15, v16  ; v16 = 1
@002a                               v12 = iadd v6, v11
                                    v4 -> v12
@002f                               v13 = iconst.i32 4
@0031                               v14 = iadd v7, v13  ; v13 = 4
@003b                               brif v17, block2(v12, v14, v17), block4

                                block4:
@003d                               jump block3

                                block3:
@0040                               jump block1

                                block1:
@0040                               return v12
}

Specifically the instruction v9 = load.i64 notrap aligned readonly v0+80 is not hoisted outside of the loop, despite it being readonly, notrap, and v0 not being dependent on any loop value.

view this post on Zulip Wasmtime GitHub notifications bot (Mar 07 2023 at 22:13):

alexcrichton labeled issue #5957:

Currently Cranelift will not hoist invariant loads outside of loops, meaning that an invariant load is reloaded on each iteration of the loop. For example with this code:

(module
  (memory 0)
  (func $add_all (param $ptr i32) (param $len i32) (result i32)
    (local $result i32)
    loop $l
      (local.set $result
        (i32.add
          (local.get $result)
          (i32.load (local.get $ptr))
        )
      )

      (local.set $ptr
        (i32.add
          (local.get $ptr)
          (i32.const 4)
        )
      )

      (br_if $l
        (local.tee $len
          (i32.sub
            (local.get $len)
            (i32.const 1)
          )
        )
      )
    end
    (local.get $result)
  )
)

this machine code is generated on x86_64:

$ wasmtime compile foo.wat && objdump -S foo.cwasm

foo.cwasm:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <_wasm_function_0>:
       0:       55                      push   %rbp
       1:       48 89 e5                mov    %rsp,%rbp
       4:       31 c0                   xor    %eax,%eax
       6:       49 89 c9                mov    %rcx,%r9
       9:       48 8b 4f 50             mov    0x50(%rdi),%rcx
       d:       44 8b c2                mov    %edx,%r8d
      10:       42 8b 4c 01 00          mov    0x0(%rcx,%r8,1),%ecx
      15:       41 83 e9 01             sub    $0x1,%r9d
      19:       01 c8                   add    %ecx,%eax
      1b:       83 c2 04                add    $0x4,%edx
      1e:       45 85 c9                test   %r9d,%r9d
      21:       0f 85 e2 ff ff ff       jne    9 <_wasm_function_0+0x9>
      27:       48 89 ec                mov    %rbp,%rsp
      2a:       5d                      pop    %rbp
      2b:       c3                      retq
        ...

The instruction at 9 is loading the base pointer of memory from the VMContext, in %rdi, into %rcx. This load happens on all iterations of the loop, however.

The Cranelift function being lowered here is (according to RUST_LOG):

function u0:0(i64 vmctx, i64, i32, i32) -> i32 fast {
    gv0 = vmctx
    gv1 = load.i64 notrap aligned readonly gv0+8
    gv2 = load.i64 notrap aligned gv1
    gv3 = vmctx
    gv4 = load.i64 notrap aligned readonly gv3+80
    stack_limit = gv2

                                block0(v0: i64, v1: i64, v2: i32, v3: i32):
@001f                               v5 = iconst.i32 0
@0021                               jump block2(v5, v2, v3)  ; v5 = 0

                                block2(v6: i32, v7: i32, v15: i32):
@0027                               v9 = load.i64 notrap aligned readonly v0+80
@0027                               v8 = uextend.i64 v7
@0027                               v10 = iadd v9, v8
@0027                               v11 = load.i32 little heap v10
@0036                               v16 = iconst.i32 1
@0038                               v17 = isub v15, v16  ; v16 = 1
@002a                               v12 = iadd v6, v11
                                    v4 -> v12
@002f                               v13 = iconst.i32 4
@0031                               v14 = iadd v7, v13  ; v13 = 4
@003b                               brif v17, block2(v12, v14, v17), block4

                                block4:
@003d                               jump block3

                                block3:
@0040                               jump block1

                                block1:
@0040                               return v12
}

Specifically the instruction v9 = load.i64 notrap aligned readonly v0+80 is not hoisted outside of the loop, despite it being readonly, notrap, and v0 not being dependent on any loop value.

view this post on Zulip Wasmtime GitHub notifications bot (Mar 07 2023 at 22:59):

cfallin commented on issue #5957:

Ideally this line should cause loads with notrap and readonly to be considered pure, and thus hoistable. I can look into this eventually if someone else doesn't get to it first (just wanted to point to that line as a starting point).

view this post on Zulip Wasmtime GitHub notifications bot (Mar 07 2023 at 23:29):

alexcrichton commented on issue #5957:

Brief debugging on my part reveals that the line mentioned above is working as intended, but here, during loop hoisting, the argument to the load instruction, v0, is listed as:

 629 [cranelift/codegen/src/egraph/elaborate.rs:408] &value = ElaboratedValue {
 630     in_block: block2,
 631     value: v0,
 632 }

where I think in_block: block2 is preventing the hoist?

view this post on Zulip Wasmtime GitHub notifications bot (Mar 07 2023 at 23:36):

alexcrichton commented on issue #5957:

While I'm not sure that https://github.com/bytecodealliance/wasmtime/pull/5960 is 100% correct, it at least fixes the issue here.

view this post on Zulip Wasmtime GitHub notifications bot (Mar 08 2023 at 00:35):

alexcrichton closed issue #5957:

Currently Cranelift will not hoist invariant loads outside of loops, meaning that an invariant load is reloaded on each iteration of the loop. For example with this code:

(module
  (memory 0)
  (func $add_all (param $ptr i32) (param $len i32) (result i32)
    (local $result i32)
    loop $l
      (local.set $result
        (i32.add
          (local.get $result)
          (i32.load (local.get $ptr))
        )
      )

      (local.set $ptr
        (i32.add
          (local.get $ptr)
          (i32.const 4)
        )
      )

      (br_if $l
        (local.tee $len
          (i32.sub
            (local.get $len)
            (i32.const 1)
          )
        )
      )
    end
    (local.get $result)
  )
)

this machine code is generated on x86_64:

$ wasmtime compile foo.wat && objdump -S foo.cwasm

foo.cwasm:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <_wasm_function_0>:
       0:       55                      push   %rbp
       1:       48 89 e5                mov    %rsp,%rbp
       4:       31 c0                   xor    %eax,%eax
       6:       49 89 c9                mov    %rcx,%r9
       9:       48 8b 4f 50             mov    0x50(%rdi),%rcx
       d:       44 8b c2                mov    %edx,%r8d
      10:       42 8b 4c 01 00          mov    0x0(%rcx,%r8,1),%ecx
      15:       41 83 e9 01             sub    $0x1,%r9d
      19:       01 c8                   add    %ecx,%eax
      1b:       83 c2 04                add    $0x4,%edx
      1e:       45 85 c9                test   %r9d,%r9d
      21:       0f 85 e2 ff ff ff       jne    9 <_wasm_function_0+0x9>
      27:       48 89 ec                mov    %rbp,%rsp
      2a:       5d                      pop    %rbp
      2b:       c3                      retq
        ...

The instruction at 9 is loading the base pointer of memory from the VMContext, in %rdi, into %rcx. This load happens on all iterations of the loop, however.

The Cranelift function being lowered here is (according to RUST_LOG):

function u0:0(i64 vmctx, i64, i32, i32) -> i32 fast {
    gv0 = vmctx
    gv1 = load.i64 notrap aligned readonly gv0+8
    gv2 = load.i64 notrap aligned gv1
    gv3 = vmctx
    gv4 = load.i64 notrap aligned readonly gv3+80
    stack_limit = gv2

                                block0(v0: i64, v1: i64, v2: i32, v3: i32):
@001f                               v5 = iconst.i32 0
@0021                               jump block2(v5, v2, v3)  ; v5 = 0

                                block2(v6: i32, v7: i32, v15: i32):
@0027                               v9 = load.i64 notrap aligned readonly v0+80
@0027                               v8 = uextend.i64 v7
@0027                               v10 = iadd v9, v8
@0027                               v11 = load.i32 little heap v10
@0036                               v16 = iconst.i32 1
@0038                               v17 = isub v15, v16  ; v16 = 1
@002a                               v12 = iadd v6, v11
                                    v4 -> v12
@002f                               v13 = iconst.i32 4
@0031                               v14 = iadd v7, v13  ; v13 = 4
@003b                               brif v17, block2(v12, v14, v17), block4

                                block4:
@003d                               jump block3

                                block3:
@0040                               jump block1

                                block1:
@0040                               return v12
}

Specifically the instruction v9 = load.i64 notrap aligned readonly v0+80 is not hoisted outside of the loop, despite it being readonly, notrap, and v0 not being dependent on any loop value.


Last updated: Nov 22 2024 at 16:03 UTC