Stream: git-wasmtime

Topic: wasmtime / issue #3963 Cranelift: Nonsensical duplication...


view this post on Zulip Wasmtime GitHub notifications bot (Mar 25 2022 at 18:00):

bjorn3 opened issue #3963:

.clif Test Case

function u0:0() -> i128 system_v {
    sig0 = (f32) -> i128 system_v
    fn0 = u0:1 sig0

block0:
    v0 = f32const +NaN
    v1 = call fn0(v0)
    v2 = fcmp.f32 eq v0, v0
    v3 = iconst.i128 0
    v4 = select v2, v1, v3
    return v4
}

Steps to Reproduce

Expected Results

No duplicate instructions.

Actual Results

Disassembly of 73 bytes:
   0:   55                      push    rbp
   1:   48 89 e5                mov     rbp, rsp
   4:   be 00 00 c0 7f          mov     esi, 0x7fc00000
   9:   66 0f 6e c6             movd    xmm0, esi
   d:   48 be 00 00 00 00 00 00 00 00
                                movabs  rsi, 0
  17:   ff d6                   call    rsi
  19:   be 00 00 c0 7f          mov     esi, 0x7fc00000
  1e:   66 0f 6e c6             movd    xmm0, esi
  22:   be 00 00 c0 7f          mov     esi, 0x7fc00000
  27:   66 0f 6e ce             movd    xmm1, esi
  2b:   48 31 ff                xor     rdi, rdi
  2e:   48 31 f6                xor     rsi, rsi
  31:   0f 2e c8                ucomiss xmm1, xmm0
  34:   48 0f 45 c7             cmovne  rax, rdi
  38:   48 0f 45 c7             cmovne  rax, rdi
  3c:   48 0f 45 d6             cmovne  rdx, rsi
  40:   48 0f 45 d6             cmovne  rdx, rsi
  44:   48 89 ec                mov     rsp, rbp
  47:   5d                      pop     rbp
  48:   c3                      ret

The mov esi, 0x7fc00000, cmovne rax, rdi and cmovne rdx, rsi instructions are duplicated.

Versions and Environment

Cranelift version or commit: 105163cc15a152ac6f7f3bc5ebd57a6c6a09e7f3

Operating system: N/A

Architecture: x86_64

Extra Info

Regalloc input:

VCode_ShowWithRRU {{
  Entry block: 0
Block 0:
  (original IR block: block0)
  (instruction range: 0 .. 27)
  Inst 0:   movl    $2143289344, %v21Jl
  Inst 1:   movd    %v21Jl, %v20V
  Inst 2:   movaps  %v20V, %xmm0
  Inst 3:   load_ext_name u0:1+0, %v22J
  Inst 4:   call    *%v22J
  Inst 5:   movq    %rax, %v1J
  Inst 6:   movq    %rdx, %v2J
  Inst 7:   movl    $2143289344, %v11Jl
  Inst 8:   movd    %v11Jl, %v10V
  Inst 9:   movl    $2143289344, %v13Jl
  Inst 10:   movd    %v13Jl, %v12V
  Inst 11:   xorq    %v14J, %v14J
  Inst 12:   xorq    %v15J, %v15J
  Inst 13:   ucomiss %v10V, %v12V
  Inst 14:   movq    %v1J, %v18J
  Inst 15:   cmovnzq %v14J, %v18J
  Inst 16:   movq    %v18J, %v6J
  Inst 17:   cmovnzq %v14J, %v6J
  Inst 18:   movq    %v2J, %v19J
  Inst 19:   cmovnzq %v15J, %v19J
  Inst 20:   movq    %v19J, %v7J
  Inst 21:   cmovnzq %v15J, %v7J
  Inst 22:   movq    %v6J, %v8J
  Inst 23:   movq    %v7J, %v9J
  Inst 24:   movq    %v8J, %rax
  Inst 25:   movq    %v9J, %rdx
  Inst 26:   ret
}}

If I use i64 instead of i128 the second cmovne turns into cmovp:

Disassembly of 62 bytes:
   0:   55                      push    rbp
   1:   48 89 e5                mov     rbp, rsp
   4:   be 00 00 c0 7f          mov     esi, 0x7fc00000
   9:   66 0f 6e c6             movd    xmm0, esi
   d:   48 be 00 00 00 00 00 00 00 00
                                movabs  rsi, 0
  17:   ff d6                   call    rsi
  19:   be 00 00 c0 7f          mov     esi, 0x7fc00000
  1e:   66 0f 6e c6             movd    xmm0, esi
  22:   be 00 00 c0 7f          mov     esi, 0x7fc00000
  27:   66 0f 6e ce             movd    xmm1, esi
  2b:   48 31 f6                xor     rsi, rsi
  2e:   0f 2e c8                ucomiss xmm1, xmm0
  31:   48 0f 45 c6             cmovne  rax, rsi
  35:   48 0f 4a c6             cmovp   rax, rsi
  39:   48 89 ec                mov     rsp, rbp
  3c:   5d                      pop     rbp
  3d:   c3                      ret

view this post on Zulip Wasmtime GitHub notifications bot (Mar 25 2022 at 18:00):

bjorn3 labeled issue #3963:

.clif Test Case

function u0:0() -> i128 system_v {
    sig0 = (f32) -> i128 system_v
    fn0 = u0:1 sig0

block0:
    v0 = f32const +NaN
    v1 = call fn0(v0)
    v2 = fcmp.f32 eq v0, v0
    v3 = iconst.i128 0
    v4 = select v2, v1, v3
    return v4
}

Steps to Reproduce

Expected Results

No duplicate instructions.

Actual Results

Disassembly of 73 bytes:
   0:   55                      push    rbp
   1:   48 89 e5                mov     rbp, rsp
   4:   be 00 00 c0 7f          mov     esi, 0x7fc00000
   9:   66 0f 6e c6             movd    xmm0, esi
   d:   48 be 00 00 00 00 00 00 00 00
                                movabs  rsi, 0
  17:   ff d6                   call    rsi
  19:   be 00 00 c0 7f          mov     esi, 0x7fc00000
  1e:   66 0f 6e c6             movd    xmm0, esi
  22:   be 00 00 c0 7f          mov     esi, 0x7fc00000
  27:   66 0f 6e ce             movd    xmm1, esi
  2b:   48 31 ff                xor     rdi, rdi
  2e:   48 31 f6                xor     rsi, rsi
  31:   0f 2e c8                ucomiss xmm1, xmm0
  34:   48 0f 45 c7             cmovne  rax, rdi
  38:   48 0f 45 c7             cmovne  rax, rdi
  3c:   48 0f 45 d6             cmovne  rdx, rsi
  40:   48 0f 45 d6             cmovne  rdx, rsi
  44:   48 89 ec                mov     rsp, rbp
  47:   5d                      pop     rbp
  48:   c3                      ret

The mov esi, 0x7fc00000, cmovne rax, rdi and cmovne rdx, rsi instructions are duplicated.

Versions and Environment

Cranelift version or commit: 105163cc15a152ac6f7f3bc5ebd57a6c6a09e7f3

Operating system: N/A

Architecture: x86_64

Extra Info

Regalloc input:

VCode_ShowWithRRU {{
  Entry block: 0
Block 0:
  (original IR block: block0)
  (instruction range: 0 .. 27)
  Inst 0:   movl    $2143289344, %v21Jl
  Inst 1:   movd    %v21Jl, %v20V
  Inst 2:   movaps  %v20V, %xmm0
  Inst 3:   load_ext_name u0:1+0, %v22J
  Inst 4:   call    *%v22J
  Inst 5:   movq    %rax, %v1J
  Inst 6:   movq    %rdx, %v2J
  Inst 7:   movl    $2143289344, %v11Jl
  Inst 8:   movd    %v11Jl, %v10V
  Inst 9:   movl    $2143289344, %v13Jl
  Inst 10:   movd    %v13Jl, %v12V
  Inst 11:   xorq    %v14J, %v14J
  Inst 12:   xorq    %v15J, %v15J
  Inst 13:   ucomiss %v10V, %v12V
  Inst 14:   movq    %v1J, %v18J
  Inst 15:   cmovnzq %v14J, %v18J
  Inst 16:   movq    %v18J, %v6J
  Inst 17:   cmovnzq %v14J, %v6J
  Inst 18:   movq    %v2J, %v19J
  Inst 19:   cmovnzq %v15J, %v19J
  Inst 20:   movq    %v19J, %v7J
  Inst 21:   cmovnzq %v15J, %v7J
  Inst 22:   movq    %v6J, %v8J
  Inst 23:   movq    %v7J, %v9J
  Inst 24:   movq    %v8J, %rax
  Inst 25:   movq    %v9J, %rdx
  Inst 26:   ret
}}

If I use i64 instead of i128 the second cmovne turns into cmovp:

Disassembly of 62 bytes:
   0:   55                      push    rbp
   1:   48 89 e5                mov     rbp, rsp
   4:   be 00 00 c0 7f          mov     esi, 0x7fc00000
   9:   66 0f 6e c6             movd    xmm0, esi
   d:   48 be 00 00 00 00 00 00 00 00
                                movabs  rsi, 0
  17:   ff d6                   call    rsi
  19:   be 00 00 c0 7f          mov     esi, 0x7fc00000
  1e:   66 0f 6e c6             movd    xmm0, esi
  22:   be 00 00 c0 7f          mov     esi, 0x7fc00000
  27:   66 0f 6e ce             movd    xmm1, esi
  2b:   48 31 f6                xor     rsi, rsi
  2e:   0f 2e c8                ucomiss xmm1, xmm0
  31:   48 0f 45 c6             cmovne  rax, rsi
  35:   48 0f 4a c6             cmovp   rax, rsi
  39:   48 89 ec                mov     rsp, rbp
  3c:   5d                      pop     rbp
  3d:   c3                      ret

view this post on Zulip Wasmtime GitHub notifications bot (Mar 25 2022 at 18:00):

bjorn3 labeled issue #3963:

.clif Test Case

function u0:0() -> i128 system_v {
    sig0 = (f32) -> i128 system_v
    fn0 = u0:1 sig0

block0:
    v0 = f32const +NaN
    v1 = call fn0(v0)
    v2 = fcmp.f32 eq v0, v0
    v3 = iconst.i128 0
    v4 = select v2, v1, v3
    return v4
}

Steps to Reproduce

Expected Results

No duplicate instructions.

Actual Results

Disassembly of 73 bytes:
   0:   55                      push    rbp
   1:   48 89 e5                mov     rbp, rsp
   4:   be 00 00 c0 7f          mov     esi, 0x7fc00000
   9:   66 0f 6e c6             movd    xmm0, esi
   d:   48 be 00 00 00 00 00 00 00 00
                                movabs  rsi, 0
  17:   ff d6                   call    rsi
  19:   be 00 00 c0 7f          mov     esi, 0x7fc00000
  1e:   66 0f 6e c6             movd    xmm0, esi
  22:   be 00 00 c0 7f          mov     esi, 0x7fc00000
  27:   66 0f 6e ce             movd    xmm1, esi
  2b:   48 31 ff                xor     rdi, rdi
  2e:   48 31 f6                xor     rsi, rsi
  31:   0f 2e c8                ucomiss xmm1, xmm0
  34:   48 0f 45 c7             cmovne  rax, rdi
  38:   48 0f 45 c7             cmovne  rax, rdi
  3c:   48 0f 45 d6             cmovne  rdx, rsi
  40:   48 0f 45 d6             cmovne  rdx, rsi
  44:   48 89 ec                mov     rsp, rbp
  47:   5d                      pop     rbp
  48:   c3                      ret

The mov esi, 0x7fc00000, cmovne rax, rdi and cmovne rdx, rsi instructions are duplicated.

Versions and Environment

Cranelift version or commit: 105163cc15a152ac6f7f3bc5ebd57a6c6a09e7f3

Operating system: N/A

Architecture: x86_64

Extra Info

Regalloc input:

VCode_ShowWithRRU {{
  Entry block: 0
Block 0:
  (original IR block: block0)
  (instruction range: 0 .. 27)
  Inst 0:   movl    $2143289344, %v21Jl
  Inst 1:   movd    %v21Jl, %v20V
  Inst 2:   movaps  %v20V, %xmm0
  Inst 3:   load_ext_name u0:1+0, %v22J
  Inst 4:   call    *%v22J
  Inst 5:   movq    %rax, %v1J
  Inst 6:   movq    %rdx, %v2J
  Inst 7:   movl    $2143289344, %v11Jl
  Inst 8:   movd    %v11Jl, %v10V
  Inst 9:   movl    $2143289344, %v13Jl
  Inst 10:   movd    %v13Jl, %v12V
  Inst 11:   xorq    %v14J, %v14J
  Inst 12:   xorq    %v15J, %v15J
  Inst 13:   ucomiss %v10V, %v12V
  Inst 14:   movq    %v1J, %v18J
  Inst 15:   cmovnzq %v14J, %v18J
  Inst 16:   movq    %v18J, %v6J
  Inst 17:   cmovnzq %v14J, %v6J
  Inst 18:   movq    %v2J, %v19J
  Inst 19:   cmovnzq %v15J, %v19J
  Inst 20:   movq    %v19J, %v7J
  Inst 21:   cmovnzq %v15J, %v7J
  Inst 22:   movq    %v6J, %v8J
  Inst 23:   movq    %v7J, %v9J
  Inst 24:   movq    %v8J, %rax
  Inst 25:   movq    %v9J, %rdx
  Inst 26:   ret
}}

If I use i64 instead of i128 the second cmovne turns into cmovp:

Disassembly of 62 bytes:
   0:   55                      push    rbp
   1:   48 89 e5                mov     rbp, rsp
   4:   be 00 00 c0 7f          mov     esi, 0x7fc00000
   9:   66 0f 6e c6             movd    xmm0, esi
   d:   48 be 00 00 00 00 00 00 00 00
                                movabs  rsi, 0
  17:   ff d6                   call    rsi
  19:   be 00 00 c0 7f          mov     esi, 0x7fc00000
  1e:   66 0f 6e c6             movd    xmm0, esi
  22:   be 00 00 c0 7f          mov     esi, 0x7fc00000
  27:   66 0f 6e ce             movd    xmm1, esi
  2b:   48 31 f6                xor     rsi, rsi
  2e:   0f 2e c8                ucomiss xmm1, xmm0
  31:   48 0f 45 c6             cmovne  rax, rsi
  35:   48 0f 4a c6             cmovp   rax, rsi
  39:   48 89 ec                mov     rsp, rbp
  3c:   5d                      pop     rbp
  3d:   c3                      ret

view this post on Zulip Wasmtime GitHub notifications bot (Mar 25 2022 at 18:09):

bjorn3 commented on issue #3963:

If I use i64 instead of i128 the second cmovne turns into cmovp:

Turns out the i128 version was a miscompilation. Splitting the i128 into two i64 and the individually selecting them makes my code behave as expected.


Last updated: Oct 23 2024 at 20:03 UTC