Stream: git-wasmtime

Topic: wasmtime / issue #7976 Cranelift: unknown memory problem ...


view this post on Zulip Wasmtime GitHub notifications bot (Feb 22 2024 at 01:02):

meijies added the cranelift label to Issue #7976.

view this post on Zulip Wasmtime GitHub notifications bot (Feb 22 2024 at 01:02):

meijies added the bug label to Issue #7976.

view this post on Zulip Wasmtime GitHub notifications bot (Feb 22 2024 at 01:02):

meijies opened issue #7976:

Summary

I am trying to compile and execute a function which evaluates the expression (a + b) / c < d on each element of the arrays. First, I generated the IR of this function with the F64 type, and that works fine. Then I converted the function to an F64X2 variant to see if there is any performance improvement, but an unexpected memory problem occurs. My first intuition was that the problem lies in the function call section, but I could not find anything wrong there.

failure .clif Test Case for F64X2

  pushq   %rbp
  unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
  movq    %rsp, %rbp
  unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
block0:
  movq    %rdx, %rax
  vpshufd $68, %xmm0, %xmm3
  vpshufd $68, %xmm1, %xmm2
  jmp     label1
block1:
  vmovupd 0(%rdi), %xmm4
  vaddpd  %xmm4, 0(%rsi), %xmm4
  vdivpd  %xmm4, %xmm3, %xmm4
  vcmppd  $1, %xmm4, %xmm2, %xmm4
  vmovdqu %xmm4, 0(%rdx)
  lea     1(%rcx), %rcx
  lea     16(%rdi), %rdi
  lea     16(%rsi), %rsi
  lea     2(%rdx), %rdx
  cmpq    %r8, %rcx
  jl      label2; j label3
block2:
  jmp     label1
block3:
  movq    %rbp, %rsp
  popq    %rbp
  ret

Steps to Reproduce

  1. the test cases.
#[test]
    fn test_jit_expr_on_array_v3_64() {
        // Both inputs come from the same helper call with identical arguments.
        let batch_len = 64;
        let lhs = create_primitive_array::<Float64Type>(batch_len, 0.);
        let rhs = create_primitive_array::<Float64Type>(batch_len, 0.);
        let (divisor, threshold) = (3.0_f64, 3.0_f64);
        // Compile the kernel once and reuse the same function pointer for every call.
        let kernel = jit_expr_v3();
        for _ in 0..100000 {
            // The produced values are discarded; the loop only exercises repeated invocation.
            let (_values, _) = jit_expr_on_array_v3(&lhs, &rhs, divisor, threshold, kernel)
                .unwrap()
                .into_parts();
        }
    }

The problem occurs in the third iteration; below is the debug screenshot.
![image](https://github.com/bytecodealliance/wasmtime/assets/13784260/3a2d97c6-b616-4902-8fca-7b6730d9e26d)

  1. jit compiled function.
/// Builds the F64X2 variant of the kernel with Cranelift and returns the finalized code as a
/// Rust function pointer.
///
/// The generated function takes, in order: two input pointers, one output pointer, two `F64`
/// scalars and two `I64` loop counters. Per iteration it loads one `F64X2` from each input,
/// computes `(lhs + rhs) / p4 < p5` lane-wise, stores the comparison result through the output
/// pointer, and advances all three pointers.
///
/// NOTE(review): the returned type is a bare `fn`, i.e. the Rust ABI; presumably
/// `extern "C" fn` is what the generated code actually implements — confirm against the calling
/// convention configured in `CodegenContext`.
pub fn jit_expr_v3() -> fn(*const u8, *const u8, *const bool, f64, f64, i64, i64) {
    let mut ctx = CodegenContext::builder().debug().finish();
    // `data_type` is the vector type of the loaded inputs; `result_type` is only used for its
    // byte size, which becomes the per-iteration stride of the output pointer.
    let data_type = types::F64X2;
    let result_type = types::I8X2;

    // Signature: (ptr, ptr, ptr [StructReturn], f64, f64, i64, i64) with no return values.
    // NOTE(review): the third parameter is tagged `ArgumentPurpose::StructReturn` although it is
    // not the first parameter; a plain `AbiParam::new` may be what is intended — confirm.
    let mut func_ctx = ctx.create_func_gen_ctx(
        "op_v3",
        vec![
            AbiParam::new(ctx.ptype()),
            AbiParam::new(ctx.ptype()),
            AbiParam::special(ctx.ptype(), ArgumentPurpose::StructReturn),
            AbiParam::new(types::F64),
            AbiParam::new(types::F64),
            AbiParam::new(types::I64),
            AbiParam::new(types::I64),
        ],
        vec![],
    );
    let entry_block = func_ctx.builder.create_block();
    let body_block = func_ctx.builder.create_block();
    let exit_block = func_ctx.builder.create_block();

    // Entry block: receive the function parameters as block parameters.
    func_ctx.builder.switch_to_block(entry_block);
    func_ctx
        .builder
        .append_block_params_for_function_params(entry_block);
    let p1 = func_ctx.builder.block_params(entry_block)[0];
    let p2 = func_ctx.builder.block_params(entry_block)[1];
    let p3 = func_ctx.builder.block_params(entry_block)[2];
    let p4 = func_ctx.builder.block_params(entry_block)[3];
    let p5 = func_ctx.builder.block_params(entry_block)[4];
    let p6 = func_ctx.builder.block_params(entry_block)[5];
    let p7 = func_ctx.builder.block_params(entry_block)[6];

    // Broadcast the two scalar operands to `data_type` once, outside the loop.
    let p4 = func_ctx.builder.ins().splat(data_type, p4);
    let p5 = func_ctx.builder.ins().splat(data_type, p5);

    // Loop-carried state of `body_block`: three pointers, the two splatted operands, and the
    // two loop counters.
    func_ctx
        .builder
        .append_block_param(body_block, func_ctx.ptype);
    func_ctx
        .builder
        .append_block_param(body_block, func_ctx.ptype);
    func_ctx
        .builder
        .append_block_param(body_block, func_ctx.ptype);
    func_ctx.builder.append_block_param(body_block, data_type);
    func_ctx.builder.append_block_param(body_block, data_type);
    func_ctx.builder.append_block_param(body_block, types::I64);
    func_ctx.builder.append_block_param(body_block, types::I64);
    // Unconditional jump: the loop body always runs at least once, whatever `p6`/`p7` are.
    func_ctx
        .builder
        .ins()
        .jump(body_block, &[p1, p2, p3, p4, p5, p6, p7]);
    // Only `entry_block` is sealed in this function; presumably `func_ctx.finalize` seals the
    // remaining blocks — TODO confirm.
    func_ctx.builder.seal_block(entry_block);

    func_ctx.builder.switch_to_block(body_block);
    let lhs_ref = func_ctx.builder.block_params(body_block)[0];
    let rhs_ref = func_ctx.builder.block_params(body_block)[1];
    let result_ref = func_ctx.builder.block_params(body_block)[2];
    let to_div = func_ctx.builder.block_params(body_block)[3];
    let to_lt = func_ctx.builder.block_params(body_block)[4];
    let start = func_ctx.builder.block_params(body_block)[5];
    let end = func_ctx.builder.block_params(body_block)[6];

    // (lhs + rhs) / to_div < to_lt, evaluated on one `data_type` vector per iteration.
    let lhs = func_ctx
        .builder
        .ins()
        .load(data_type, MemFlags::new(), lhs_ref, 0);
    let rhs = func_ctx
        .builder
        .ins()
        .load(data_type, MemFlags::new(), rhs_ref, 0);
    let sum = func_ctx.builder.ins().fadd(lhs, rhs);
    let div_result = func_ctx.builder.ins().fdiv(sum, to_div);
    let result = func_ctx
        .builder
        .ins()
        .fcmp(FloatCC::LessThan, div_result, to_lt);

    // NOTE(review): `result` is stored unchanged. The F64X2 disassembly in this report shows a
    // full-register store (`vmovdqu %xmm4, 0(%rdx)`) while the output pointer only advances by
    // `result_type.bytes()` (`lea 2(%rdx), %rdx`), so each store appears to cover more bytes than
    // the stride accounts for — likely the out-of-bounds write being reported. Confirm, and
    // narrow the comparison mask to the intended output width before storing.
    func_ctx
        .builder
        .ins()
        .store(MemFlags::new(), result, result_ref, 0);

    // Byte strides for the input pointers and for the output pointer respectively.
    let offset = func_ctx
        .builder
        .ins()
        .iconst(types::I64, data_type.bytes() as i64);

    let result_offset = func_ctx
        .builder
        .ins()
        .iconst(types::I64, result_type.bytes() as i64);

    let next_lhs_ref = func_ctx.builder.ins().iadd(offset, lhs_ref);
    let next_rhs_ref = func_ctx.builder.ins().iadd(offset, rhs_ref);
    let next_result_ref = func_ctx.builder.ins().iadd(result_offset, result_ref);

    // Bottom-tested loop: increment the counter, then branch back while `next_start < end`
    // (signed comparison), carrying the advanced pointers and unchanged operands.
    let next_start = func_ctx.builder.ins().iadd_imm(start, 1);
    let cond = func_ctx
        .builder
        .ins()
        .icmp(IntCC::SignedLessThan, next_start, end);
    func_ctx.builder.ins().brif(
        cond,
        body_block,
        &[
            next_lhs_ref,
            next_rhs_ref,
            next_result_ref,
            to_div,
            to_lt,
            next_start,
            end,
        ],
        exit_block,
        &[],
    );
    // No instruction is emitted into `exit_block` here; presumably `func_ctx.finalize` adds the
    // return — TODO confirm.
    func_ctx.builder.switch_to_block(exit_block);
    let func_id = func_ctx.finalize(&[]);
    let code = ctx.finalize(func_id);
    // NOTE(review): the transmute relies on `code` being the address of a finalized function
    // whose ABI matches the target `fn` type; nothing in this function checks that.
    unsafe { mem::transmute::<_, fn(*const u8, *const u8, *const bool, f64, f64, i64, i64)>(code) }
}
  1. call jit compiled function
/// Runs the JIT-compiled kernel `op` over `a` and `b` and wraps its output in a `BooleanArray`.
///
/// `op` is invoked as `op(a_ptr, b_ptr, res_ptr, c, d, 0, a.len() / 2)`: the two value buffers,
/// the output buffer, the two scalar operands, and the start/end loop counters.
///
/// # Errors
/// Returns `ArrowError::ComputeError` when `a` and `b` differ in length or when they are empty.
///
/// NOTE(review): for odd lengths the output holds `a.len() - 1` values while `nulls` is still
/// derived from the full-length inputs — confirm this is intended.
pub fn jit_expr_on_array_v3(
    a: &Float64Array,
    b: &Float64Array,
    c: f64,
    d: f64,
    op: fn(*const u8, *const u8, *const bool, f64, f64, i64, i64),
) -> Result<BooleanArray, ArrowError> {
    if a.len() != b.len() {
        return Err(ArrowError::ComputeError(
            "Cannot perform binary operation on arrays of different length".to_string(),
        ));
    }

    if a.is_empty() {
        // Distinct message: the inputs have equal length here, they are just empty.
        return Err(ArrowError::ComputeError(
            "Cannot perform binary operation on empty arrays".to_string(),
        ));
    }
    let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref());
    let a_ptr = a.values().inner().as_ptr();
    let b_ptr = b.values().inner().as_ptr();
    let mut res: Vec<bool> = Vec::with_capacity((a.len() / 2) * 2);
    // The kernel writes through this pointer, so it must come from `as_mut_ptr`: memory behind a
    // pointer obtained from `Vec::as_ptr` must never be written to.
    let res_ptr = res.as_mut_ptr();
    op(a_ptr, b_ptr, res_ptr, c, d, 0, (a.len() / 2) as i64);
    // SAFETY: sound only if `op` initialised exactly the first `(a.len() / 2) * 2` elements with
    // valid `bool` bit patterns (0 or 1) and wrote nothing outside the reserved capacity. That
    // contract is not checked here — it must be upheld by the code generator.
    unsafe {
        res.set_len((a.len() / 2) * 2);
    }
    let buffer = BooleanBuffer::from_iter(res);
    Ok(BooleanArray::new(buffer, nulls))
}

successful .clif Test Case for F64

  pushq   %rbp
  unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
  movq    %rsp, %rbp
  unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
block0:
  movq    %rdx, %r9
  movq    %r9, %rax
  jmp     label1
block1:
  vmovsd  0(%rdi), %xmm2
  vaddsd  %xmm2, 0(%rsi), %xmm2
  vdivsd  %xmm2, %xmm0, %xmm2
  ucomisd %xmm2, %xmm1
  setnbe  %r10b
  movb    %r10b, 0(%rax)
  lea     1(%rcx), %rcx
  lea     8(%rdi), %rdi
  lea     8(%rsi), %rsi
  lea     1(%rax), %rax
  cmpq    %r8, %rcx
  jl      label2; j label3
block2:
  jmp     label1
block3:
  movq    %r9, %rax
  movq    %rbp, %rsp
  popq    %rbp
  ret

Steps to Reproduce

1) the test cases.

    fn test_jit_expr_on_array_v3_64() {
        // Both inputs come from the same helper call with identical arguments.
        let batch_len = 64;
        let lhs = create_primitive_array::<Float64Type>(batch_len, 0.);
        let rhs = create_primitive_array::<Float64Type>(batch_len, 0.);
        let (divisor, threshold) = (3.0_f64, 3.0_f64);
        // Compile the kernel once and reuse the same function pointer for every call.
        let kernel = jit_expr_v3();
        for _ in 0..100000 {
            // The produced values are discarded; the loop only exercises repeated invocation.
            let (_values, _) = jit_expr_on_array_v3(&lhs, &rhs, divisor, threshold, kernel)
                .unwrap()
                .into_parts();
        }
    }

pub fn jit_expr_on_array_v3(
    a: &Float64Array,
    b: &Float64Array,
    c: f64,
    d: f64,
    op: fn(*const u8, *const u8, *const bool, f64, f64, i64, i64),
) -> Result<BooleanArray, ArrowError> {
    if a.len() != b.len() {
        return Err(ArrowError::ComputeError(
            "Cannot perform binary operation on arrays of different length".to_string(),
        ));
    }

    if a.is_empty() {
        return Err(ArrowError::ComputeError(
            "Cannot perform binary operation on arrays of different length".to_string(),
        ));
    }
    let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref());
    let a_ptr = a.values().inner().as_ptr();
    let b_ptr = b.values().inner().as_ptr();
    let mut res: Vec<bool> = Vec::with_capacity(a.len());
    let res_ptr = res.as_ptr();
    op(a_ptr, b_ptr, res_ptr, c, d, 0, (a.len()) as i64);
    unsafe {
        res.set_len(a.len());
    }
    let buffer = BooleanBuffer::from_iter(res);
    Ok(Boole
[message truncated]

view this post on Zulip Wasmtime GitHub notifications bot (Feb 22 2024 at 01:07):

meijies edited issue #7976:

Summary

I am trying to compile and execute a function which evaluates the expression (a + b) / c < d on each element of the arrays. First, I generated the IR of this function with the F64 type, and that works fine. Then I converted the function to an F64X2 variant to see if there is any performance improvement, but an unexpected memory problem occurs. My first intuition was that the problem lies in the function call section, but I could not find anything wrong there.

failure .clif Test Case for F64X2

  pushq   %rbp
  unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
  movq    %rsp, %rbp
  unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
block0:
  movq    %rdx, %rax
  vpshufd $68, %xmm0, %xmm3
  vpshufd $68, %xmm1, %xmm2
  jmp     label1
block1:
  vmovupd 0(%rdi), %xmm4
  vaddpd  %xmm4, 0(%rsi), %xmm4
  vdivpd  %xmm4, %xmm3, %xmm4
  vcmppd  $1, %xmm4, %xmm2, %xmm4
  vmovdqu %xmm4, 0(%rdx)
  lea     1(%rcx), %rcx
  lea     16(%rdi), %rdi
  lea     16(%rsi), %rsi
  lea     2(%rdx), %rdx
  cmpq    %r8, %rcx
  jl      label2; j label3
block2:
  jmp     label1
block3:
  movq    %rbp, %rsp
  popq    %rbp
  ret

Steps to Reproduce

  1. the test cases.
#[test]
    fn test_jit_expr_on_array_v3_64() {
        // Both inputs come from the same helper call with identical arguments.
        let batch_len = 64;
        let lhs = create_primitive_array::<Float64Type>(batch_len, 0.);
        let rhs = create_primitive_array::<Float64Type>(batch_len, 0.);
        let (divisor, threshold) = (3.0_f64, 3.0_f64);
        // Compile the kernel once and reuse the same function pointer for every call.
        let kernel = jit_expr_v3();
        for _ in 0..100000 {
            // The produced values are discarded; the loop only exercises repeated invocation.
            let (_values, _) = jit_expr_on_array_v3(&lhs, &rhs, divisor, threshold, kernel)
                .unwrap()
                .into_parts();
        }
    }

The problem occurs in the third iteration; below is the debug screenshot.
![image](https://github.com/bytecodealliance/wasmtime/assets/13784260/3a2d97c6-b616-4902-8fca-7b6730d9e26d)

  1. jit compiled function.
/// Builds the F64X2 variant of the kernel with Cranelift and returns the finalized code as a
/// Rust function pointer.
///
/// The generated function takes, in order: two input pointers, one output pointer, two `F64`
/// scalars and two `I64` loop counters. Per iteration it loads one `F64X2` from each input,
/// computes `(lhs + rhs) / p4 < p5` lane-wise, stores the comparison result through the output
/// pointer, and advances all three pointers.
///
/// NOTE(review): the returned type is a bare `fn`, i.e. the Rust ABI; presumably
/// `extern "C" fn` is what the generated code actually implements — confirm against the calling
/// convention configured in `CodegenContext`.
pub fn jit_expr_v3() -> fn(*const u8, *const u8, *const bool, f64, f64, i64, i64) {
    let mut ctx = CodegenContext::builder().debug().finish();
    // `data_type` is the vector type of the loaded inputs; `result_type` is only used for its
    // byte size, which becomes the per-iteration stride of the output pointer.
    let data_type = types::F64X2;
    let result_type = types::I8X2;

    // Signature: (ptr, ptr, ptr [StructReturn], f64, f64, i64, i64) with no return values.
    // NOTE(review): the third parameter is tagged `ArgumentPurpose::StructReturn` although it is
    // not the first parameter; a plain `AbiParam::new` may be what is intended — confirm.
    let mut func_ctx = ctx.create_func_gen_ctx(
        "op_v3",
        vec![
            AbiParam::new(ctx.ptype()),
            AbiParam::new(ctx.ptype()),
            AbiParam::special(ctx.ptype(), ArgumentPurpose::StructReturn),
            AbiParam::new(types::F64),
            AbiParam::new(types::F64),
            AbiParam::new(types::I64),
            AbiParam::new(types::I64),
        ],
        vec![],
    );
    let entry_block = func_ctx.builder.create_block();
    let body_block = func_ctx.builder.create_block();
    let exit_block = func_ctx.builder.create_block();

    // Entry block: receive the function parameters as block parameters.
    func_ctx.builder.switch_to_block(entry_block);
    func_ctx
        .builder
        .append_block_params_for_function_params(entry_block);
    let p1 = func_ctx.builder.block_params(entry_block)[0];
    let p2 = func_ctx.builder.block_params(entry_block)[1];
    let p3 = func_ctx.builder.block_params(entry_block)[2];
    let p4 = func_ctx.builder.block_params(entry_block)[3];
    let p5 = func_ctx.builder.block_params(entry_block)[4];
    let p6 = func_ctx.builder.block_params(entry_block)[5];
    let p7 = func_ctx.builder.block_params(entry_block)[6];

    // Broadcast the two scalar operands to `data_type` once, outside the loop.
    let p4 = func_ctx.builder.ins().splat(data_type, p4);
    let p5 = func_ctx.builder.ins().splat(data_type, p5);

    // Loop-carried state of `body_block`: three pointers, the two splatted operands, and the
    // two loop counters.
    func_ctx
        .builder
        .append_block_param(body_block, func_ctx.ptype);
    func_ctx
        .builder
        .append_block_param(body_block, func_ctx.ptype);
    func_ctx
        .builder
        .append_block_param(body_block, func_ctx.ptype);
    func_ctx.builder.append_block_param(body_block, data_type);
    func_ctx.builder.append_block_param(body_block, data_type);
    func_ctx.builder.append_block_param(body_block, types::I64);
    func_ctx.builder.append_block_param(body_block, types::I64);
    // Unconditional jump: the loop body always runs at least once, whatever `p6`/`p7` are.
    func_ctx
        .builder
        .ins()
        .jump(body_block, &[p1, p2, p3, p4, p5, p6, p7]);
    // Only `entry_block` is sealed in this function; presumably `func_ctx.finalize` seals the
    // remaining blocks — TODO confirm.
    func_ctx.builder.seal_block(entry_block);

    func_ctx.builder.switch_to_block(body_block);
    let lhs_ref = func_ctx.builder.block_params(body_block)[0];
    let rhs_ref = func_ctx.builder.block_params(body_block)[1];
    let result_ref = func_ctx.builder.block_params(body_block)[2];
    let to_div = func_ctx.builder.block_params(body_block)[3];
    let to_lt = func_ctx.builder.block_params(body_block)[4];
    let start = func_ctx.builder.block_params(body_block)[5];
    let end = func_ctx.builder.block_params(body_block)[6];

    // (lhs + rhs) / to_div < to_lt, evaluated on one `data_type` vector per iteration.
    let lhs = func_ctx
        .builder
        .ins()
        .load(data_type, MemFlags::new(), lhs_ref, 0);
    let rhs = func_ctx
        .builder
        .ins()
        .load(data_type, MemFlags::new(), rhs_ref, 0);
    let sum = func_ctx.builder.ins().fadd(lhs, rhs);
    let div_result = func_ctx.builder.ins().fdiv(sum, to_div);
    let result = func_ctx
        .builder
        .ins()
        .fcmp(FloatCC::LessThan, div_result, to_lt);

    // NOTE(review): `result` is stored unchanged. The F64X2 disassembly in this report shows a
    // full-register store (`vmovdqu %xmm4, 0(%rdx)`) while the output pointer only advances by
    // `result_type.bytes()` (`lea 2(%rdx), %rdx`), so each store appears to cover more bytes than
    // the stride accounts for — likely the out-of-bounds write being reported. Confirm, and
    // narrow the comparison mask to the intended output width before storing.
    func_ctx
        .builder
        .ins()
        .store(MemFlags::new(), result, result_ref, 0);

    // Byte strides for the input pointers and for the output pointer respectively.
    let offset = func_ctx
        .builder
        .ins()
        .iconst(types::I64, data_type.bytes() as i64);

    let result_offset = func_ctx
        .builder
        .ins()
        .iconst(types::I64, result_type.bytes() as i64);

    let next_lhs_ref = func_ctx.builder.ins().iadd(offset, lhs_ref);
    let next_rhs_ref = func_ctx.builder.ins().iadd(offset, rhs_ref);
    let next_result_ref = func_ctx.builder.ins().iadd(result_offset, result_ref);

    // Bottom-tested loop: increment the counter, then branch back while `next_start < end`
    // (signed comparison), carrying the advanced pointers and unchanged operands.
    let next_start = func_ctx.builder.ins().iadd_imm(start, 1);
    let cond = func_ctx
        .builder
        .ins()
        .icmp(IntCC::SignedLessThan, next_start, end);
    func_ctx.builder.ins().brif(
        cond,
        body_block,
        &[
            next_lhs_ref,
            next_rhs_ref,
            next_result_ref,
            to_div,
            to_lt,
            next_start,
            end,
        ],
        exit_block,
        &[],
    );
    // No instruction is emitted into `exit_block` here; presumably `func_ctx.finalize` adds the
    // return — TODO confirm.
    func_ctx.builder.switch_to_block(exit_block);
    let func_id = func_ctx.finalize(&[]);
    let code = ctx.finalize(func_id);
    // NOTE(review): the transmute relies on `code` being the address of a finalized function
    // whose ABI matches the target `fn` type; nothing in this function checks that.
    unsafe { mem::transmute::<_, fn(*const u8, *const u8, *const bool, f64, f64, i64, i64)>(code) }
}
  1. call jit compiled function
/// Runs the JIT-compiled kernel `op` over `a` and `b` and wraps its output in a `BooleanArray`.
///
/// `op` is invoked as `op(a_ptr, b_ptr, res_ptr, c, d, 0, a.len() / 2)`: the two value buffers,
/// the output buffer, the two scalar operands, and the start/end loop counters.
///
/// # Errors
/// Returns `ArrowError::ComputeError` when `a` and `b` differ in length or when they are empty.
///
/// NOTE(review): for odd lengths the output holds `a.len() - 1` values while `nulls` is still
/// derived from the full-length inputs — confirm this is intended.
pub fn jit_expr_on_array_v3(
    a: &Float64Array,
    b: &Float64Array,
    c: f64,
    d: f64,
    op: fn(*const u8, *const u8, *const bool, f64, f64, i64, i64),
) -> Result<BooleanArray, ArrowError> {
    if a.len() != b.len() {
        return Err(ArrowError::ComputeError(
            "Cannot perform binary operation on arrays of different length".to_string(),
        ));
    }

    if a.is_empty() {
        // Distinct message: the inputs have equal length here, they are just empty.
        return Err(ArrowError::ComputeError(
            "Cannot perform binary operation on empty arrays".to_string(),
        ));
    }
    let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref());
    let a_ptr = a.values().inner().as_ptr();
    let b_ptr = b.values().inner().as_ptr();
    let mut res: Vec<bool> = Vec::with_capacity((a.len() / 2) * 2);
    // The kernel writes through this pointer, so it must come from `as_mut_ptr`: memory behind a
    // pointer obtained from `Vec::as_ptr` must never be written to.
    let res_ptr = res.as_mut_ptr();
    op(a_ptr, b_ptr, res_ptr, c, d, 0, (a.len() / 2) as i64);
    // SAFETY: sound only if `op` initialised exactly the first `(a.len() / 2) * 2` elements with
    // valid `bool` bit patterns (0 or 1) and wrote nothing outside the reserved capacity. That
    // contract is not checked here — it must be upheld by the code generator.
    unsafe {
        res.set_len((a.len() / 2) * 2);
    }
    let buffer = BooleanBuffer::from_iter(res);
    Ok(BooleanArray::new(buffer, nulls))
}

Also attaching a workable version.

successful .clif Test Case for F64

  pushq   %rbp
  unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
  movq    %rsp, %rbp
  unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
block0:
  movq    %rdx, %r9
  movq    %r9, %rax
  jmp     label1
block1:
  vmovsd  0(%rdi), %xmm2
  vaddsd  %xmm2, 0(%rsi), %xmm2
  vdivsd  %xmm2, %xmm0, %xmm2
  ucomisd %xmm2, %xmm1
  setnbe  %r10b
  movb    %r10b, 0(%rax)
  lea     1(%rcx), %rcx
  lea     8(%rdi), %rdi
  lea     8(%rsi), %rsi
  lea     1(%rax), %rax
  cmpq    %r8, %rcx
  jl      label2; j label3
block2:
  jmp     label1
block3:
  movq    %r9, %rax
  movq    %rbp, %rsp
  popq    %rbp
  ret

Steps to Reproduce

1) the test cases.

    fn test_jit_expr_on_array_v3_64() {
        // Both inputs come from the same helper call with identical arguments.
        let batch_len = 64;
        let lhs = create_primitive_array::<Float64Type>(batch_len, 0.);
        let rhs = create_primitive_array::<Float64Type>(batch_len, 0.);
        let (divisor, threshold) = (3.0_f64, 3.0_f64);
        // Compile the kernel once and reuse the same function pointer for every call.
        let kernel = jit_expr_v3();
        for _ in 0..100000 {
            // The produced values are discarded; the loop only exercises repeated invocation.
            let (_values, _) = jit_expr_on_array_v3(&lhs, &rhs, divisor, threshold, kernel)
                .unwrap()
                .into_parts();
        }
    }

pub fn jit_expr_on_array_v3(
    a: &Float64Array,
    b: &Float64Array,
    c: f64,
    d: f64,
    op: fn(*const u8, *const u8, *const bool, f64, f64, i64, i64),
) -> Result<BooleanArray, ArrowError> {
    if a.len() != b.len() {
        return Err(ArrowError::ComputeError(
            "Cannot perform binary operation on arrays of different length".to_string(),
        ));
    }

    if a.is_empty() {
        return Err(ArrowError::ComputeError(
            "Cannot perform binary operation on arrays of different length".to_string(),
        ));
    }
    let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref());
    let a_ptr = a.values().inner().as_ptr();
    let b_ptr = b.values().inner().as_ptr();
    let mut res: Vec<bool> = Vec::with_capacity(a.len());
    let res_ptr = res.as_ptr();
    op(a_ptr, b_ptr, res_ptr, c, d, 0, (a.len()) as i64);
    unsafe {
        res.set_len(a.len());
    }
    let buffer = Boole
[message truncated]

view this post on Zulip Wasmtime GitHub notifications bot (Feb 22 2024 at 01:10):

meijies edited issue #7976:

Summary

I am trying to compile and execute a function which evaluates the expression (a + b) / c < d on each element of the arrays. First, I generated the IR of this function with the F64 type, and that works fine. Then I converted the function to an F64X2 variant to see if there is any performance improvement, but an unexpected memory problem occurs. My first intuition was that the problem lies in the function call section, but I could not find anything wrong there. So the biggest problem is that I don't know how to troubleshoot this type of issue.

failure .clif Test Case for F64X2

  pushq   %rbp
  unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
  movq    %rsp, %rbp
  unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
block0:
  movq    %rdx, %rax
  vpshufd $68, %xmm0, %xmm3
  vpshufd $68, %xmm1, %xmm2
  jmp     label1
block1:
  vmovupd 0(%rdi), %xmm4
  vaddpd  %xmm4, 0(%rsi), %xmm4
  vdivpd  %xmm4, %xmm3, %xmm4
  vcmppd  $1, %xmm4, %xmm2, %xmm4
  vmovdqu %xmm4, 0(%rdx)
  lea     1(%rcx), %rcx
  lea     16(%rdi), %rdi
  lea     16(%rsi), %rsi
  lea     2(%rdx), %rdx
  cmpq    %r8, %rcx
  jl      label2; j label3
block2:
  jmp     label1
block3:
  movq    %rbp, %rsp
  popq    %rbp
  ret

Steps to Reproduce

  1. the test cases.
#[test]
    fn test_jit_expr_on_array_v3_64() {
        // Both inputs come from the same helper call with identical arguments.
        let batch_len = 64;
        let lhs = create_primitive_array::<Float64Type>(batch_len, 0.);
        let rhs = create_primitive_array::<Float64Type>(batch_len, 0.);
        let (divisor, threshold) = (3.0_f64, 3.0_f64);
        // Compile the kernel once and reuse the same function pointer for every call.
        let kernel = jit_expr_v3();
        for _ in 0..100000 {
            // The produced values are discarded; the loop only exercises repeated invocation.
            let (_values, _) = jit_expr_on_array_v3(&lhs, &rhs, divisor, threshold, kernel)
                .unwrap()
                .into_parts();
        }
    }

The problem occurs in the third iteration; below is the debug screenshot.
![image](https://github.com/bytecodealliance/wasmtime/assets/13784260/3a2d97c6-b616-4902-8fca-7b6730d9e26d)

  1. jit compiled function.
/// Builds the F64X2 variant of the kernel with Cranelift and returns the finalized code as a
/// Rust function pointer.
///
/// The generated function takes, in order: two input pointers, one output pointer, two `F64`
/// scalars and two `I64` loop counters. Per iteration it loads one `F64X2` from each input,
/// computes `(lhs + rhs) / p4 < p5` lane-wise, stores the comparison result through the output
/// pointer, and advances all three pointers.
///
/// NOTE(review): the returned type is a bare `fn`, i.e. the Rust ABI; presumably
/// `extern "C" fn` is what the generated code actually implements — confirm against the calling
/// convention configured in `CodegenContext`.
pub fn jit_expr_v3() -> fn(*const u8, *const u8, *const bool, f64, f64, i64, i64) {
    let mut ctx = CodegenContext::builder().debug().finish();
    // `data_type` is the vector type of the loaded inputs; `result_type` is only used for its
    // byte size, which becomes the per-iteration stride of the output pointer.
    let data_type = types::F64X2;
    let result_type = types::I8X2;

    // Signature: (ptr, ptr, ptr [StructReturn], f64, f64, i64, i64) with no return values.
    // NOTE(review): the third parameter is tagged `ArgumentPurpose::StructReturn` although it is
    // not the first parameter; a plain `AbiParam::new` may be what is intended — confirm.
    let mut func_ctx = ctx.create_func_gen_ctx(
        "op_v3",
        vec![
            AbiParam::new(ctx.ptype()),
            AbiParam::new(ctx.ptype()),
            AbiParam::special(ctx.ptype(), ArgumentPurpose::StructReturn),
            AbiParam::new(types::F64),
            AbiParam::new(types::F64),
            AbiParam::new(types::I64),
            AbiParam::new(types::I64),
        ],
        vec![],
    );
    let entry_block = func_ctx.builder.create_block();
    let body_block = func_ctx.builder.create_block();
    let exit_block = func_ctx.builder.create_block();

    // Entry block: receive the function parameters as block parameters.
    func_ctx.builder.switch_to_block(entry_block);
    func_ctx
        .builder
        .append_block_params_for_function_params(entry_block);
    let p1 = func_ctx.builder.block_params(entry_block)[0];
    let p2 = func_ctx.builder.block_params(entry_block)[1];
    let p3 = func_ctx.builder.block_params(entry_block)[2];
    let p4 = func_ctx.builder.block_params(entry_block)[3];
    let p5 = func_ctx.builder.block_params(entry_block)[4];
    let p6 = func_ctx.builder.block_params(entry_block)[5];
    let p7 = func_ctx.builder.block_params(entry_block)[6];

    // Broadcast the two scalar operands to `data_type` once, outside the loop.
    let p4 = func_ctx.builder.ins().splat(data_type, p4);
    let p5 = func_ctx.builder.ins().splat(data_type, p5);

    // Loop-carried state of `body_block`: three pointers, the two splatted operands, and the
    // two loop counters.
    func_ctx
        .builder
        .append_block_param(body_block, func_ctx.ptype);
    func_ctx
        .builder
        .append_block_param(body_block, func_ctx.ptype);
    func_ctx
        .builder
        .append_block_param(body_block, func_ctx.ptype);
    func_ctx.builder.append_block_param(body_block, data_type);
    func_ctx.builder.append_block_param(body_block, data_type);
    func_ctx.builder.append_block_param(body_block, types::I64);
    func_ctx.builder.append_block_param(body_block, types::I64);
    // Unconditional jump: the loop body always runs at least once, whatever `p6`/`p7` are.
    func_ctx
        .builder
        .ins()
        .jump(body_block, &[p1, p2, p3, p4, p5, p6, p7]);
    // Only `entry_block` is sealed in this function; presumably `func_ctx.finalize` seals the
    // remaining blocks — TODO confirm.
    func_ctx.builder.seal_block(entry_block);

    func_ctx.builder.switch_to_block(body_block);
    let lhs_ref = func_ctx.builder.block_params(body_block)[0];
    let rhs_ref = func_ctx.builder.block_params(body_block)[1];
    let result_ref = func_ctx.builder.block_params(body_block)[2];
    let to_div = func_ctx.builder.block_params(body_block)[3];
    let to_lt = func_ctx.builder.block_params(body_block)[4];
    let start = func_ctx.builder.block_params(body_block)[5];
    let end = func_ctx.builder.block_params(body_block)[6];

    // (lhs + rhs) / to_div < to_lt, evaluated on one `data_type` vector per iteration.
    let lhs = func_ctx
        .builder
        .ins()
        .load(data_type, MemFlags::new(), lhs_ref, 0);
    let rhs = func_ctx
        .builder
        .ins()
        .load(data_type, MemFlags::new(), rhs_ref, 0);
    let sum = func_ctx.builder.ins().fadd(lhs, rhs);
    let div_result = func_ctx.builder.ins().fdiv(sum, to_div);
    let result = func_ctx
        .builder
        .ins()
        .fcmp(FloatCC::LessThan, div_result, to_lt);

    // NOTE(review): `result` is stored unchanged. The F64X2 disassembly in this report shows a
    // full-register store (`vmovdqu %xmm4, 0(%rdx)`) while the output pointer only advances by
    // `result_type.bytes()` (`lea 2(%rdx), %rdx`), so each store appears to cover more bytes than
    // the stride accounts for — likely the out-of-bounds write being reported. Confirm, and
    // narrow the comparison mask to the intended output width before storing.
    func_ctx
        .builder
        .ins()
        .store(MemFlags::new(), result, result_ref, 0);

    // Byte strides for the input pointers and for the output pointer respectively.
    let offset = func_ctx
        .builder
        .ins()
        .iconst(types::I64, data_type.bytes() as i64);

    let result_offset = func_ctx
        .builder
        .ins()
        .iconst(types::I64, result_type.bytes() as i64);

    let next_lhs_ref = func_ctx.builder.ins().iadd(offset, lhs_ref);
    let next_rhs_ref = func_ctx.builder.ins().iadd(offset, rhs_ref);
    let next_result_ref = func_ctx.builder.ins().iadd(result_offset, result_ref);

    // Bottom-tested loop: increment the counter, then branch back while `next_start < end`
    // (signed comparison), carrying the advanced pointers and unchanged operands.
    let next_start = func_ctx.builder.ins().iadd_imm(start, 1);
    let cond = func_ctx
        .builder
        .ins()
        .icmp(IntCC::SignedLessThan, next_start, end);
    func_ctx.builder.ins().brif(
        cond,
        body_block,
        &[
            next_lhs_ref,
            next_rhs_ref,
            next_result_ref,
            to_div,
            to_lt,
            next_start,
            end,
        ],
        exit_block,
        &[],
    );
    // No instruction is emitted into `exit_block` here; presumably `func_ctx.finalize` adds the
    // return — TODO confirm.
    func_ctx.builder.switch_to_block(exit_block);
    let func_id = func_ctx.finalize(&[]);
    let code = ctx.finalize(func_id);
    // NOTE(review): the transmute relies on `code` being the address of a finalized function
    // whose ABI matches the target `fn` type; nothing in this function checks that.
    unsafe { mem::transmute::<_, fn(*const u8, *const u8, *const bool, f64, f64, i64, i64)>(code) }
}
  1. call jit compiled function
/// Runs the JIT-compiled kernel `op` over `a` and `b` and wraps its output in a `BooleanArray`.
///
/// `op` is invoked as `op(a_ptr, b_ptr, res_ptr, c, d, 0, a.len() / 2)`: the two value buffers,
/// the output buffer, the two scalar operands, and the start/end loop counters.
///
/// # Errors
/// Returns `ArrowError::ComputeError` when `a` and `b` differ in length or when they are empty.
///
/// NOTE(review): for odd lengths the output holds `a.len() - 1` values while `nulls` is still
/// derived from the full-length inputs — confirm this is intended.
pub fn jit_expr_on_array_v3(
    a: &Float64Array,
    b: &Float64Array,
    c: f64,
    d: f64,
    op: fn(*const u8, *const u8, *const bool, f64, f64, i64, i64),
) -> Result<BooleanArray, ArrowError> {
    if a.len() != b.len() {
        return Err(ArrowError::ComputeError(
            "Cannot perform binary operation on arrays of different length".to_string(),
        ));
    }

    if a.is_empty() {
        // Distinct message: the inputs have equal length here, they are just empty.
        return Err(ArrowError::ComputeError(
            "Cannot perform binary operation on empty arrays".to_string(),
        ));
    }
    let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref());
    let a_ptr = a.values().inner().as_ptr();
    let b_ptr = b.values().inner().as_ptr();
    let mut res: Vec<bool> = Vec::with_capacity((a.len() / 2) * 2);
    // The kernel writes through this pointer, so it must come from `as_mut_ptr`: memory behind a
    // pointer obtained from `Vec::as_ptr` must never be written to.
    let res_ptr = res.as_mut_ptr();
    op(a_ptr, b_ptr, res_ptr, c, d, 0, (a.len() / 2) as i64);
    // SAFETY: sound only if `op` initialised exactly the first `(a.len() / 2) * 2` elements with
    // valid `bool` bit patterns (0 or 1) and wrote nothing outside the reserved capacity. That
    // contract is not checked here — it must be upheld by the code generator.
    unsafe {
        res.set_len((a.len() / 2) * 2);
    }
    let buffer = BooleanBuffer::from_iter(res);
    Ok(BooleanArray::new(buffer, nulls))
}

Also attaching a workable version.

successful .clif Test Case for F64

  pushq   %rbp
  unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
  movq    %rsp, %rbp
  unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 }
block0:
  movq    %rdx, %r9
  movq    %r9, %rax
  jmp     label1
block1:
  vmovsd  0(%rdi), %xmm2
  vaddsd  %xmm2, 0(%rsi), %xmm2
  vdivsd  %xmm2, %xmm0, %xmm2
  ucomisd %xmm2, %xmm1
  setnbe  %r10b
  movb    %r10b, 0(%rax)
  lea     1(%rcx), %rcx
  lea     8(%rdi), %rdi
  lea     8(%rsi), %rsi
  lea     1(%rax), %rax
  cmpq    %r8, %rcx
  jl      label2; j label3
block2:
  jmp     label1
block3:
  movq    %r9, %rax
  movq    %rbp, %rsp
  popq    %rbp
  ret

Steps to Reproduce

1) the test cases.

    fn test_jit_expr_on_array_v3_64() {
        // Both inputs come from the same helper call with identical arguments.
        let batch_len = 64;
        let lhs = create_primitive_array::<Float64Type>(batch_len, 0.);
        let rhs = create_primitive_array::<Float64Type>(batch_len, 0.);
        let (divisor, threshold) = (3.0_f64, 3.0_f64);
        // Compile the kernel once and reuse the same function pointer for every call.
        let kernel = jit_expr_v3();
        for _ in 0..100000 {
            // The produced values are discarded; the loop only exercises repeated invocation.
            let (_values, _) = jit_expr_on_array_v3(&lhs, &rhs, divisor, threshold, kernel)
                .unwrap()
                .into_parts();
        }
    }

pub fn jit_expr_on_array_v3(
    a: &Float64Array,
    b: &Float64Array,
    c: f64,
    d: f64,
    op: fn(*const u8, *const u8, *const bool, f64, f64, i64, i64),
) -> Result<BooleanArray, ArrowError> {
    if a.len() != b.len() {
        return Err(ArrowError::ComputeError(
            "Cannot perform binary operation on arrays of different length".to_string(),
        ));
    }

    if a.is_empty() {
        return Err(ArrowError::ComputeError(
            "Cannot perform binary operation on arrays of different length".to_string(),
        ));
    }
    let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref());
    let a_ptr = a.values().inner().as_ptr();
    let b_ptr = b.values().inner().as_ptr();
    let mut res: Vec<bool> = Vec::with_capacity(a.len());
    let res_ptr = res.as_ptr();
    op(a_ptr, b_ptr, res_ptr, c, d, 0, (a.len())
[message truncated]

view this post on Zulip Wasmtime GitHub notifications bot (Feb 22 2024 at 08:01):

bjorn3 commented on issue #7976:

One problem here is that you use fn(...) instead of extern "C" fn(...) as function signature for the jitted function. The former will use the rust abi, which is unstable. This is likely not the cause of the crash though.

Try turning the ArgumentPurpose::StructReturn argument into a regular argument. ArgumentPurpose::StructReturn may cause the argument to be passed differently from a regular argument depending on the calling convention. In addition, it needs to be the first argument. In your code you don't need it, as you can define how to pass return values yourself. You aren't restricted to matching the ABI of an existing C function.

view this post on Zulip Wasmtime GitHub notifications bot (Feb 22 2024 at 09:37):

meijies commented on issue #7976:

@bjorn3 I have changed my code as you suggested, but it still doesn't work. Then I found that the libc pthread error "invalid next size(fast)" is usually caused by an illegal write to memory, so I changed the result vector capacity and the test passed, but I don't know why.

- let mut res: Vec<bool> = Vec::with_capacity((a.len() / 2) * 2);
+ let mut res: Vec<bool> = Vec::with_capacity((a.len() / 2) * 2+10);

view this post on Zulip Wasmtime GitHub notifications bot (Feb 22 2024 at 09:39):

meijies edited a comment on issue #7976:

@bjorn3 I have changed my code as you suggested, but it still doesn't work. Then I found that the libc pthread error "invalid next size(fast)" is usually caused by an illegal write to memory, so I changed the capacity of the result vector and the test passed, but I don't know why the code writes beyond the boundaries.

- let mut res: Vec<bool> = Vec::with_capacity((a.len() / 2) * 2);
+ let mut res: Vec<bool> = Vec::with_capacity((a.len() / 2) * 2+10);

view this post on Zulip Wasmtime GitHub notifications bot (Feb 22 2024 at 09:51):

bjorn3 commented on issue #7976:

a.len() / 2 * 2 rounds down. If you meant to round up you need to add 1 before dividing.

view this post on Zulip Wasmtime GitHub notifications bot (Feb 22 2024 at 10:05):

meijies commented on issue #7976:

Yes, a.len() / 2 * 2 rounding down may result in one less element being calculated, but it shouldn't lead to writing beyond the boundary. Furthermore, the memory issue occurs randomly, and the test still fails even when I increase the capacity to (a.len() / 2) * 2+4.

- let mut res: Vec<bool> = Vec::with_capacity((a.len() / 2) * 2);
+ let mut res: Vec<bool> = Vec::with_capacity((a.len() / 2) * 2+4);

view this post on Zulip Wasmtime GitHub notifications bot (Feb 22 2024 at 10:43):

bjorn3 commented on issue #7976:

Maybe you could try running in valgrind to see if it reports the issue? Valgrind can handle JIT compilation just fine unlike asan and miri.

view this post on Zulip Wasmtime GitHub notifications bot (Feb 22 2024 at 11:36):

meijies commented on issue #7976:

==108775== Memcheck, a memory error detector
==108775== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==108775== Using Valgrind-3.18.1 and LibVEX; rerun with -h for copyright info
==108775== Command: ./target/debug/deps/experiment-56e243a5137b130b
==108775==

running 2 tests
test tests::it_works ... ok
==108775== Thread 2 expr::tests::te:
==108775== Invalid write of size 8
==108775==    at 0x4BE601F: ???
==108775==    by 0x191C2B: experiment::expr::jit_expr_on_array_v3 (expr.rs:314)
==108775==    by 0x192DAB: experiment::expr::tests::test_jit_expr_on_array_v3_64 (expr.rs:393)
==108775==    by 0x18F8E6: experiment::expr::tests::test_jit_expr_on_array_v3_64::{{closure}} (expr.rs:384)
==108775==    by 0x18F6A5: core::ops::function::FnOnce::call_once (function.rs:250)
==108775==    by 0x1CCF9E: test::__rust_begin_short_backtrace (function.rs:250)
==108775==    by 0x1CBDF0: test::run_test::{{closure}} (lib.rs:644)
==108775==    by 0x193205: std::sys_common::backtrace::__rust_begin_short_backtrace (lib.rs:595)
==108775==    by 0x198246: core::ops::function::FnOnce::call_once{{vtable.shim}} (mod.rs:529)
==108775==    by 0x7B8E84: std::sys::pal::unix::thread::Thread::new::thread_start (boxed.rs:2016)
==108775==    by 0x4A09AC2: start_thread (pthread_create.c:442)
==108775==    by 0x4A9AA03: clone (clone.S:100)
==108775==  Address 0x4be5584 is 68 bytes inside a block of size 74 alloc'd
==108775==    at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==108775==    by 0x5B3287: alloc (alloc.rs:98)
==108775==    by 0x5B3287: alloc::alloc::Global::alloc_impl (alloc.rs:181)
==108775==    by 0x5B6A98: <alloc::alloc::Global as core::alloc::Allocator>::allocate (alloc.rs:241)
==108775==    by 0x2A6D18: alloc::raw_vec::RawVec<T,A>::allocate_in (raw_vec.rs:199)
==108775==    by 0x18FDA5: with_capacity_in<bool, alloc::alloc::Global> (raw_vec.rs:145)
==108775==    by 0x18FDA5: with_capacity_in<bool, alloc::alloc::Global> (mod.rs:672)
==108775==    by 0x18FDA5: alloc::vec::Vec<T>::with_capacity (mod.rs:481)
==108775==    by 0x191B6A: experiment::expr::jit_expr_on_array_v3 (expr.rs:312)
==108775==    by 0x192DAB: experiment::expr::tests::test_jit_expr_on_array_v3_64 (expr.rs:393)
==108775==    by 0x18F8E6: experiment::expr::tests::test_jit_expr_on_array_v3_64::{{closure}} (expr.rs:384)
==108775==    by 0x18F6A5: core::ops::function::FnOnce::call_once (function.rs:250)
==108775==    by 0x1CCF9E: test::__rust_begin_short_backtrace (function.rs:250)
==108775==    by 0x1CBDF0: test::run_test::{{closure}} (lib.rs:644)
==108775==    by 0x193205: std::sys_common::backtrace::__rust_begin_short_backtrace (lib.rs:595)
==108775==
test expr::tests::test_jit_expr_on_array_v3_64 ... ok

test result: ok. 2 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 12.82s

==108775==
==108775== HEAP SUMMARY:
==108775==     in use at exit: 4,209 bytes in 7 blocks
==108775==   total heap usage: 301,154 allocs, 301,147 frees, 19,607,330 bytes allocated
==108775==
==108775== Thread 1:
==108775== 8 bytes in 1 blocks are still reachable in loss record 1 of 7
==108775==    at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==108775==    by 0x5B3287: alloc (alloc.rs:98)
==108775==    by 0x5B3287: alloc::alloc::Global::alloc_impl (alloc.rs:181)
==108775==    by 0x5B2F7A: alloc::alloc::exchange_malloc (alloc.rs:241)
==108775==    by 0x634062: cranelift_codegen::isa::x64::abi::create_reg_env_systemv (abi.rs:1259)
==108775==    by 0x632C42: <cranelift_codegen::isa::x64::abi::X64ABIMachineSpec as cranelift_codegen::machinst::abi::ABIMachineSpec>::get_machine_env::{{closure}} (abi.rs:802)
==108775==    by 0x5C8499: std::sync::once_lock::OnceLock<T>::get_or_init::{{closure}} (once_lock.rs:250)
==108775==    by 0x5C8352: std::sync::once_lock::OnceLock<T>::initialize::{{closure}} (once_lock.rs:376)
==108775==    by 0x623482: std::sync::once::Once::call_once_force::{{closure}} (once.rs:208)
==108775==    by 0x5271C9: std::sys_common::once::futex::Once::call (futex.rs:124)
==108775==    by 0x6232EF: std::sync::once::Once::call_once_force (once.rs:208)
==108775==    by 0x5C81BC: std::sync::once_lock::OnceLock<T>::initialize (once_lock.rs:375)
==108775==    by 0x5C8682: std::sync::once_lock::OnceLock<T>::get_or_try_init (once_lock.rs:298)
==108775==
==108775== 8 bytes in 1 blocks are still reachable in loss record 2 of 7
==108775==    at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==108775==    by 0x5B3287: alloc (alloc.rs:98)
==108775==    by 0x5B3287: alloc::alloc::Global::alloc_impl (alloc.rs:181)
==108775==    by 0x5B2F7A: alloc::alloc::exchange_malloc (alloc.rs:241)
==108775==    by 0x634517: cranelift_codegen::isa::x64::abi::create_reg_env_systemv (abi.rs:1282)
==108775==    by 0x632C42: <cranelift_codegen::isa::x64::abi::X64ABIMachineSpec as cranelift_codegen::machinst::abi::ABIMachineSpec>::get_machine_env::{{closure}} (abi.rs:802)
==108775==    by 0x5C8499: std::sync::once_lock::OnceLock<T>::get_or_init::{{closure}} (once_lock.rs:250)
==108775==    by 0x5C8352: std::sync::once_lock::OnceLock<T>::initialize::{{closure}} (once_lock.rs:376)
==108775==    by 0x623482: std::sync::once::Once::call_once_force::{{closure}} (once.rs:208)
==108775==    by 0x5271C9: std::sys_common::once::futex::Once::call (futex.rs:124)
==108775==    by 0x6232EF: std::sync::once::Once::call_once_force (once.rs:208)
==108775==    by 0x5C81BC: std::sync::once_lock::OnceLock<T>::initialize (once_lock.rs:375)
==108775==    by 0x5C8682: std::sync::once_lock::OnceLock<T>::get_or_try_init (once_lock.rs:298)
==108775==
==108775== 8 bytes in 1 blocks are still reachable in loss record 3 of 7
==108775==    at 0x484DCD3: realloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==108775==    by 0x789533: realloc (alloc.rs:136)
==108775==    by 0x789533: alloc::alloc::Global::grow_impl (alloc.rs:213)
==108775==    by 0x789A0F: <alloc::alloc::Global as core::alloc::Allocator>::grow (alloc.rs:266)
==108775==    by 0x780FE9: alloc::raw_vec::finish_grow (raw_vec.rs:518)
==108775==    by 0x2C89E8: alloc::raw_vec::RawVec<T,A>::grow_amortized (raw_vec.rs:433)
==108775==    by 0x2D2568: alloc::raw_vec::RawVec<T,A>::reserve_for_push (raw_vec.rs:318)
==108775==    by 0x28E4A1: alloc::vec::Vec<T,A>::push (mod.rs:1919)
==108775==    by 0x634A67: cranelift_codegen::isa::x64::abi::create_reg_env_systemv (abi.rs:1301)
==108775==    by 0x632C42: <cranelift_codegen::isa::x64::abi::X64ABIMachineSpec as cranelift_codegen::machinst::abi::ABIMachineSpec>::get_machine_env::{{closure}} (abi.rs:802)
==108775==    by 0x5C8499: std::sync::once_lock::OnceLock<T>::get_or_init::{{closure}} (once_lock.rs:250)
==108775==    by 0x5C8352: std::sync::once_lock::OnceLock<T>::initialize::{{closure}} (once_lock.rs:376)
==108775==    by 0x623482: std::sync::once::Once::call_once_force::{{closure}} (once.rs:208)
==108775==
==108775== 9 bytes in 1 blocks are still reachable in loss record 4 of 7
==108775==    at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==108775==    by 0x5B3287: alloc (alloc.rs:98)
==108775==    by 0x5B3287: alloc::alloc::Global::alloc_impl (alloc.rs:181)
==108775==    by 0x5B2F7A: alloc::alloc::exchange_malloc (alloc.rs:241)
==108775==    by 0x633E29: cranelift_codegen::isa::x64::abi::create_reg_env_systemv (abi.rs:1246)
==108775==    by 0x632C42: <cranelift_codegen::isa::x64::abi::X64ABIMachineSpec as cranelift_codegen::machinst::abi::ABIMachineSpec>::get_machine_env::{{closure}} (abi.rs:802)
==108775==    by 0x5C8499: std::sync::once_lock::OnceLock<T>::get_or_init::{{closure}} (once_lock.rs:250)
==108775==    by 0x5C8352: std::sync::once_lock::OnceLock<T>::initialize::{{closure}} (once_lock.rs:376)
==108775==    by 0x623482: std::sync::once::Once::call_once_force::{{closure}} (once.rs:208)
==108775==    by 0x5271C9: std::sys_common::once::futex::Once::call (futex.rs:124)
==108775==    by 0x6232EF: std::sync::once::Once::call_once_force (once.rs:208)
==108775==    by 0x5C81BC: std::sync::once_lock::OnceLock<T>::initialize (once_lock.rs:375)
==108775==    by 0x5C8682: std::sync::once_lock::OnceLock<T>::get_or_try_init (once_lock.rs:298)
==108775==
==108775== 16 bytes in 1 blocks are still reachable in loss record 5 of 7
==108775==    at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==108775==    by 0x1C6623: test::test_main (alloc.rs:98)
==108775==    by 0x1C762C: test::test_main_static (lib.rs:162)
==108775==    by 0x191F82: experiment::main (lib.rs:1)
==108775==    by 0x18F71A: core::ops::function::FnOnce::call_once (function.rs:250)
==108775==    by 0x18FFDD: std::sys_common::backtrace::__rust_begin_short_backtrace (backtrace.rs:155)
==108775==    by 0x18FB00: std::rt::lang_start::{{closure}} (rt.rs:166)
==108775==    by 0x7A8C20: std::rt::lang_start_internal (function.rs:284)
==108775==    by 0x18FAD9: std::rt::lang_start (rt.rs:165)
==108775==    by 0x191FAD: main (in /home/meijie/Work/query-compile-prototype/target/debug/deps/experiment-56e243a5137b130b)
==108775==
==108775== 64 bytes in 1 blocks are still reachable in loss record 6 of 7
==108775==    at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==108775==    by 0x5B3287: alloc (alloc.rs:98)
==108775==    by 0x5B3287: alloc::alloc::Global::alloc_impl (alloc.rs:181)
==108775==    by 0x5B2F7A: alloc::alloc::exchange_malloc (alloc.rs:241)
==108775==    by 0x568527: ahash::random_state::get_fixed_seeds::{{closure}} (boxed.rs:218)
==108775==    by 0x69B689: once_cell::race::once_box::OnceBox<T>
[message truncated]

view this post on Zulip Wasmtime GitHub notifications bot (Feb 22 2024 at 11:54):

meijies edited a comment on issue #7976:

valgrind --tool=memcheck  --leak-check=full ./target/debug/deps/experiment-56e243a5137b130b
==111584== Memcheck, a memory error detector
==111584== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==111584== Using Valgrind-3.18.1 and LibVEX; rerun with -h for copyright info
==111584== Command: ./target/debug/deps/experiment-56e243a5137b130b
==111584==

running 2 tests
test tests::it_works ... ok
==111584== Thread 2 expr::tests::te:
==111584== Invalid write of size 8
==111584==    at 0x4BE601F: ???
==111584==    by 0x191BF6: experiment::expr::jit_expr_on_array_v3 (expr.rs:314)
==111584==    by 0x192D7B: experiment::expr::tests::test_jit_expr_on_array_v3_64 (expr.rs:393)
==111584==    by 0x18F8E6: experiment::expr::tests::test_jit_expr_on_array_v3_64::{{closure}} (expr.rs:384)
==111584==    by 0x18F6A5: core::ops::function::FnOnce::call_once (function.rs:250)
==111584==    by 0x1CCF6E: test::__rust_begin_short_backtrace (function.rs:250)
==111584==    by 0x1CBDC0: test::run_test::{{closure}} (lib.rs:644)
==111584==    by 0x1931D5: std::sys_common::backtrace::__rust_begin_short_backtrace (lib.rs:595)
==111584==    by 0x198216: core::ops::function::FnOnce::call_once{{vtable.shim}} (mod.rs:529)
==111584==    by 0x7B8E54: std::sys::pal::unix::thread::Thread::new::thread_start (boxed.rs:2016)
==111584==    by 0x4A09AC2: start_thread (pthread_create.c:442)
==111584==    by 0x4A9AA03: clone (clone.S:100)
==111584==  Address 0x4be557a is 58 bytes inside a block of size 64 alloc'd
==111584==    at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==111584==    by 0x5B3257: alloc (alloc.rs:98)
==111584==    by 0x5B3257: alloc::alloc::Global::alloc_impl (alloc.rs:181)
==111584==    by 0x5B6A68: <alloc::alloc::Global as core::alloc::Allocator>::allocate (alloc.rs:241)
==111584==    by 0x2A6CE8: alloc::raw_vec::RawVec<T,A>::allocate_in (raw_vec.rs:199)
==111584==    by 0x18FDA5: with_capacity_in<bool, alloc::alloc::Global> (raw_vec.rs:145)
==111584==    by 0x18FDA5: with_capacity_in<bool, alloc::alloc::Global> (mod.rs:672)
==111584==    by 0x18FDA5: alloc::vec::Vec<T>::with_capacity (mod.rs:481)
==111584==    by 0x191B33: experiment::expr::jit_expr_on_array_v3 (expr.rs:312)
==111584==    by 0x192D7B: experiment::expr::tests::test_jit_expr_on_array_v3_64 (expr.rs:393)
==111584==    by 0x18F8E6: experiment::expr::tests::test_jit_expr_on_array_v3_64::{{closure}} (expr.rs:384)
==111584==    by 0x18F6A5: core::ops::function::FnOnce::call_once (function.rs:250)
==111584==    by 0x1CCF6E: test::__rust_begin_short_backtrace (function.rs:250)
==111584==    by 0x1CBDC0: test::run_test::{{closure}} (lib.rs:644)
==111584==    by 0x1931D5: std::sys_common::backtrace::__rust_begin_short_backtrace (lib.rs:595)
==111584==
test expr::tests::test_jit_expr_on_array_v3_64 ... ok

test result: ok. 2 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 13.05s

==111584==
==111584== HEAP SUMMARY:
==111584==     in use at exit: 4,209 bytes in 7 blocks
==111584==   total heap usage: 301,154 allocs, 301,147 frees, 18,607,330 bytes allocated
==111584==
==111584== Thread 1:
==111584== 4,096 bytes in 1 blocks are definitely lost in loss record 7 of 7
==111584==    at 0x484DE30: memalign (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==111584==    by 0x484DF92: posix_memalign (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==111584==    by 0x7B3898: __rdl_alloc (alloc.rs:102)
==111584==    by 0x22FD89: alloc::alloc::alloc (alloc.rs:98)
==111584==    by 0x232500: cranelift_jit::memory::PtrLen::with_size (memory.rs:59)
==111584==    by 0x232A79: cranelift_jit::memory::Memory::allocate (memory.rs:170)
==111584==    by 0x247203: <cranelift_jit::backend::JITModule as cranelift_module::module::Module>::define_function_with_control_plane (backend.rs:697)
==111584==    by 0x1D2DEA: cranelift_module::module::Module::define_function (module.rs:958)
==111584==    by 0x1D17A3: core::gen::ctx::CodegenContext::finalize (ctx.rs:31)
==111584==    by 0x19165C: experiment::expr::jit_expr_v3 (expr.rs:283)
==111584==    by 0x192CA1: experiment::expr::tests::test_jit_expr_on_array_v3_64 (expr.rs:391)
==111584==    by 0x18F8E6: experiment::expr::tests::test_jit_expr_on_array_v3_64::{{closure}} (expr.rs:384)
==111584==
==111584== LEAK SUMMARY:
==111584==    definitely lost: 4,096 bytes in 1 blocks
==111584==    indirectly lost: 0 bytes in 0 blocks
==111584==      possibly lost: 0 bytes in 0 blocks
==111584==    still reachable: 113 bytes in 6 blocks
==111584==         suppressed: 0 bytes in 0 blocks
==111584== Reachable blocks (those to which a pointer was found) are not shown.
==111584== To see them, rerun with: --leak-check=full --show-leak-kinds=all
==111584==
==111584== For lists of detected and suppressed errors, rerun with: -s
==111584== ERROR SUMMARY: 1000001 errors from 2 contexts (suppressed: 0 from 0)

view this post on Zulip Wasmtime GitHub notifications bot (Feb 22 2024 at 11:56):

meijies edited a comment on issue #7976:

==111797== Memcheck, a memory error detector
==111797== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==111797== Using Valgrind-3.18.1 and LibVEX; rerun with -h for copyright info
==111797== Command: ./target/debug/deps/experiment-56e243a5137b130b
==111797==

running 2 tests
test tests::it_works ... ok
==111797== Thread 2 expr::tests::te:
==111797== Invalid write of size 8
==111797==    at 0x4BE601F: ???
==111797==    by 0x191BF6: experiment::expr::jit_expr_on_array_v3 (expr.rs:314)
==111797==    by 0x192D7B: experiment::expr::tests::test_jit_expr_on_array_v3_64 (expr.rs:393)
==111797==    by 0x18F8E6: experiment::expr::tests::test_jit_expr_on_array_v3_64::{{closure}} (expr.rs:384)
==111797==    by 0x18F6A5: core::ops::function::FnOnce::call_once (function.rs:250)
==111797==    by 0x1CCF6E: test::__rust_begin_short_backtrace (function.rs:250)
==111797==    by 0x1CBDC0: test::run_test::{{closure}} (lib.rs:644)
==111797==    by 0x1931D5: std::sys_common::backtrace::__rust_begin_short_backtrace (lib.rs:595)
==111797==    by 0x198216: core::ops::function::FnOnce::call_once{{vtable.shim}} (mod.rs:529)
==111797==    by 0x7B8E54: std::sys::pal::unix::thread::Thread::new::thread_start (boxed.rs:2016)
==111797==    by 0x4A09AC2: start_thread (pthread_create.c:442)
==111797==    by 0x4A9AA03: clone (clone.S:100)
==111797==  Address 0x4be557a is 58 bytes inside a block of size 64 alloc'd
==111797==    at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==111797==    by 0x5B3257: alloc (alloc.rs:98)
==111797==    by 0x5B3257: alloc::alloc::Global::alloc_impl (alloc.rs:181)
==111797==    by 0x5B6A68: <alloc::alloc::Global as core::alloc::Allocator>::allocate (alloc.rs:241)
==111797==    by 0x2A6CE8: alloc::raw_vec::RawVec<T,A>::allocate_in (raw_vec.rs:199)
==111797==    by 0x18FDA5: with_capacity_in<bool, alloc::alloc::Global> (raw_vec.rs:145)
==111797==    by 0x18FDA5: with_capacity_in<bool, alloc::alloc::Global> (mod.rs:672)
==111797==    by 0x18FDA5: alloc::vec::Vec<T>::with_capacity (mod.rs:481)
==111797==    by 0x191B33: experiment::expr::jit_expr_on_array_v3 (expr.rs:312)
==111797==    by 0x192D7B: experiment::expr::tests::test_jit_expr_on_array_v3_64 (expr.rs:393)
==111797==    by 0x18F8E6: experiment::expr::tests::test_jit_expr_on_array_v3_64::{{closure}} (expr.rs:384)
==111797==    by 0x18F6A5: core::ops::function::FnOnce::call_once (function.rs:250)
==111797==    by 0x1CCF6E: test::__rust_begin_short_backtrace (function.rs:250)
==111797==    by 0x1CBDC0: test::run_test::{{closure}} (lib.rs:644)
==111797==    by 0x1931D5: std::sys_common::backtrace::__rust_begin_short_backtrace (lib.rs:595)
==111797==
test expr::tests::test_jit_expr_on_array_v3_64 ... ok

test result: ok. 2 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 13.16s

==111797==
==111797== HEAP SUMMARY:
==111797==     in use at exit: 4,209 bytes in 7 blocks
==111797==   total heap usage: 301,154 allocs, 301,147 frees, 18,607,330 bytes allocated
==111797==
==111797== Thread 1:
==111797== 8 bytes in 1 blocks are still reachable in loss record 1 of 7
==111797==    at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==111797==    by 0x5B3257: alloc (alloc.rs:98)
==111797==    by 0x5B3257: alloc::alloc::Global::alloc_impl (alloc.rs:181)
==111797==    by 0x5B2F4A: alloc::alloc::exchange_malloc (alloc.rs:241)
==111797==    by 0x634032: cranelift_codegen::isa::x64::abi::create_reg_env_systemv (abi.rs:1259)
==111797==    by 0x632C12: <cranelift_codegen::isa::x64::abi::X64ABIMachineSpec as cranelift_codegen::machinst::abi::ABIMachineSpec>::get_machine_env::{{closure}} (abi.rs:802)
==111797==    by 0x5C8469: std::sync::once_lock::OnceLock<T>::get_or_init::{{closure}} (once_lock.rs:250)
==111797==    by 0x5C8322: std::sync::once_lock::OnceLock<T>::initialize::{{closure}} (once_lock.rs:376)
==111797==    by 0x623452: std::sync::once::Once::call_once_force::{{closure}} (once.rs:208)
==111797==    by 0x527199: std::sys_common::once::futex::Once::call (futex.rs:124)
==111797==    by 0x6232BF: std::sync::once::Once::call_once_force (once.rs:208)
==111797==    by 0x5C818C: std::sync::once_lock::OnceLock<T>::initialize (once_lock.rs:375)
==111797==    by 0x5C8652: std::sync::once_lock::OnceLock<T>::get_or_try_init (once_lock.rs:298)
==111797==
==111797== 8 bytes in 1 blocks are still reachable in loss record 2 of 7
==111797==    at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==111797==    by 0x5B3257: alloc (alloc.rs:98)
==111797==    by 0x5B3257: alloc::alloc::Global::alloc_impl (alloc.rs:181)
==111797==    by 0x5B2F4A: alloc::alloc::exchange_malloc (alloc.rs:241)
==111797==    by 0x6344E7: cranelift_codegen::isa::x64::abi::create_reg_env_systemv (abi.rs:1282)
==111797==    by 0x632C12: <cranelift_codegen::isa::x64::abi::X64ABIMachineSpec as cranelift_codegen::machinst::abi::ABIMachineSpec>::get_machine_env::{{closure}} (abi.rs:802)
==111797==    by 0x5C8469: std::sync::once_lock::OnceLock<T>::get_or_init::{{closure}} (once_lock.rs:250)
==111797==    by 0x5C8322: std::sync::once_lock::OnceLock<T>::initialize::{{closure}} (once_lock.rs:376)
==111797==    by 0x623452: std::sync::once::Once::call_once_force::{{closure}} (once.rs:208)
==111797==    by 0x527199: std::sys_common::once::futex::Once::call (futex.rs:124)
==111797==    by 0x6232BF: std::sync::once::Once::call_once_force (once.rs:208)
==111797==    by 0x5C818C: std::sync::once_lock::OnceLock<T>::initialize (once_lock.rs:375)
==111797==    by 0x5C8652: std::sync::once_lock::OnceLock<T>::get_or_try_init (once_lock.rs:298)
==111797==
==111797== 8 bytes in 1 blocks are still reachable in loss record 3 of 7
==111797==    at 0x484DCD3: realloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==111797==    by 0x789503: realloc (alloc.rs:136)
==111797==    by 0x789503: alloc::alloc::Global::grow_impl (alloc.rs:213)
==111797==    by 0x7899DF: <alloc::alloc::Global as core::alloc::Allocator>::grow (alloc.rs:266)
==111797==    by 0x780FB9: alloc::raw_vec::finish_grow (raw_vec.rs:518)
==111797==    by 0x2C89B8: alloc::raw_vec::RawVec<T,A>::grow_amortized (raw_vec.rs:433)
==111797==    by 0x2D2538: alloc::raw_vec::RawVec<T,A>::reserve_for_push (raw_vec.rs:318)
==111797==    by 0x28E471: alloc::vec::Vec<T,A>::push (mod.rs:1919)
==111797==    by 0x634A37: cranelift_codegen::isa::x64::abi::create_reg_env_systemv (abi.rs:1301)
==111797==    by 0x632C12: <cranelift_codegen::isa::x64::abi::X64ABIMachineSpec as cranelift_codegen::machinst::abi::ABIMachineSpec>::get_machine_env::{{closure}} (abi.rs:802)
==111797==    by 0x5C8469: std::sync::once_lock::OnceLock<T>::get_or_init::{{closure}} (once_lock.rs:250)
==111797==    by 0x5C8322: std::sync::once_lock::OnceLock<T>::initialize::{{closure}} (once_lock.rs:376)
==111797==    by 0x623452: std::sync::once::Once::call_once_force::{{closure}} (once.rs:208)
==111797==
==111797== 9 bytes in 1 blocks are still reachable in loss record 4 of 7
==111797==    at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==111797==    by 0x5B3257: alloc (alloc.rs:98)
==111797==    by 0x5B3257: alloc::alloc::Global::alloc_impl (alloc.rs:181)
==111797==    by 0x5B2F4A: alloc::alloc::exchange_malloc (alloc.rs:241)
==111797==    by 0x633DF9: cranelift_codegen::isa::x64::abi::create_reg_env_systemv (abi.rs:1246)
==111797==    by 0x632C12: <cranelift_codegen::isa::x64::abi::X64ABIMachineSpec as cranelift_codegen::machinst::abi::ABIMachineSpec>::get_machine_env::{{closure}} (abi.rs:802)
==111797==    by 0x5C8469: std::sync::once_lock::OnceLock<T>::get_or_init::{{closure}} (once_lock.rs:250)
==111797==    by 0x5C8322: std::sync::once_lock::OnceLock<T>::initialize::{{closure}} (once_lock.rs:376)
==111797==    by 0x623452: std::sync::once::Once::call_once_force::{{closure}} (once.rs:208)
==111797==    by 0x527199: std::sys_common::once::futex::Once::call (futex.rs:124)
==111797==    by 0x6232BF: std::sync::once::Once::call_once_force (once.rs:208)
==111797==    by 0x5C818C: std::sync::once_lock::OnceLock<T>::initialize (once_lock.rs:375)
==111797==    by 0x5C8652: std::sync::once_lock::OnceLock<T>::get_or_try_init (once_lock.rs:298)
==111797==
==111797== 16 bytes in 1 blocks are still reachable in loss record 5 of 7
==111797==    at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==111797==    by 0x1C65F3: test::test_main (alloc.rs:98)
==111797==    by 0x1C75FC: test::test_main_static (lib.rs:162)
==111797==    by 0x191F52: experiment::main (lib.rs:1)
==111797==    by 0x18F71A: core::ops::function::FnOnce::call_once (function.rs:250)
==111797==    by 0x18FFDD: std::sys_common::backtrace::__rust_begin_short_backtrace (backtrace.rs:155)
==111797==    by 0x18FB00: std::rt::lang_start::{{closure}} (rt.rs:166)
==111797==    by 0x7A8BF0: std::rt::lang_start_internal (function.rs:284)
==111797==    by 0x18FAD9: std::rt::lang_start (rt.rs:165)
==111797==    by 0x191F7D: main (in /home/meijie/Work/query-compile-prototype/target/debug/deps/experiment-56e243a5137b130b)
==111797==
==111797== 64 bytes in 1 blocks are still reachable in loss record 6 of 7
==111797==    at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==111797==    by 0x5B3257: alloc (alloc.rs:98)
==111797==    by 0x5B3257: alloc::alloc::Global::alloc_impl (alloc.rs:181)
==111797==    by 0x5B2F4A: alloc::alloc::exchange_malloc (alloc.rs:241)
==111797==    by 0x5684F7: ahash::random_state::get_fixed_seeds::{{closure}} (boxed.rs:218)
==111797==    by 0x69B659: once_cell::race::once_box::Onc
[message truncated]

view this post on Zulip Wasmtime GitHub notifications bot (Feb 22 2024 at 12:10):

meijies edited a comment on issue #7976:

  query-compile-prototype git:(main)  valgrind --tool=memcheck  --leak-check=full --show-leak-kinds=all ./target/debug/deps/experiment-56e243a5137b130b
==112967== Memcheck, a memory error detector
==112967== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==112967== Using Valgrind-3.18.1 and LibVEX; rerun with -h for copyright info
==112967== Command: ./target/debug/deps/experiment-56e243a5137b130b
==112967==

running 2 tests
test tests::it_works ... ok
==112967== Thread 2 expr::tests::te:
==112967== Invalid write of size 8
==112967==    at 0x4BE601F: ???
==112967==    by 0x191C09: **experiment::expr::jit_expr_on_array_v3 (expr.rs:314)**
==112967==    by 0x192D9B: experiment::expr::tests::test_jit_expr_on_array_v3_64 (expr.rs:394)
==112967==    by 0x18F8E6: experiment::expr::tests::test_jit_expr_on_array_v3_64::{{closure}} (expr.rs:385)
==112967==    by 0x18F6A5: core::ops::function::FnOnce::call_once (function.rs:250)
==112967==    by 0x1CCF8E: test::__rust_begin_short_backtrace (function.rs:250)
==112967==    by 0x1CBDE0: test::run_test::{{closure}} (lib.rs:644)
==112967==    by 0x1931F5: std::sys_common::backtrace::__rust_begin_short_backtrace (lib.rs:595)
==112967==    by 0x198236: core::ops::function::FnOnce::call_once{{vtable.shim}} (mod.rs:529)
==112967==    by 0x7B8E74: std::sys::pal::unix::thread::Thread::new::thread_start (boxed.rs:2016)
==112967==    by 0x4A09AC2: start_thread (pthread_create.c:442)
==112967==    by 0x4A9AA03: clone (clone.S:100)
==112967==  Address 0x4be557a is 58 bytes inside a block of size 64 alloc'd
==112967==    at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==112967==    by 0x5B3277: alloc (alloc.rs:98)
==112967==    by 0x5B3277: alloc::alloc::Global::alloc_impl (alloc.rs:181)
==112967==    by 0x5B6A88: <alloc::alloc::Global as core::alloc::Allocator>::allocate (alloc.rs:241)
==112967==    by 0x2A6D08: alloc::raw_vec::RawVec<T,A>::allocate_in (raw_vec.rs:199)
==112967==    by 0x18FDA5: with_capacity_in<bool, alloc::alloc::Global> (raw_vec.rs:145)
==112967==    by 0x18FDA5: with_capacity_in<bool, alloc::alloc::Global> (mod.rs:672)
==112967==    by 0x18FDA5: alloc::vec::Vec<T>::with_capacity (mod.rs:481)
==112967==    by 0x191B4B: experiment::expr::jit_expr_on_array_v3 (expr.rs:312)
==112967==    by 0x192D9B: experiment::expr::tests::test_jit_expr_on_array_v3_64 (expr.rs:394)
==112967==    by 0x18F8E6: experiment::expr::tests::test_jit_expr_on_array_v3_64::{{closure}} (expr.rs:385)
==112967==    by 0x18F6A5: core::ops::function::FnOnce::call_once (function.rs:250)
==112967==    by 0x1CCF8E: test::__rust_begin_short_backtrace (function.rs:250)
==112967==    by 0x1CBDE0: test::run_test::{{closure}} (lib.rs:644)
==112967==    by 0x1931F5: std::sys_common::backtrace::__rust_begin_short_backtrace (lib.rs:595)
==112967==
test expr::tests::test_jit_expr_on_array_v3_64 ... ok

test result: ok. 2 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 6.78s

![image](https://github.com/bytecodealliance/wasmtime/assets/13784260/d84f7348-70f8-43de-afb8-5ffcc0040838)


Last updated: Dec 23 2024 at 12:05 UTC