meijies added the cranelift label to Issue #7976.
meijies added the bug label to Issue #7976.
meijies opened issue #7976:
Summary
I am trying to compile and execute a function that evaluates the expression
(a + b) / c < d
on each element of the arrays. First, I generated the IR for this function using the F64 type, and that works fine. Then I converted the function to an F64X2 variant to see whether there is any performance improvement, but an unexpected memory problem occurs. My first intuition is that the problem lies in the function call section, but I could not find anything wrong there.
failure
.clif
Test Case for F64X2
pushq %rbp unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } movq %rsp, %rbp unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } block0: movq %rdx, %rax vpshufd $68, %xmm0, %xmm3 vpshufd $68, %xmm1, %xmm2 jmp label1 block1: vmovupd 0(%rdi), %xmm4 vaddpd %xmm4, 0(%rsi), %xmm4 vdivpd %xmm4, %xmm3, %xmm4 vcmppd $1, %xmm4, %xmm2, %xmm4 vmovdqu %xmm4, 0(%rdx) lea 1(%rcx), %rcx lea 16(%rdi), %rdi lea 16(%rsi), %rsi lea 2(%rdx), %rdx cmpq %r8, %rcx jl label2; j label3 block2: jmp label1 block3: movq %rbp, %rsp popq %rbp ret
Steps to Reproduce
- the test cases.
#[test] fn test_jit_expr_on_array_v3_64() { let BATCH_SIZE = 64; let a = create_primitive_array::<Float64Type>(BATCH_SIZE, 0.); let b = create_primitive_array::<Float64Type>(BATCH_SIZE, 0.); let c = 3.0_f64; let d = 3.0_f64; // jit compiled function. let op = jit_expr_v3(); for _ in 0..100000 { let res = jit_expr_on_array_v3(&a, &b, c, d, op).unwrap(); let (values, _) = res.into_parts(); } }
The problem occurs in the third iteration; below is the debug screenshot.
![image](https://github.com/bytecodealliance/wasmtime/assets/13784260/3a2d97c6-b616-4902-8fca-7b6730d9e26d)
- jit compiled function.
pub fn jit_expr_v3() -> fn(*const u8, *const u8, *const bool, f64, f64, i64, i64) { let mut ctx = CodegenContext::builder().debug().finish(); let data_type = types::F64X2; let result_type = types::I8X2; let mut func_ctx = ctx.create_func_gen_ctx( "op_v3", vec![ AbiParam::new(ctx.ptype()), AbiParam::new(ctx.ptype()), AbiParam::special(ctx.ptype(), ArgumentPurpose::StructReturn), AbiParam::new(types::F64), AbiParam::new(types::F64), AbiParam::new(types::I64), AbiParam::new(types::I64), ], vec![], ); let entry_block = func_ctx.builder.create_block(); let body_block = func_ctx.builder.create_block(); let exit_block = func_ctx.builder.create_block(); func_ctx.builder.switch_to_block(entry_block); func_ctx .builder .append_block_params_for_function_params(entry_block); let p1 = func_ctx.builder.block_params(entry_block)[0]; let p2 = func_ctx.builder.block_params(entry_block)[1]; let p3 = func_ctx.builder.block_params(entry_block)[2]; let p4 = func_ctx.builder.block_params(entry_block)[3]; let p5 = func_ctx.builder.block_params(entry_block)[4]; let p6 = func_ctx.builder.block_params(entry_block)[5]; let p7 = func_ctx.builder.block_params(entry_block)[6]; let p4 = func_ctx.builder.ins().splat(data_type, p4); let p5 = func_ctx.builder.ins().splat(data_type, p5); func_ctx .builder .append_block_param(body_block, func_ctx.ptype); func_ctx .builder .append_block_param(body_block, func_ctx.ptype); func_ctx .builder .append_block_param(body_block, func_ctx.ptype); func_ctx.builder.append_block_param(body_block, data_type); func_ctx.builder.append_block_param(body_block, data_type); func_ctx.builder.append_block_param(body_block, types::I64); func_ctx.builder.append_block_param(body_block, types::I64); func_ctx .builder .ins() .jump(body_block, &[p1, p2, p3, p4, p5, p6, p7]); func_ctx.builder.seal_block(entry_block); func_ctx.builder.switch_to_block(body_block); let lhs_ref = func_ctx.builder.block_params(body_block)[0]; let rhs_ref = func_ctx.builder.block_params(body_block)[1]; 
let result_ref = func_ctx.builder.block_params(body_block)[2]; let to_div = func_ctx.builder.block_params(body_block)[3]; let to_lt = func_ctx.builder.block_params(body_block)[4]; let start = func_ctx.builder.block_params(body_block)[5]; let end = func_ctx.builder.block_params(body_block)[6]; let lhs = func_ctx .builder .ins() .load(data_type, MemFlags::new(), lhs_ref, 0); let rhs = func_ctx .builder .ins() .load(data_type, MemFlags::new(), rhs_ref, 0); let sum = func_ctx.builder.ins().fadd(lhs, rhs); let div_result = func_ctx.builder.ins().fdiv(sum, to_div); let result = func_ctx .builder .ins() .fcmp(FloatCC::LessThan, div_result, to_lt); func_ctx .builder .ins() .store(MemFlags::new(), result, result_ref, 0); let offset = func_ctx .builder .ins() .iconst(types::I64, data_type.bytes() as i64); let result_offset = func_ctx .builder .ins() .iconst(types::I64, result_type.bytes() as i64); let next_lhs_ref = func_ctx.builder.ins().iadd(offset, lhs_ref); let next_rhs_ref = func_ctx.builder.ins().iadd(offset, rhs_ref); let next_result_ref = func_ctx.builder.ins().iadd(result_offset, result_ref); let next_start = func_ctx.builder.ins().iadd_imm(start, 1); let cond = func_ctx .builder .ins() .icmp(IntCC::SignedLessThan, next_start, end); func_ctx.builder.ins().brif( cond, body_block, &[ next_lhs_ref, next_rhs_ref, next_result_ref, to_div, to_lt, next_start, end, ], exit_block, &[], ); func_ctx.builder.switch_to_block(exit_block); let func_id = func_ctx.finalize(&[]); let code = ctx.finalize(func_id); unsafe { mem::transmute::<_, fn(*const u8, *const u8, *const bool, f64, f64, i64, i64)>(code) } }
- call the JIT-compiled function
pub fn jit_expr_on_array_v3( a: &Float64Array, b: &Float64Array, c: f64, d: f64, op: fn(*const u8, *const u8, *const bool, f64, f64, i64, i64), ) -> Result<BooleanArray, ArrowError> { if a.len() != b.len() { return Err(ArrowError::ComputeError( "Cannot perform binary operation on arrays of different length".to_string(), )); } if a.is_empty() { return Err(ArrowError::ComputeError( "Cannot perform binary operation on arrays of different length".to_string(), )); } let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()); let a_ptr = a.values().inner().as_ptr(); let b_ptr = b.values().inner().as_ptr(); let mut res: Vec<bool> = Vec::with_capacity((a.len() / 2) * 2); let res_ptr = res.as_ptr(); op(a_ptr, b_ptr, res_ptr, c, d, 0, (a.len() / 2) as i64); unsafe { res.set_len((a.len() / 2) * 2); } let buffer = BooleanBuffer::from_iter(res); Ok(BooleanArray::new(buffer, nulls)) }
successful
.clif
Test Case for F64
pushq %rbp unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } movq %rsp, %rbp unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } block0: movq %rdx, %r9 movq %r9, %rax jmp label1 block1: vmovsd 0(%rdi), %xmm2 vaddsd %xmm2, 0(%rsi), %xmm2 vdivsd %xmm2, %xmm0, %xmm2 ucomisd %xmm2, %xmm1 setnbe %r10b movb %r10b, 0(%rax) lea 1(%rcx), %rcx lea 8(%rdi), %rdi lea 8(%rsi), %rsi lea 1(%rax), %rax cmpq %r8, %rcx jl label2; j label3 block2: jmp label1 block3: movq %r9, %rax movq %rbp, %rsp popq %rbp ret
Steps to Reproduce
1) the test cases.
fn test_jit_expr_on_array_v3_64() { let BATCH_SIZE = 64; let a = create_primitive_array::<Float64Type>(BATCH_SIZE, 0.); let b = create_primitive_array::<Float64Type>(BATCH_SIZE, 0.); let c = 3.0_f64; let d = 3.0_f64; let op = jit_expr_v3(); for _ in 0..100000 { let res = jit_expr_on_array_v3(&a, &b, c, d, op).unwrap(); let (values, _) = res.into_parts(); } }
pub fn jit_expr_on_array_v3( a: &Float64Array, b: &Float64Array, c: f64, d: f64, op: fn(*const u8, *const u8, *const bool, f64, f64, i64, i64), ) -> Result<BooleanArray, ArrowError> { if a.len() != b.len() { return Err(ArrowError::ComputeError( "Cannot perform binary operation on arrays of different length".to_string(), )); } if a.is_empty() { return Err(ArrowError::ComputeError( "Cannot perform binary operation on arrays of different length".to_string(), )); } let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()); let a_ptr = a.values().inner().as_ptr(); let b_ptr = b.values().inner().as_ptr(); let mut res: Vec<bool> = Vec::with_capacity(a.len()); let res_ptr = res.as_ptr(); op(a_ptr, b_ptr, res_ptr, c, d, 0, (a.len()) as i64); unsafe { res.set_len(a.len()); } let buffer = BooleanBuffer::from_iter(res); Ok(Boole [message truncated]
meijies edited issue #7976:
Summary
I am trying to compile and execute a function that evaluates the expression
(a + b) / c < d
on each element of the arrays. First, I generated the IR for this function using the F64 type, and that works fine. Then I converted the function to an F64X2 variant to see whether there is any performance improvement, but an unexpected memory problem occurs. My first intuition is that the problem lies in the function call section, but I could not find anything wrong there.
failure
.clif
Test Case for F64X2
pushq %rbp unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } movq %rsp, %rbp unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } block0: movq %rdx, %rax vpshufd $68, %xmm0, %xmm3 vpshufd $68, %xmm1, %xmm2 jmp label1 block1: vmovupd 0(%rdi), %xmm4 vaddpd %xmm4, 0(%rsi), %xmm4 vdivpd %xmm4, %xmm3, %xmm4 vcmppd $1, %xmm4, %xmm2, %xmm4 vmovdqu %xmm4, 0(%rdx) lea 1(%rcx), %rcx lea 16(%rdi), %rdi lea 16(%rsi), %rsi lea 2(%rdx), %rdx cmpq %r8, %rcx jl label2; j label3 block2: jmp label1 block3: movq %rbp, %rsp popq %rbp ret
Steps to Reproduce
- the test cases.
#[test] fn test_jit_expr_on_array_v3_64() { let BATCH_SIZE = 64; let a = create_primitive_array::<Float64Type>(BATCH_SIZE, 0.); let b = create_primitive_array::<Float64Type>(BATCH_SIZE, 0.); let c = 3.0_f64; let d = 3.0_f64; // jit compiled function. let op = jit_expr_v3(); for _ in 0..100000 { let res = jit_expr_on_array_v3(&a, &b, c, d, op).unwrap(); let (values, _) = res.into_parts(); } }
The problem occurs in the third iteration; below is the debug screenshot.
![image](https://github.com/bytecodealliance/wasmtime/assets/13784260/3a2d97c6-b616-4902-8fca-7b6730d9e26d)
- jit compiled function.
pub fn jit_expr_v3() -> fn(*const u8, *const u8, *const bool, f64, f64, i64, i64) { let mut ctx = CodegenContext::builder().debug().finish(); let data_type = types::F64X2; let result_type = types::I8X2; let mut func_ctx = ctx.create_func_gen_ctx( "op_v3", vec![ AbiParam::new(ctx.ptype()), AbiParam::new(ctx.ptype()), AbiParam::special(ctx.ptype(), ArgumentPurpose::StructReturn), AbiParam::new(types::F64), AbiParam::new(types::F64), AbiParam::new(types::I64), AbiParam::new(types::I64), ], vec![], ); let entry_block = func_ctx.builder.create_block(); let body_block = func_ctx.builder.create_block(); let exit_block = func_ctx.builder.create_block(); func_ctx.builder.switch_to_block(entry_block); func_ctx .builder .append_block_params_for_function_params(entry_block); let p1 = func_ctx.builder.block_params(entry_block)[0]; let p2 = func_ctx.builder.block_params(entry_block)[1]; let p3 = func_ctx.builder.block_params(entry_block)[2]; let p4 = func_ctx.builder.block_params(entry_block)[3]; let p5 = func_ctx.builder.block_params(entry_block)[4]; let p6 = func_ctx.builder.block_params(entry_block)[5]; let p7 = func_ctx.builder.block_params(entry_block)[6]; let p4 = func_ctx.builder.ins().splat(data_type, p4); let p5 = func_ctx.builder.ins().splat(data_type, p5); func_ctx .builder .append_block_param(body_block, func_ctx.ptype); func_ctx .builder .append_block_param(body_block, func_ctx.ptype); func_ctx .builder .append_block_param(body_block, func_ctx.ptype); func_ctx.builder.append_block_param(body_block, data_type); func_ctx.builder.append_block_param(body_block, data_type); func_ctx.builder.append_block_param(body_block, types::I64); func_ctx.builder.append_block_param(body_block, types::I64); func_ctx .builder .ins() .jump(body_block, &[p1, p2, p3, p4, p5, p6, p7]); func_ctx.builder.seal_block(entry_block); func_ctx.builder.switch_to_block(body_block); let lhs_ref = func_ctx.builder.block_params(body_block)[0]; let rhs_ref = func_ctx.builder.block_params(body_block)[1]; 
let result_ref = func_ctx.builder.block_params(body_block)[2]; let to_div = func_ctx.builder.block_params(body_block)[3]; let to_lt = func_ctx.builder.block_params(body_block)[4]; let start = func_ctx.builder.block_params(body_block)[5]; let end = func_ctx.builder.block_params(body_block)[6]; let lhs = func_ctx .builder .ins() .load(data_type, MemFlags::new(), lhs_ref, 0); let rhs = func_ctx .builder .ins() .load(data_type, MemFlags::new(), rhs_ref, 0); let sum = func_ctx.builder.ins().fadd(lhs, rhs); let div_result = func_ctx.builder.ins().fdiv(sum, to_div); let result = func_ctx .builder .ins() .fcmp(FloatCC::LessThan, div_result, to_lt); func_ctx .builder .ins() .store(MemFlags::new(), result, result_ref, 0); let offset = func_ctx .builder .ins() .iconst(types::I64, data_type.bytes() as i64); let result_offset = func_ctx .builder .ins() .iconst(types::I64, result_type.bytes() as i64); let next_lhs_ref = func_ctx.builder.ins().iadd(offset, lhs_ref); let next_rhs_ref = func_ctx.builder.ins().iadd(offset, rhs_ref); let next_result_ref = func_ctx.builder.ins().iadd(result_offset, result_ref); let next_start = func_ctx.builder.ins().iadd_imm(start, 1); let cond = func_ctx .builder .ins() .icmp(IntCC::SignedLessThan, next_start, end); func_ctx.builder.ins().brif( cond, body_block, &[ next_lhs_ref, next_rhs_ref, next_result_ref, to_div, to_lt, next_start, end, ], exit_block, &[], ); func_ctx.builder.switch_to_block(exit_block); let func_id = func_ctx.finalize(&[]); let code = ctx.finalize(func_id); unsafe { mem::transmute::<_, fn(*const u8, *const u8, *const bool, f64, f64, i64, i64)>(code) } }
- call the JIT-compiled function
pub fn jit_expr_on_array_v3( a: &Float64Array, b: &Float64Array, c: f64, d: f64, op: fn(*const u8, *const u8, *const bool, f64, f64, i64, i64), ) -> Result<BooleanArray, ArrowError> { if a.len() != b.len() { return Err(ArrowError::ComputeError( "Cannot perform binary operation on arrays of different length".to_string(), )); } if a.is_empty() { return Err(ArrowError::ComputeError( "Cannot perform binary operation on arrays of different length".to_string(), )); } let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()); let a_ptr = a.values().inner().as_ptr(); let b_ptr = b.values().inner().as_ptr(); let mut res: Vec<bool> = Vec::with_capacity((a.len() / 2) * 2); let res_ptr = res.as_ptr(); op(a_ptr, b_ptr, res_ptr, c, d, 0, (a.len() / 2) as i64); unsafe { res.set_len((a.len() / 2) * 2); } let buffer = BooleanBuffer::from_iter(res); Ok(BooleanArray::new(buffer, nulls)) }
Also attaching a working version.
successful
.clif
Test Case for F64
pushq %rbp unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } movq %rsp, %rbp unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } block0: movq %rdx, %r9 movq %r9, %rax jmp label1 block1: vmovsd 0(%rdi), %xmm2 vaddsd %xmm2, 0(%rsi), %xmm2 vdivsd %xmm2, %xmm0, %xmm2 ucomisd %xmm2, %xmm1 setnbe %r10b movb %r10b, 0(%rax) lea 1(%rcx), %rcx lea 8(%rdi), %rdi lea 8(%rsi), %rsi lea 1(%rax), %rax cmpq %r8, %rcx jl label2; j label3 block2: jmp label1 block3: movq %r9, %rax movq %rbp, %rsp popq %rbp ret
Steps to Reproduce
1) the test cases.
fn test_jit_expr_on_array_v3_64() { let BATCH_SIZE = 64; let a = create_primitive_array::<Float64Type>(BATCH_SIZE, 0.); let b = create_primitive_array::<Float64Type>(BATCH_SIZE, 0.); let c = 3.0_f64; let d = 3.0_f64; let op = jit_expr_v3(); for _ in 0..100000 { let res = jit_expr_on_array_v3(&a, &b, c, d, op).unwrap(); let (values, _) = res.into_parts(); } }
pub fn jit_expr_on_array_v3( a: &Float64Array, b: &Float64Array, c: f64, d: f64, op: fn(*const u8, *const u8, *const bool, f64, f64, i64, i64), ) -> Result<BooleanArray, ArrowError> { if a.len() != b.len() { return Err(ArrowError::ComputeError( "Cannot perform binary operation on arrays of different length".to_string(), )); } if a.is_empty() { return Err(ArrowError::ComputeError( "Cannot perform binary operation on arrays of different length".to_string(), )); } let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()); let a_ptr = a.values().inner().as_ptr(); let b_ptr = b.values().inner().as_ptr(); let mut res: Vec<bool> = Vec::with_capacity(a.len()); let res_ptr = res.as_ptr(); op(a_ptr, b_ptr, res_ptr, c, d, 0, (a.len()) as i64); unsafe { res.set_len(a.len()); } let buffer = Boole [message truncated]
meijies edited issue #7976:
Summary
I am trying to compile and execute a function that evaluates the expression
(a + b) / c < d
on each element of the arrays. First, I generated the IR for this function using the F64 type, and that works fine. Then I converted the function to an F64X2 variant to see whether there is any performance improvement, but an unexpected memory problem occurs. My first intuition is that the problem lies in the function call section, but I could not find anything wrong there. So the biggest problem is that I don't know how to troubleshoot this type of issue.
failure
.clif
Test Case for F64X2
pushq %rbp unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } movq %rsp, %rbp unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } block0: movq %rdx, %rax vpshufd $68, %xmm0, %xmm3 vpshufd $68, %xmm1, %xmm2 jmp label1 block1: vmovupd 0(%rdi), %xmm4 vaddpd %xmm4, 0(%rsi), %xmm4 vdivpd %xmm4, %xmm3, %xmm4 vcmppd $1, %xmm4, %xmm2, %xmm4 vmovdqu %xmm4, 0(%rdx) lea 1(%rcx), %rcx lea 16(%rdi), %rdi lea 16(%rsi), %rsi lea 2(%rdx), %rdx cmpq %r8, %rcx jl label2; j label3 block2: jmp label1 block3: movq %rbp, %rsp popq %rbp ret
Steps to Reproduce
- the test cases.
#[test] fn test_jit_expr_on_array_v3_64() { let BATCH_SIZE = 64; let a = create_primitive_array::<Float64Type>(BATCH_SIZE, 0.); let b = create_primitive_array::<Float64Type>(BATCH_SIZE, 0.); let c = 3.0_f64; let d = 3.0_f64; // jit compiled function. let op = jit_expr_v3(); for _ in 0..100000 { let res = jit_expr_on_array_v3(&a, &b, c, d, op).unwrap(); let (values, _) = res.into_parts(); } }
The problem occurs in the third iteration; below is the debug screenshot.
![image](https://github.com/bytecodealliance/wasmtime/assets/13784260/3a2d97c6-b616-4902-8fca-7b6730d9e26d)
- jit compiled function.
pub fn jit_expr_v3() -> fn(*const u8, *const u8, *const bool, f64, f64, i64, i64) { let mut ctx = CodegenContext::builder().debug().finish(); let data_type = types::F64X2; let result_type = types::I8X2; let mut func_ctx = ctx.create_func_gen_ctx( "op_v3", vec![ AbiParam::new(ctx.ptype()), AbiParam::new(ctx.ptype()), AbiParam::special(ctx.ptype(), ArgumentPurpose::StructReturn), AbiParam::new(types::F64), AbiParam::new(types::F64), AbiParam::new(types::I64), AbiParam::new(types::I64), ], vec![], ); let entry_block = func_ctx.builder.create_block(); let body_block = func_ctx.builder.create_block(); let exit_block = func_ctx.builder.create_block(); func_ctx.builder.switch_to_block(entry_block); func_ctx .builder .append_block_params_for_function_params(entry_block); let p1 = func_ctx.builder.block_params(entry_block)[0]; let p2 = func_ctx.builder.block_params(entry_block)[1]; let p3 = func_ctx.builder.block_params(entry_block)[2]; let p4 = func_ctx.builder.block_params(entry_block)[3]; let p5 = func_ctx.builder.block_params(entry_block)[4]; let p6 = func_ctx.builder.block_params(entry_block)[5]; let p7 = func_ctx.builder.block_params(entry_block)[6]; let p4 = func_ctx.builder.ins().splat(data_type, p4); let p5 = func_ctx.builder.ins().splat(data_type, p5); func_ctx .builder .append_block_param(body_block, func_ctx.ptype); func_ctx .builder .append_block_param(body_block, func_ctx.ptype); func_ctx .builder .append_block_param(body_block, func_ctx.ptype); func_ctx.builder.append_block_param(body_block, data_type); func_ctx.builder.append_block_param(body_block, data_type); func_ctx.builder.append_block_param(body_block, types::I64); func_ctx.builder.append_block_param(body_block, types::I64); func_ctx .builder .ins() .jump(body_block, &[p1, p2, p3, p4, p5, p6, p7]); func_ctx.builder.seal_block(entry_block); func_ctx.builder.switch_to_block(body_block); let lhs_ref = func_ctx.builder.block_params(body_block)[0]; let rhs_ref = func_ctx.builder.block_params(body_block)[1]; 
let result_ref = func_ctx.builder.block_params(body_block)[2]; let to_div = func_ctx.builder.block_params(body_block)[3]; let to_lt = func_ctx.builder.block_params(body_block)[4]; let start = func_ctx.builder.block_params(body_block)[5]; let end = func_ctx.builder.block_params(body_block)[6]; let lhs = func_ctx .builder .ins() .load(data_type, MemFlags::new(), lhs_ref, 0); let rhs = func_ctx .builder .ins() .load(data_type, MemFlags::new(), rhs_ref, 0); let sum = func_ctx.builder.ins().fadd(lhs, rhs); let div_result = func_ctx.builder.ins().fdiv(sum, to_div); let result = func_ctx .builder .ins() .fcmp(FloatCC::LessThan, div_result, to_lt); func_ctx .builder .ins() .store(MemFlags::new(), result, result_ref, 0); let offset = func_ctx .builder .ins() .iconst(types::I64, data_type.bytes() as i64); let result_offset = func_ctx .builder .ins() .iconst(types::I64, result_type.bytes() as i64); let next_lhs_ref = func_ctx.builder.ins().iadd(offset, lhs_ref); let next_rhs_ref = func_ctx.builder.ins().iadd(offset, rhs_ref); let next_result_ref = func_ctx.builder.ins().iadd(result_offset, result_ref); let next_start = func_ctx.builder.ins().iadd_imm(start, 1); let cond = func_ctx .builder .ins() .icmp(IntCC::SignedLessThan, next_start, end); func_ctx.builder.ins().brif( cond, body_block, &[ next_lhs_ref, next_rhs_ref, next_result_ref, to_div, to_lt, next_start, end, ], exit_block, &[], ); func_ctx.builder.switch_to_block(exit_block); let func_id = func_ctx.finalize(&[]); let code = ctx.finalize(func_id); unsafe { mem::transmute::<_, fn(*const u8, *const u8, *const bool, f64, f64, i64, i64)>(code) } }
- call the JIT-compiled function
pub fn jit_expr_on_array_v3( a: &Float64Array, b: &Float64Array, c: f64, d: f64, op: fn(*const u8, *const u8, *const bool, f64, f64, i64, i64), ) -> Result<BooleanArray, ArrowError> { if a.len() != b.len() { return Err(ArrowError::ComputeError( "Cannot perform binary operation on arrays of different length".to_string(), )); } if a.is_empty() { return Err(ArrowError::ComputeError( "Cannot perform binary operation on arrays of different length".to_string(), )); } let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()); let a_ptr = a.values().inner().as_ptr(); let b_ptr = b.values().inner().as_ptr(); let mut res: Vec<bool> = Vec::with_capacity((a.len() / 2) * 2); let res_ptr = res.as_ptr(); op(a_ptr, b_ptr, res_ptr, c, d, 0, (a.len() / 2) as i64); unsafe { res.set_len((a.len() / 2) * 2); } let buffer = BooleanBuffer::from_iter(res); Ok(BooleanArray::new(buffer, nulls)) }
Also attaching a working version.
successful
.clif
Test Case for F64
pushq %rbp unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } movq %rsp, %rbp unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 0 } block0: movq %rdx, %r9 movq %r9, %rax jmp label1 block1: vmovsd 0(%rdi), %xmm2 vaddsd %xmm2, 0(%rsi), %xmm2 vdivsd %xmm2, %xmm0, %xmm2 ucomisd %xmm2, %xmm1 setnbe %r10b movb %r10b, 0(%rax) lea 1(%rcx), %rcx lea 8(%rdi), %rdi lea 8(%rsi), %rsi lea 1(%rax), %rax cmpq %r8, %rcx jl label2; j label3 block2: jmp label1 block3: movq %r9, %rax movq %rbp, %rsp popq %rbp ret
Steps to Reproduce
1) the test cases.
fn test_jit_expr_on_array_v3_64() { let BATCH_SIZE = 64; let a = create_primitive_array::<Float64Type>(BATCH_SIZE, 0.); let b = create_primitive_array::<Float64Type>(BATCH_SIZE, 0.); let c = 3.0_f64; let d = 3.0_f64; let op = jit_expr_v3(); for _ in 0..100000 { let res = jit_expr_on_array_v3(&a, &b, c, d, op).unwrap(); let (values, _) = res.into_parts(); } }
pub fn jit_expr_on_array_v3( a: &Float64Array, b: &Float64Array, c: f64, d: f64, op: fn(*const u8, *const u8, *const bool, f64, f64, i64, i64), ) -> Result<BooleanArray, ArrowError> { if a.len() != b.len() { return Err(ArrowError::ComputeError( "Cannot perform binary operation on arrays of different length".to_string(), )); } if a.is_empty() { return Err(ArrowError::ComputeError( "Cannot perform binary operation on arrays of different length".to_string(), )); } let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()); let a_ptr = a.values().inner().as_ptr(); let b_ptr = b.values().inner().as_ptr(); let mut res: Vec<bool> = Vec::with_capacity(a.len()); let res_ptr = res.as_ptr(); op(a_ptr, b_ptr, res_ptr, c, d, 0, (a.len()) [message truncated]
bjorn3 commented on issue #7976:
One problem here is that you use `fn(...)` instead of `extern "C" fn(...)` as the function signature for the jitted function. The former will use the Rust ABI, which is unstable. This is likely not the cause of the crash though.
Try turning the `ArgumentPurpose::StructReturn` argument into a regular argument. `ArgumentPurpose::StructReturn` may cause the argument to be passed differently from a regular argument depending on the calling convention. In addition, it needs to be the first argument. In your code you don't need it, as you can define how to pass return values yourself. You aren't restricted to matching the ABI of an existing C function.
meijies commented on issue #7976:
@bjorn3 I have changed my code as you suggested, but it still doesn't work. Then I found that the libc pthread error "invalid next size (fast)" is usually caused by an illegal write to memory, so I increased the capacity of the result vector and the test passed, but I don't know why.
- let mut res: Vec<bool> = Vec::with_capacity((a.len() / 2) * 2);
+ let mut res: Vec<bool> = Vec::with_capacity((a.len() / 2) * 2+10);
meijies edited a comment on issue #7976:
@bjorn3 I have changed my code as you suggested, but it still doesn't work. Then I found that the libc pthread error "invalid next size (fast)" is usually due to an illegal write to memory, so I changed the result vector capacity and the test passed, but I don't know why the code writes beyond the boundaries.
- let mut res: Vec<bool> = Vec::with_capacity((a.len() / 2) * 2); + let mut res: Vec<bool> = Vec::with_capacity((a.len() / 2) * 2+10);
bjorn3 commented on issue #7976:
a.len() / 2 * 2 rounds down. If you meant to round up you need to add 1 before dividing.
meijies commented on issue #7976:
Yes, a.len() / 2 * 2 rounds down, which may result in one fewer element being calculated, but it shouldn't lead to writing beyond the boundary. Furthermore, the memory issue occurs randomly, and the test still fails even after increasing the capacity to (a.len() / 2) * 2+4.
- let mut res: Vec<bool> = Vec::with_capacity((a.len() / 2) * 2); + let mut res: Vec<bool> = Vec::with_capacity((a.len() / 2) * 2+4);
bjorn3 commented on issue #7976:
Maybe you could try running in valgrind to see if it reports the issue? Valgrind can handle JIT compilation just fine unlike asan and miri.
meijies commented on issue #7976:
==108775== Memcheck, a memory error detector ==108775== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al. ==108775== Using Valgrind-3.18.1 and LibVEX; rerun with -h for copyright info ==108775== Command: ./target/debug/deps/experiment-56e243a5137b130b ==108775== running 2 tests test tests::it_works ... ok ==108775== Thread 2 expr::tests::te: ==108775== Invalid write of size 8 ==108775== at 0x4BE601F: ??? ==108775== by 0x191C2B: experiment::expr::jit_expr_on_array_v3 (expr.rs:314) ==108775== by 0x192DAB: experiment::expr::tests::test_jit_expr_on_array_v3_64 (expr.rs:393) ==108775== by 0x18F8E6: experiment::expr::tests::test_jit_expr_on_array_v3_64::{{closure}} (expr.rs:384) ==108775== by 0x18F6A5: core::ops::function::FnOnce::call_once (function.rs:250) ==108775== by 0x1CCF9E: test::__rust_begin_short_backtrace (function.rs:250) ==108775== by 0x1CBDF0: test::run_test::{{closure}} (lib.rs:644) ==108775== by 0x193205: std::sys_common::backtrace::__rust_begin_short_backtrace (lib.rs:595) ==108775== by 0x198246: core::ops::function::FnOnce::call_once{{vtable.shim}} (mod.rs:529) ==108775== by 0x7B8E84: std::sys::pal::unix::thread::Thread::new::thread_start (boxed.rs:2016) ==108775== by 0x4A09AC2: start_thread (pthread_create.c:442) ==108775== by 0x4A9AA03: clone (clone.S:100) ==108775== Address 0x4be5584 is 68 bytes inside a block of size 74 alloc'd ==108775== at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so) ==108775== by 0x5B3287: alloc (alloc.rs:98) ==108775== by 0x5B3287: alloc::alloc::Global::alloc_impl (alloc.rs:181) ==108775== by 0x5B6A98: <alloc::alloc::Global as core::alloc::Allocator>::allocate (alloc.rs:241) ==108775== by 0x2A6D18: alloc::raw_vec::RawVec<T,A>::allocate_in (raw_vec.rs:199) ==108775== by 0x18FDA5: with_capacity_in<bool, alloc::alloc::Global> (raw_vec.rs:145) ==108775== by 0x18FDA5: with_capacity_in<bool, alloc::alloc::Global> (mod.rs:672) ==108775== by 0x18FDA5: alloc::vec::Vec<T>::with_capacity 
(mod.rs:481) ==108775== by 0x191B6A: experiment::expr::jit_expr_on_array_v3 (expr.rs:312) ==108775== by 0x192DAB: experiment::expr::tests::test_jit_expr_on_array_v3_64 (expr.rs:393) ==108775== by 0x18F8E6: experiment::expr::tests::test_jit_expr_on_array_v3_64::{{closure}} (expr.rs:384) ==108775== by 0x18F6A5: core::ops::function::FnOnce::call_once (function.rs:250) ==108775== by 0x1CCF9E: test::__rust_begin_short_backtrace (function.rs:250) ==108775== by 0x1CBDF0: test::run_test::{{closure}} (lib.rs:644) ==108775== by 0x193205: std::sys_common::backtrace::__rust_begin_short_backtrace (lib.rs:595) ==108775== test expr::tests::test_jit_expr_on_array_v3_64 ... ok test result: ok. 2 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 12.82s ==108775== ==108775== HEAP SUMMARY: ==108775== in use at exit: 4,209 bytes in 7 blocks ==108775== total heap usage: 301,154 allocs, 301,147 frees, 19,607,330 bytes allocated ==108775== ==108775== Thread 1: ==108775== 8 bytes in 1 blocks are still reachable in loss record 1 of 7 ==108775== at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so) ==108775== by 0x5B3287: alloc (alloc.rs:98) ==108775== by 0x5B3287: alloc::alloc::Global::alloc_impl (alloc.rs:181) ==108775== by 0x5B2F7A: alloc::alloc::exchange_malloc (alloc.rs:241) ==108775== by 0x634062: cranelift_codegen::isa::x64::abi::create_reg_env_systemv (abi.rs:1259) ==108775== by 0x632C42: <cranelift_codegen::isa::x64::abi::X64ABIMachineSpec as cranelift_codegen::machinst::abi::ABIMachineSpec>::get_machine_env::{{closure}} (abi.rs:802) ==108775== by 0x5C8499: std::sync::once_lock::OnceLock<T>::get_or_init::{{closure}} (once_lock.rs:250) ==108775== by 0x5C8352: std::sync::once_lock::OnceLock<T>::initialize::{{closure}} (once_lock.rs:376) ==108775== by 0x623482: std::sync::once::Once::call_once_force::{{closure}} (once.rs:208) ==108775== by 0x5271C9: std::sys_common::once::futex::Once::call (futex.rs:124) ==108775== by 0x6232EF: 
std::sync::once::Once::call_once_force (once.rs:208) ==108775== by 0x5C81BC: std::sync::once_lock::OnceLock<T>::initialize (once_lock.rs:375) ==108775== by 0x5C8682: std::sync::once_lock::OnceLock<T>::get_or_try_init (once_lock.rs:298) ==108775== ==108775== 8 bytes in 1 blocks are still reachable in loss record 2 of 7 ==108775== at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so) ==108775== by 0x5B3287: alloc (alloc.rs:98) ==108775== by 0x5B3287: alloc::alloc::Global::alloc_impl (alloc.rs:181) ==108775== by 0x5B2F7A: alloc::alloc::exchange_malloc (alloc.rs:241) ==108775== by 0x634517: cranelift_codegen::isa::x64::abi::create_reg_env_systemv (abi.rs:1282) ==108775== by 0x632C42: <cranelift_codegen::isa::x64::abi::X64ABIMachineSpec as cranelift_codegen::machinst::abi::ABIMachineSpec>::get_machine_env::{{closure}} (abi.rs:802) ==108775== by 0x5C8499: std::sync::once_lock::OnceLock<T>::get_or_init::{{closure}} (once_lock.rs:250) ==108775== by 0x5C8352: std::sync::once_lock::OnceLock<T>::initialize::{{closure}} (once_lock.rs:376) ==108775== by 0x623482: std::sync::once::Once::call_once_force::{{closure}} (once.rs:208) ==108775== by 0x5271C9: std::sys_common::once::futex::Once::call (futex.rs:124) ==108775== by 0x6232EF: std::sync::once::Once::call_once_force (once.rs:208) ==108775== by 0x5C81BC: std::sync::once_lock::OnceLock<T>::initialize (once_lock.rs:375) ==108775== by 0x5C8682: std::sync::once_lock::OnceLock<T>::get_or_try_init (once_lock.rs:298) ==108775== ==108775== 8 bytes in 1 blocks are still reachable in loss record 3 of 7 ==108775== at 0x484DCD3: realloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so) ==108775== by 0x789533: realloc (alloc.rs:136) ==108775== by 0x789533: alloc::alloc::Global::grow_impl (alloc.rs:213) ==108775== by 0x789A0F: <alloc::alloc::Global as core::alloc::Allocator>::grow (alloc.rs:266) ==108775== by 0x780FE9: alloc::raw_vec::finish_grow (raw_vec.rs:518) ==108775== by 0x2C89E8: 
alloc::raw_vec::RawVec<T,A>::grow_amortized (raw_vec.rs:433) ==108775== by 0x2D2568: alloc::raw_vec::RawVec<T,A>::reserve_for_push (raw_vec.rs:318) ==108775== by 0x28E4A1: alloc::vec::Vec<T,A>::push (mod.rs:1919) ==108775== by 0x634A67: cranelift_codegen::isa::x64::abi::create_reg_env_systemv (abi.rs:1301) ==108775== by 0x632C42: <cranelift_codegen::isa::x64::abi::X64ABIMachineSpec as cranelift_codegen::machinst::abi::ABIMachineSpec>::get_machine_env::{{closure}} (abi.rs:802) ==108775== by 0x5C8499: std::sync::once_lock::OnceLock<T>::get_or_init::{{closure}} (once_lock.rs:250) ==108775== by 0x5C8352: std::sync::once_lock::OnceLock<T>::initialize::{{closure}} (once_lock.rs:376) ==108775== by 0x623482: std::sync::once::Once::call_once_force::{{closure}} (once.rs:208) ==108775== ==108775== 9 bytes in 1 blocks are still reachable in loss record 4 of 7 ==108775== at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so) ==108775== by 0x5B3287: alloc (alloc.rs:98) ==108775== by 0x5B3287: alloc::alloc::Global::alloc_impl (alloc.rs:181) ==108775== by 0x5B2F7A: alloc::alloc::exchange_malloc (alloc.rs:241) ==108775== by 0x633E29: cranelift_codegen::isa::x64::abi::create_reg_env_systemv (abi.rs:1246) ==108775== by 0x632C42: <cranelift_codegen::isa::x64::abi::X64ABIMachineSpec as cranelift_codegen::machinst::abi::ABIMachineSpec>::get_machine_env::{{closure}} (abi.rs:802) ==108775== by 0x5C8499: std::sync::once_lock::OnceLock<T>::get_or_init::{{closure}} (once_lock.rs:250) ==108775== by 0x5C8352: std::sync::once_lock::OnceLock<T>::initialize::{{closure}} (once_lock.rs:376) ==108775== by 0x623482: std::sync::once::Once::call_once_force::{{closure}} (once.rs:208) ==108775== by 0x5271C9: std::sys_common::once::futex::Once::call (futex.rs:124) ==108775== by 0x6232EF: std::sync::once::Once::call_once_force (once.rs:208) ==108775== by 0x5C81BC: std::sync::once_lock::OnceLock<T>::initialize (once_lock.rs:375) ==108775== by 0x5C8682: 
std::sync::once_lock::OnceLock<T>::get_or_try_init (once_lock.rs:298) ==108775== ==108775== 16 bytes in 1 blocks are still reachable in loss record 5 of 7 ==108775== at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so) ==108775== by 0x1C6623: test::test_main (alloc.rs:98) ==108775== by 0x1C762C: test::test_main_static (lib.rs:162) ==108775== by 0x191F82: experiment::main (lib.rs:1) ==108775== by 0x18F71A: core::ops::function::FnOnce::call_once (function.rs:250) ==108775== by 0x18FFDD: std::sys_common::backtrace::__rust_begin_short_backtrace (backtrace.rs:155) ==108775== by 0x18FB00: std::rt::lang_start::{{closure}} (rt.rs:166) ==108775== by 0x7A8C20: std::rt::lang_start_internal (function.rs:284) ==108775== by 0x18FAD9: std::rt::lang_start (rt.rs:165) ==108775== by 0x191FAD: main (in /home/meijie/Work/query-compile-prototype/target/debug/deps/experiment-56e243a5137b130b) ==108775== ==108775== 64 bytes in 1 blocks are still reachable in loss record 6 of 7 ==108775== at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so) ==108775== by 0x5B3287: alloc (alloc.rs:98) ==108775== by 0x5B3287: alloc::alloc::Global::alloc_impl (alloc.rs:181) ==108775== by 0x5B2F7A: alloc::alloc::exchange_malloc (alloc.rs:241) ==108775== by 0x568527: ahash::random_state::get_fixed_seeds::{{closure}} (boxed.rs:218) ==108775== by 0x69B689: once_cell::race::once_box::OnceBox<T> [message truncated]
meijies edited a comment on issue #7976:
valgrind --tool=memcheck --leak-check=full ./target/debug/deps/experiment-56e243a5137b130b ==111584== Memcheck, a memory error detector ==111584== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al. ==111584== Using Valgrind-3.18.1 and LibVEX; rerun with -h for copyright info ==111584== Command: ./target/debug/deps/experiment-56e243a5137b130b ==111584== running 2 tests test tests::it_works ... ok ==111584== Thread 2 expr::tests::te: ==111584== Invalid write of size 8 ==111584== at 0x4BE601F: ??? ==111584== by 0x191BF6: experiment::expr::jit_expr_on_array_v3 (expr.rs:314) ==111584== by 0x192D7B: experiment::expr::tests::test_jit_expr_on_array_v3_64 (expr.rs:393) ==111584== by 0x18F8E6: experiment::expr::tests::test_jit_expr_on_array_v3_64::{{closure}} (expr.rs:384) ==111584== by 0x18F6A5: core::ops::function::FnOnce::call_once (function.rs:250) ==111584== by 0x1CCF6E: test::__rust_begin_short_backtrace (function.rs:250) ==111584== by 0x1CBDC0: test::run_test::{{closure}} (lib.rs:644) ==111584== by 0x1931D5: std::sys_common::backtrace::__rust_begin_short_backtrace (lib.rs:595) ==111584== by 0x198216: core::ops::function::FnOnce::call_once{{vtable.shim}} (mod.rs:529) ==111584== by 0x7B8E54: std::sys::pal::unix::thread::Thread::new::thread_start (boxed.rs:2016) ==111584== by 0x4A09AC2: start_thread (pthread_create.c:442) ==111584== by 0x4A9AA03: clone (clone.S:100) ==111584== Address 0x4be557a is 58 bytes inside a block of size 64 alloc'd ==111584== at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so) ==111584== by 0x5B3257: alloc (alloc.rs:98) ==111584== by 0x5B3257: alloc::alloc::Global::alloc_impl (alloc.rs:181) ==111584== by 0x5B6A68: <alloc::alloc::Global as core::alloc::Allocator>::allocate (alloc.rs:241) ==111584== by 0x2A6CE8: alloc::raw_vec::RawVec<T,A>::allocate_in (raw_vec.rs:199) ==111584== by 0x18FDA5: with_capacity_in<bool, alloc::alloc::Global> (raw_vec.rs:145) ==111584== by 0x18FDA5: with_capacity_in<bool, 
alloc::alloc::Global> (mod.rs:672) ==111584== by 0x18FDA5: alloc::vec::Vec<T>::with_capacity (mod.rs:481) ==111584== by 0x191B33: experiment::expr::jit_expr_on_array_v3 (expr.rs:312) ==111584== by 0x192D7B: experiment::expr::tests::test_jit_expr_on_array_v3_64 (expr.rs:393) ==111584== by 0x18F8E6: experiment::expr::tests::test_jit_expr_on_array_v3_64::{{closure}} (expr.rs:384) ==111584== by 0x18F6A5: core::ops::function::FnOnce::call_once (function.rs:250) ==111584== by 0x1CCF6E: test::__rust_begin_short_backtrace (function.rs:250) ==111584== by 0x1CBDC0: test::run_test::{{closure}} (lib.rs:644) ==111584== by 0x1931D5: std::sys_common::backtrace::__rust_begin_short_backtrace (lib.rs:595) ==111584== test expr::tests::test_jit_expr_on_array_v3_64 ... ok test result: ok. 2 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 13.05s ==111584== ==111584== HEAP SUMMARY: ==111584== in use at exit: 4,209 bytes in 7 blocks ==111584== total heap usage: 301,154 allocs, 301,147 frees, 18,607,330 bytes allocated ==111584== ==111584== Thread 1: ==111584== 4,096 bytes in 1 blocks are definitely lost in loss record 7 of 7 ==111584== at 0x484DE30: memalign (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so) ==111584== by 0x484DF92: posix_memalign (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so) ==111584== by 0x7B3898: __rdl_alloc (alloc.rs:102) ==111584== by 0x22FD89: alloc::alloc::alloc (alloc.rs:98) ==111584== by 0x232500: cranelift_jit::memory::PtrLen::with_size (memory.rs:59) ==111584== by 0x232A79: cranelift_jit::memory::Memory::allocate (memory.rs:170) ==111584== by 0x247203: <cranelift_jit::backend::JITModule as cranelift_module::module::Module>::define_function_with_control_plane (backend.rs:697) ==111584== by 0x1D2DEA: cranelift_module::module::Module::define_function (module.rs:958) ==111584== by 0x1D17A3: core::gen::ctx::CodegenContext::finalize (ctx.rs:31) ==111584== by 0x19165C: experiment::expr::jit_expr_v3 (expr.rs:283) ==111584== 
by 0x192CA1: experiment::expr::tests::test_jit_expr_on_array_v3_64 (expr.rs:391) ==111584== by 0x18F8E6: experiment::expr::tests::test_jit_expr_on_array_v3_64::{{closure}} (expr.rs:384) ==111584== ==111584== LEAK SUMMARY: ==111584== definitely lost: 4,096 bytes in 1 blocks ==111584== indirectly lost: 0 bytes in 0 blocks ==111584== possibly lost: 0 bytes in 0 blocks ==111584== still reachable: 113 bytes in 6 blocks ==111584== suppressed: 0 bytes in 0 blocks ==111584== Reachable blocks (those to which a pointer was found) are not shown. ==111584== To see them, rerun with: --leak-check=full --show-leak-kinds=all ==111584== ==111584== For lists of detected and suppressed errors, rerun with: -s ==111584== ERROR SUMMARY: 1000001 errors from 2 contexts (suppressed: 0 from 0)
meijies edited a comment on issue #7976:
==111797== Memcheck, a memory error detector ==111797== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al. ==111797== Using Valgrind-3.18.1 and LibVEX; rerun with -h for copyright info ==111797== Command: ./target/debug/deps/experiment-56e243a5137b130b ==111797== running 2 tests test tests::it_works ... ok ==111797== Thread 2 expr::tests::te: ==111797== Invalid write of size 8 ==111797== at 0x4BE601F: ??? ==111797== by 0x191BF6: experiment::expr::jit_expr_on_array_v3 (expr.rs:314) ==111797== by 0x192D7B: experiment::expr::tests::test_jit_expr_on_array_v3_64 (expr.rs:393) ==111797== by 0x18F8E6: experiment::expr::tests::test_jit_expr_on_array_v3_64::{{closure}} (expr.rs:384) ==111797== by 0x18F6A5: core::ops::function::FnOnce::call_once (function.rs:250) ==111797== by 0x1CCF6E: test::__rust_begin_short_backtrace (function.rs:250) ==111797== by 0x1CBDC0: test::run_test::{{closure}} (lib.rs:644) ==111797== by 0x1931D5: std::sys_common::backtrace::__rust_begin_short_backtrace (lib.rs:595) ==111797== by 0x198216: core::ops::function::FnOnce::call_once{{vtable.shim}} (mod.rs:529) ==111797== by 0x7B8E54: std::sys::pal::unix::thread::Thread::new::thread_start (boxed.rs:2016) ==111797== by 0x4A09AC2: start_thread (pthread_create.c:442) ==111797== by 0x4A9AA03: clone (clone.S:100) ==111797== Address 0x4be557a is 58 bytes inside a block of size 64 alloc'd ==111797== at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so) ==111797== by 0x5B3257: alloc (alloc.rs:98) ==111797== by 0x5B3257: alloc::alloc::Global::alloc_impl (alloc.rs:181) ==111797== by 0x5B6A68: <alloc::alloc::Global as core::alloc::Allocator>::allocate (alloc.rs:241) ==111797== by 0x2A6CE8: alloc::raw_vec::RawVec<T,A>::allocate_in (raw_vec.rs:199) ==111797== by 0x18FDA5: with_capacity_in<bool, alloc::alloc::Global> (raw_vec.rs:145) ==111797== by 0x18FDA5: with_capacity_in<bool, alloc::alloc::Global> (mod.rs:672) ==111797== by 0x18FDA5: alloc::vec::Vec<T>::with_capacity 
(mod.rs:481) ==111797== by 0x191B33: experiment::expr::jit_expr_on_array_v3 (expr.rs:312) ==111797== by 0x192D7B: experiment::expr::tests::test_jit_expr_on_array_v3_64 (expr.rs:393) ==111797== by 0x18F8E6: experiment::expr::tests::test_jit_expr_on_array_v3_64::{{closure}} (expr.rs:384) ==111797== by 0x18F6A5: core::ops::function::FnOnce::call_once (function.rs:250) ==111797== by 0x1CCF6E: test::__rust_begin_short_backtrace (function.rs:250) ==111797== by 0x1CBDC0: test::run_test::{{closure}} (lib.rs:644) ==111797== by 0x1931D5: std::sys_common::backtrace::__rust_begin_short_backtrace (lib.rs:595) ==111797== test expr::tests::test_jit_expr_on_array_v3_64 ... ok test result: ok. 2 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 13.16s ==111797== ==111797== HEAP SUMMARY: ==111797== in use at exit: 4,209 bytes in 7 blocks ==111797== total heap usage: 301,154 allocs, 301,147 frees, 18,607,330 bytes allocated ==111797== ==111797== Thread 1: ==111797== 8 bytes in 1 blocks are still reachable in loss record 1 of 7 ==111797== at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so) ==111797== by 0x5B3257: alloc (alloc.rs:98) ==111797== by 0x5B3257: alloc::alloc::Global::alloc_impl (alloc.rs:181) ==111797== by 0x5B2F4A: alloc::alloc::exchange_malloc (alloc.rs:241) ==111797== by 0x634032: cranelift_codegen::isa::x64::abi::create_reg_env_systemv (abi.rs:1259) ==111797== by 0x632C12: <cranelift_codegen::isa::x64::abi::X64ABIMachineSpec as cranelift_codegen::machinst::abi::ABIMachineSpec>::get_machine_env::{{closure}} (abi.rs:802) ==111797== by 0x5C8469: std::sync::once_lock::OnceLock<T>::get_or_init::{{closure}} (once_lock.rs:250) ==111797== by 0x5C8322: std::sync::once_lock::OnceLock<T>::initialize::{{closure}} (once_lock.rs:376) ==111797== by 0x623452: std::sync::once::Once::call_once_force::{{closure}} (once.rs:208) ==111797== by 0x527199: std::sys_common::once::futex::Once::call (futex.rs:124) ==111797== by 0x6232BF: 
std::sync::once::Once::call_once_force (once.rs:208) ==111797== by 0x5C818C: std::sync::once_lock::OnceLock<T>::initialize (once_lock.rs:375) ==111797== by 0x5C8652: std::sync::once_lock::OnceLock<T>::get_or_try_init (once_lock.rs:298) ==111797== ==111797== 8 bytes in 1 blocks are still reachable in loss record 2 of 7 ==111797== at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so) ==111797== by 0x5B3257: alloc (alloc.rs:98) ==111797== by 0x5B3257: alloc::alloc::Global::alloc_impl (alloc.rs:181) ==111797== by 0x5B2F4A: alloc::alloc::exchange_malloc (alloc.rs:241) ==111797== by 0x6344E7: cranelift_codegen::isa::x64::abi::create_reg_env_systemv (abi.rs:1282) ==111797== by 0x632C12: <cranelift_codegen::isa::x64::abi::X64ABIMachineSpec as cranelift_codegen::machinst::abi::ABIMachineSpec>::get_machine_env::{{closure}} (abi.rs:802) ==111797== by 0x5C8469: std::sync::once_lock::OnceLock<T>::get_or_init::{{closure}} (once_lock.rs:250) ==111797== by 0x5C8322: std::sync::once_lock::OnceLock<T>::initialize::{{closure}} (once_lock.rs:376) ==111797== by 0x623452: std::sync::once::Once::call_once_force::{{closure}} (once.rs:208) ==111797== by 0x527199: std::sys_common::once::futex::Once::call (futex.rs:124) ==111797== by 0x6232BF: std::sync::once::Once::call_once_force (once.rs:208) ==111797== by 0x5C818C: std::sync::once_lock::OnceLock<T>::initialize (once_lock.rs:375) ==111797== by 0x5C8652: std::sync::once_lock::OnceLock<T>::get_or_try_init (once_lock.rs:298) ==111797== ==111797== 8 bytes in 1 blocks are still reachable in loss record 3 of 7 ==111797== at 0x484DCD3: realloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so) ==111797== by 0x789503: realloc (alloc.rs:136) ==111797== by 0x789503: alloc::alloc::Global::grow_impl (alloc.rs:213) ==111797== by 0x7899DF: <alloc::alloc::Global as core::alloc::Allocator>::grow (alloc.rs:266) ==111797== by 0x780FB9: alloc::raw_vec::finish_grow (raw_vec.rs:518) ==111797== by 0x2C89B8: 
alloc::raw_vec::RawVec<T,A>::grow_amortized (raw_vec.rs:433) ==111797== by 0x2D2538: alloc::raw_vec::RawVec<T,A>::reserve_for_push (raw_vec.rs:318) ==111797== by 0x28E471: alloc::vec::Vec<T,A>::push (mod.rs:1919) ==111797== by 0x634A37: cranelift_codegen::isa::x64::abi::create_reg_env_systemv (abi.rs:1301) ==111797== by 0x632C12: <cranelift_codegen::isa::x64::abi::X64ABIMachineSpec as cranelift_codegen::machinst::abi::ABIMachineSpec>::get_machine_env::{{closure}} (abi.rs:802) ==111797== by 0x5C8469: std::sync::once_lock::OnceLock<T>::get_or_init::{{closure}} (once_lock.rs:250) ==111797== by 0x5C8322: std::sync::once_lock::OnceLock<T>::initialize::{{closure}} (once_lock.rs:376) ==111797== by 0x623452: std::sync::once::Once::call_once_force::{{closure}} (once.rs:208) ==111797== ==111797== 9 bytes in 1 blocks are still reachable in loss record 4 of 7 ==111797== at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so) ==111797== by 0x5B3257: alloc (alloc.rs:98) ==111797== by 0x5B3257: alloc::alloc::Global::alloc_impl (alloc.rs:181) ==111797== by 0x5B2F4A: alloc::alloc::exchange_malloc (alloc.rs:241) ==111797== by 0x633DF9: cranelift_codegen::isa::x64::abi::create_reg_env_systemv (abi.rs:1246) ==111797== by 0x632C12: <cranelift_codegen::isa::x64::abi::X64ABIMachineSpec as cranelift_codegen::machinst::abi::ABIMachineSpec>::get_machine_env::{{closure}} (abi.rs:802) ==111797== by 0x5C8469: std::sync::once_lock::OnceLock<T>::get_or_init::{{closure}} (once_lock.rs:250) ==111797== by 0x5C8322: std::sync::once_lock::OnceLock<T>::initialize::{{closure}} (once_lock.rs:376) ==111797== by 0x623452: std::sync::once::Once::call_once_force::{{closure}} (once.rs:208) ==111797== by 0x527199: std::sys_common::once::futex::Once::call (futex.rs:124) ==111797== by 0x6232BF: std::sync::once::Once::call_once_force (once.rs:208) ==111797== by 0x5C818C: std::sync::once_lock::OnceLock<T>::initialize (once_lock.rs:375) ==111797== by 0x5C8652: 
std::sync::once_lock::OnceLock<T>::get_or_try_init (once_lock.rs:298) ==111797== ==111797== 16 bytes in 1 blocks are still reachable in loss record 5 of 7 ==111797== at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so) ==111797== by 0x1C65F3: test::test_main (alloc.rs:98) ==111797== by 0x1C75FC: test::test_main_static (lib.rs:162) ==111797== by 0x191F52: experiment::main (lib.rs:1) ==111797== by 0x18F71A: core::ops::function::FnOnce::call_once (function.rs:250) ==111797== by 0x18FFDD: std::sys_common::backtrace::__rust_begin_short_backtrace (backtrace.rs:155) ==111797== by 0x18FB00: std::rt::lang_start::{{closure}} (rt.rs:166) ==111797== by 0x7A8BF0: std::rt::lang_start_internal (function.rs:284) ==111797== by 0x18FAD9: std::rt::lang_start (rt.rs:165) ==111797== by 0x191F7D: main (in /home/meijie/Work/query-compile-prototype/target/debug/deps/experiment-56e243a5137b130b) ==111797== ==111797== 64 bytes in 1 blocks are still reachable in loss record 6 of 7 ==111797== at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so) ==111797== by 0x5B3257: alloc (alloc.rs:98) ==111797== by 0x5B3257: alloc::alloc::Global::alloc_impl (alloc.rs:181) ==111797== by 0x5B2F4A: alloc::alloc::exchange_malloc (alloc.rs:241) ==111797== by 0x5684F7: ahash::random_state::get_fixed_seeds::{{closure}} (boxed.rs:218) ==111797== by 0x69B659: once_cell::race::once_box::Onc [message truncated]
meijies edited a comment on issue #7976:
➜ query-compile-prototype git:(main) ✗ valgrind --tool=memcheck --leak-check=full --show-leak-kinds=all ./target/debug/deps/experiment-56e243a5137b130b ==112967== Memcheck, a memory error detector ==112967== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al. ==112967== Using Valgrind-3.18.1 and LibVEX; rerun with -h for copyright info ==112967== Command: ./target/debug/deps/experiment-56e243a5137b130b ==112967== running 2 tests test tests::it_works ... ok ==112967== Thread 2 expr::tests::te: ==112967== Invalid write of size 8 ==112967== at 0x4BE601F: ??? ==112967== by 0x191C09: **experiment::expr::jit_expr_on_array_v3 (expr.rs:314)** ==112967== by 0x192D9B: experiment::expr::tests::test_jit_expr_on_array_v3_64 (expr.rs:394) ==112967== by 0x18F8E6: experiment::expr::tests::test_jit_expr_on_array_v3_64::{{closure}} (expr.rs:385) ==112967== by 0x18F6A5: core::ops::function::FnOnce::call_once (function.rs:250) ==112967== by 0x1CCF8E: test::__rust_begin_short_backtrace (function.rs:250) ==112967== by 0x1CBDE0: test::run_test::{{closure}} (lib.rs:644) ==112967== by 0x1931F5: std::sys_common::backtrace::__rust_begin_short_backtrace (lib.rs:595) ==112967== by 0x198236: core::ops::function::FnOnce::call_once{{vtable.shim}} (mod.rs:529) ==112967== by 0x7B8E74: std::sys::pal::unix::thread::Thread::new::thread_start (boxed.rs:2016) ==112967== by 0x4A09AC2: start_thread (pthread_create.c:442) ==112967== by 0x4A9AA03: clone (clone.S:100) ==112967== Address 0x4be557a is 58 bytes inside a block of size 64 alloc'd ==112967== at 0x4848899: malloc (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so) ==112967== by 0x5B3277: alloc (alloc.rs:98) ==112967== by 0x5B3277: alloc::alloc::Global::alloc_impl (alloc.rs:181) ==112967== by 0x5B6A88: <alloc::alloc::Global as core::alloc::Allocator>::allocate (alloc.rs:241) ==112967== by 0x2A6D08: alloc::raw_vec::RawVec<T,A>::allocate_in (raw_vec.rs:199) ==112967== by 0x18FDA5: with_capacity_in<bool, alloc::alloc::Global> 
(raw_vec.rs:145) ==112967== by 0x18FDA5: with_capacity_in<bool, alloc::alloc::Global> (mod.rs:672) ==112967== by 0x18FDA5: alloc::vec::Vec<T>::with_capacity (mod.rs:481) ==112967== by 0x191B4B: experiment::expr::jit_expr_on_array_v3 (expr.rs:312) ==112967== by 0x192D9B: experiment::expr::tests::test_jit_expr_on_array_v3_64 (expr.rs:394) ==112967== by 0x18F8E6: experiment::expr::tests::test_jit_expr_on_array_v3_64::{{closure}} (expr.rs:385) ==112967== by 0x18F6A5: core::ops::function::FnOnce::call_once (function.rs:250) ==112967== by 0x1CCF8E: test::__rust_begin_short_backtrace (function.rs:250) ==112967== by 0x1CBDE0: test::run_test::{{closure}} (lib.rs:644) ==112967== by 0x1931F5: std::sys_common::backtrace::__rust_begin_short_backtrace (lib.rs:595) ==112967== test expr::tests::test_jit_expr_on_array_v3_64 ... ok test result: ok. 2 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 6.78s
![image](https://github.com/bytecodealliance/wasmtime/assets/13784260/d84f7348-70f8-43de-afb8-5ffcc0040838)
Last updated: Jan 24 2025 at 00:11 UTC