Hi, everyone.
After comparing the performance, I found that the JIT-generated code for the simple expression (3.0 + 4.0) / 3.0 < 4.0
is 10x slower than the hard-coded version. So my question is why.
The create_jit_op
function generates a function that is equivalent to arrow_native_calc
.
/// Adds `a` and `b` with `add_wrapping`, divides the sum by `c` with
/// `div_wrapping`, and returns whether that quotient is less than `d`.
fn arrow_native_calc<T: ArrowNativeTypeOp>(a: T, b: T, c: T, d: T) -> bool {
    let sum = a.add_wrapping(b);
    let quotient = sum.div_wrapping(c);
    quotient.lt(&d)
}
/// Evaluates the fixed expression `(3.0 + 4.0) / 3.0 < 4.0` with plain `f64`
/// arithmetic.
fn hardcode_calc() -> bool {
    let sum = 3.0_f64 + 4.0_f64;
    let quotient = sum / 3.0_f64;
    quotient < 4.0_f64
}
// Registers three Criterion benchmarks that evaluate the same expression:
// the generic `arrow_native_calc`, the plain `hardcode_calc`, and the
// function produced by `create_jit_op`.
fn add_benchmark(c: &mut Criterion) {
// The JIT function is built once here, outside of the timed closures.
let op = create_jit_op();
c.bench_function("arrow_native_calc", |b| {
// Only the result is passed through `black_box`; the arguments are literals.
b.iter(|| criterion::black_box(arrow_native_calc(3.0_f64, 4.0_f64, 3.0_f64, 4.0_f64)))
});
c.bench_function("hardcode_calc", |b| {
b.iter(|| criterion::black_box(hardcode_calc()))
});
c.bench_function("jit_calc", |b| {
// `op` takes only two operands; the 3.0 divisor and 4.0 bound are baked
// into the generated code by `create_jit_op`.
b.iter(|| criterion::black_box(op(3.0_f64, 4.0_f64)))
});
}
// Declare the benchmark group `benches` containing `add_benchmark`, and let
// Criterion generate the entry point that runs it.
criterion_group!(benches, add_benchmark);
criterion_main!(benches);
The results show that jit_calc
is 10x slower than arrow_native_calc
and hardcode_calc:
arrow_native_calc time: [543.67 ps 546.12 ps 549.82 ps]
change: [-0.9606% +0.4079% +2.0981%] (p = 0.68 > 0.05)
No change in performance detected.
Found 5 outliers among 100 measurements (5.00%)
1 (1.00%) low mild
2 (2.00%) high mild
2 (2.00%) high severe
hardcode_calc time: [541.32 ps 542.14 ps 542.98 ps]
change: [-1.1323% -0.7533% -0.4321%] (p = 0.00 < 0.05)
Change within noise threshold.
Found 2 outliers among 100 measurements (2.00%)
1 (1.00%) high mild
1 (1.00%) high severe
jit_calc time: [8.4618 ns 8.5140 ns 8.5938 ns]
change: [-98.873% -98.850% -98.831%] (p = 0.00 < 0.05)
Performance has improved.
Found 14 outliers among 100 measurements (14.00%)
4 (4.00%) high mild
10 (10.00%) high severe
create_jit_op
/// Builds and JIT-compiles a function equivalent to `(a + b) / 3.0 < 4.0`.
///
/// The generated function takes two `F64` parameters and returns an `I8`.
/// It calls the imported add-wrapping and div-wrapping helpers by value, then
/// spills both comparison operands to a stack slot because the `lt` helper is
/// called with the addresses of its operands.
///
/// Returns the finalized code reinterpreted as `fn(f64, f64) -> bool`.
pub fn create_jit_op() -> fn(f64, f64) -> bool {
    let mut ctx = CodegenContext::default();
    // call_conv is CallConv::Fast,
    let mut func_ctx = ctx.create_func_gen_ctx(
        "op",
        vec![AbiParam::new(types::F64), AbiParam::new(types::F64)],
        vec![AbiParam::new(types::I8)],
    );
    let entry_block = func_ctx.builder.create_block();
    func_ctx.builder.switch_to_block(entry_block);
    func_ctx
        .builder
        .append_block_params_for_function_params(entry_block);

    // call f64 add wrapping on tuple, and call_conv: CallConv::Fast,
    let lhs = func_ctx.builder.block_params(entry_block)[0];
    let rhs = func_ctx.builder.block_params(entry_block)[1];
    let res = func_ctx.call_f64_add_wrapping(lhs, rhs);

    // call f64 div wrapping on res and f64 const and call_conv: CallConv::Fast,
    let rhs = func_ctx.builder.ins().f64const(3.0);
    let res = func_ctx.call_f64_div_wrapping(res, rhs);

    // call f64 lt on &res and &f64 const and call_conv: CallConv::Fast,
    //
    // Stack slot sizes and `stack_addr` offsets are expressed in bytes. The
    // slot has to hold two f64 values (2 * 8 bytes), and the second operand
    // must start 8 bytes in so the two stores do not overlap.
    let slot = func_ctx
        .builder
        .func
        .create_sized_stack_slot(StackSlotData::new(StackSlotKind::ExplicitSlot, 16));
    let res_ref = func_ctx.builder.ins().stack_addr(func_ctx.ptype, slot, 0);
    func_ctx
        .builder
        .ins()
        .store(MemFlags::new(), res, res_ref, 0);
    let rhs = func_ctx.builder.ins().f64const(4.0);
    let rhs_ref = func_ctx.builder.ins().stack_addr(func_ctx.ptype, slot, 8);
    func_ctx
        .builder
        .ins()
        .store(MemFlags::new(), rhs, rhs_ref, 0);
    let res = func_ctx.call_f64_lt(res_ref, rhs_ref);

    let func_id = func_ctx.finalize(res);
    let code = ctx.finalize(func_id);
    // SAFETY: `code` is the finalized entry point of the function declared
    // above with two F64 parameters and a single I8 return value.
    // NOTE(review): the comments above say the function uses CallConv::Fast
    // while the target type is a plain Rust `fn` — confirm the calling
    // conventions actually agree on the target platform.
    unsafe { mem::transmute::<_, fn(f64, f64) -> bool>(code) }
}
/// Emits a call to the `Float64AddWrapping` native op with `lhs` and `rhs`
/// as arguments and returns the call's result value.
pub fn call_f64_add_wrapping(&mut self, lhs: Value, rhs: Value) -> Value {
    self.call_binary(NativeOpCall::Float64AddWrapping, lhs, rhs)
}
/// Emits a call to the imported native function described by `op`, passing
/// `lhs` and `rhs`, and returns the call's single result value.
///
/// # Panics
///
/// Panics if declaring the function in the module fails, or if the call does
/// not produce exactly one result.
fn call_binary(&mut self, op: NativeOpCall, lhs: Value, rhs: Value) -> Value {
    let signature = op.signature(self.ptype);
    // FIXME (original author): this should not generate a new func id on
    // every call.
    let callee_id = self
        .module
        .declare_function(op.name(), Linkage::Import, &signature)
        .unwrap();
    // Make the module-level declaration referenceable from the function
    // that is currently being built.
    let callee = self.module.declare_func_in_func(callee_id, self.builder.func);
    let arguments = [lhs, rhs];
    let call_inst = self.builder.ins().call(callee, &arguments);
    let results = self.builder.inst_results(call_inst);
    assert_eq!(results.len(), 1);
    results[0]
}
/// Returns the Cranelift signature used to call this native op.
///
/// The add and div ops take two `F64` values and return an `F64`; the `lt`
/// op takes two values of `pointer_type` and returns an `I8`. Every
/// signature uses `CallConv::Fast`.
pub(crate) fn signature(&self, pointer_type: Type) -> Signature {
    use NativeOpCall::*;
    // Both operands of an op share one type, so only that type and the
    // return type vary between ops.
    let (operand_type, return_type) = match self {
        Float64AddWrapping | Float64DivWrapping => (types::F64, types::F64),
        Float64Lt => (pointer_type, types::I8),
    };
    Signature {
        params: vec![AbiParam::new(operand_type), AbiParam::new(operand_type)],
        returns: vec![AbiParam::new(return_type)],
        call_conv: CallConv::Fast,
    }
}
pub (crate) fn addr(&self) -> *const u8 {
use NativeOpCall::*;
match self {
Float64AddWrapping => f64::add_wrapping as *const u8,
Float64DivWrapping => f64::div_wrapping as *const u8,
Float64Lt => f64::lt as *const u8,
}
}
/// Builds the Cranelift settings used for code generation: colocated
/// libcalls and PIC disabled, LLVM ABI extensions enabled, and `opt_level`
/// set to `speed`.
///
/// # Panics
///
/// Panics if a setting is rejected by the builder, or if the resulting flags
/// still report colocated libcalls or PIC as enabled.
fn build_flags() -> settings::Flags {
    let mut builder = settings::builder();
    for (name, value) in [
        ("use_colocated_libcalls", "false"),
        ("is_pic", "false"),
        ("enable_llvm_abi_extensions", "true"),
        ("opt_level", "speed"),
    ] {
        builder.set(name, value).unwrap();
    }
    let flags = settings::Flags::new(builder);
    assert!(!flags.use_colocated_libcalls());
    assert!(!flags.is_pic());
    flags
}
LLVM will optimize hardcode_calc and the arrow_native_calc call to constant values. You need to black_box the constants in the calculation to prevent this. I'm not sure if Cranelift optimizes float operations with known operands to a constant.
@bjorn3 It is still 5x slower.
Running benches/scalar_calc.rs (target/release/deps/scalar_calc-a5b82b9d208b7531)
arrow_native_calc time: [1.4768 ns 1.4820 ns 1.4890 ns]
change: [-0.6881% -0.2297% +0.1460%] (p = 0.29 > 0.05)
No change in performance detected.
Found 9 outliers among 100 measurements (9.00%)
1 (1.00%) low severe
1 (1.00%) low mild
2 (2.00%) high mild
5 (5.00%) high severe
hardcode_calc time: [1.8864 ns 1.8950 ns 1.9124 ns]
change: [-0.6743% -0.0642% +0.6151%] (p = 0.86 > 0.05)
No change in performance detected.
Found 14 outliers among 100 measurements (14.00%)
1 (1.00%) low severe
6 (6.00%) high mild
7 (7.00%) high severe
jit_calc time: [7.4061 ns 7.4375 ns 7.4785 ns]
change: [-99.119% -99.081% -99.043%] (p = 0.00 < 0.05)
Performance has improved.
Found 12 outliers among 100 measurements (12.00%)
2 (2.00%) high mild
10 (10.00%) high severe
/// Returns whether `a.add_wrapping(b).div_wrapping(c)` is less than `d`.
fn arrow_native_calc<T: ArrowNativeTypeOp>(a: T, b: T, c: T, d: T) -> bool {
    let total = a.add_wrapping(b);
    let ratio = total.div_wrapping(c);
    ratio.lt(&d)
}
/// Returns whether `(a + b) / c` is less than `d`, using plain `f64`
/// arithmetic.
fn hardcode_calc(a: f64, b: f64, c: f64, d: f64) -> bool {
    let quotient = (a + b) / c;
    quotient < d
}
// Registers three Criterion benchmarks that evaluate the same expression:
// the generic `arrow_native_calc`, the plain `hardcode_calc`, and the
// function produced by `create_jit_op`. Every operand is passed through
// `black_box` before being handed to the function under test.
fn add_benchmark(c: &mut Criterion) {
// The JIT function is built once here, outside of the timed closures.
let op = create_jit_op();
c.bench_function("arrow_native_calc", |b| {
b.iter(|| {
arrow_native_calc(
black_box(3.0_f64),
black_box(4.0_f64),
black_box(3.0_f64),
black_box(4.0_f64),
)
})
});
c.bench_function("hardcode_calc", |b| {
b.iter(|| {
// Unlike the other two benchmarks, the result is also wrapped in
// `criterion::black_box` here.
criterion::black_box(hardcode_calc(
black_box(3.0_f64),
black_box(4.0_f64),
black_box(3.0_f64),
black_box(4.0_f64),
))
})
});
c.bench_function("jit_calc", |b| {
// `op` takes only two operands; the 3.0 divisor and 4.0 bound are baked
// into the generated code by `create_jit_op`.
b.iter(|| op(black_box(3.0_f64), black_box(4.0_f64)))
});
}
// Declare the benchmark group `benches` containing `add_benchmark`, and let
// Criterion generate the entry point that runs it.
criterion_group!(benches, add_benchmark);
criterion_main!(benches);
Could you try replacing hardcode_calc(...)
with criterion::black_box(hardcode_calc as fn(f64, f64, f64, f64) -> f64)(...)
to avoid inlining the hardcode_calc
function itself?
Also it looks like you are calling an external function for every float add. The rust version would inline this too.
bjorn3 said:
Could you try replacing
hardcode_calc(...)
with criterion::black_box(hardcode_calc as fn(f64, f64, f64, f64) -> f64)(...)
to avoid inlining the hardcode_calc
function itself?
Yes, add_wrapping, div_wrapping, and lt are all inline functions, but it seems the JIT-generated function does not inline them.
Running benches/scalar_calc.rs (target/release/deps/scalar_calc-a5b82b9d208b7531)
arrow_native_calc time: [1.4803 ns 1.4847 ns 1.4919 ns]
change: [+0.2458% +0.9349% +2.0967%] (p = 0.03 < 0.05)
Change within noise threshold.
Found 6 outliers among 100 measurements (6.00%)
2 (2.00%) low severe
1 (1.00%) high mild
3 (3.00%) high severe
hardcode_calc time: [2.0588 ns 2.0776 ns 2.1101 ns]
change: [+38.601% +39.948% +41.544%] (p = 0.00 < 0.05)
Performance has regressed.
Found 17 outliers among 100 measurements (17.00%)
1 (1.00%) low severe
3 (3.00%) low mild
4 (4.00%) high mild
9 (9.00%) high severe
jit_calc time: [5.4364 ns 5.4583 ns 5.4957 ns]
change: [-99.014% -99.003% -98.994%] (p = 0.00 < 0.05)
Performance has improved.
Found 2 outliers among 100 measurements (2.00%)
2 (2.00%) high severe
create_jit_op
function from test debug." pushq %rbp
unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
movq %rsp, %rbp
unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 16 }
subq %rsp, $16, %rsp
movq %r13, 0(%rsp)
unwind SaveReg { clobber_offset: 0, reg: p13i }
block0:
movq %rdi, %r13
load_ext_name userextname0+0, %rax
call *%rax
movabsq $4613937818241073152, %rdi
vmovq %rdi, %xmm1
load_ext_name userextname1+0, %rax
call *%rax
load_ext_name userextname2+0, %rax
movq %r13, %rdi
call *%rax
movq 0(%rsp), %r13
addq %rsp, $16, %rsp
movq %rbp, %rsp
popq %rbp
ret
"
but it seems the JIT-generated function does not inline them.
Correct. There is no way to inline between LLVM and Cranelift generated functions, just like there is no way to inline between LLVM and GCC generated functions.
Jeremy Mei(梅杰) has marked this topic as resolved.
Last updated: Jan 24 2025 at 00:11 UTC