Stream: cranelift

Topic: ✔ jit-generated simple expression 10x slower than hardcoded


Jeremy Mei(梅杰) (Feb 18 2024 at 07:51):

Hi, everyone.

summary

After comparing performance, I found that a JIT-generated function for the simple expression (3.0 + 4.0) / 3.0 < 4.0 is about 10x slower than the hardcoded version. So my questions are:

  1. Is this expected?
  2. If not, how can I improve the performance?

the bench

The create_jit_op function generates a function equivalent to arrow_native_calc.

fn arrow_native_calc<T: ArrowNativeTypeOp>(a: T, b: T, c: T, d: T) -> bool {
    return a.add_wrapping(b).div_wrapping(c).lt(&d);
}

fn hardcode_calc() -> bool {
    (3.0_f64 + 4.0_f64) / 3.0_f64 < 4.0_f64
}

fn add_benchmark(c: &mut Criterion) {
    let op = create_jit_op();
    c.bench_function("arrow_native_calc", |b| {
        b.iter(|| criterion::black_box(arrow_native_calc(3.0_f64, 4.0_f64, 3.0_f64, 4.0_f64)))
    });
    c.bench_function("hardcode_calc", |b| {
        b.iter(|| criterion::black_box(hardcode_calc()))
    });
    c.bench_function("jit_calc", |b| {
        b.iter(|| criterion::black_box(op(3.0_f64, 4.0_f64)))
    });
}

criterion_group!(benches, add_benchmark);
criterion_main!(benches);

bench result

The results show jit_calc is over 10x slower than arrow_native_calc and hardcode_calc:

arrow_native_calc       time:   [543.67 ps 546.12 ps 549.82 ps]
                        change: [-0.9606% +0.4079% +2.0981%] (p = 0.68 > 0.05)
                        No change in performance detected.
Found 5 outliers among 100 measurements (5.00%)
  1 (1.00%) low mild
  2 (2.00%) high mild
  2 (2.00%) high severe

hardcode_calc           time:   [541.32 ps 542.14 ps 542.98 ps]
                        change: [-1.1323% -0.7533% -0.4321%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 2 outliers among 100 measurements (2.00%)
  1 (1.00%) high mild
  1 (1.00%) high severe

jit_calc                time:   [8.4618 ns 8.5140 ns 8.5938 ns]
                        change: [-98.873% -98.850% -98.831%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 14 outliers among 100 measurements (14.00%)
  4 (4.00%) high mild
  10 (10.00%) high severe

the detail of create_jit_op

  1. How the function is generated:
pub fn create_jit_op() -> fn(f64, f64) -> bool {
    let mut ctx = CodegenContext::default();
    // call_conv is CallConv::Fast,
    let mut func_ctx = ctx.create_func_gen_ctx(
        "op",
        vec![AbiParam::new(types::F64), AbiParam::new(types::F64)],
        vec![AbiParam::new(types::I8)],
    );

    let entry_block = func_ctx.builder.create_block();
    func_ctx.builder.switch_to_block(entry_block);
    func_ctx
        .builder
        .append_block_params_for_function_params(entry_block);

    // call the imported f64 add_wrapping helper (declared with CallConv::Fast)
    let lhs = func_ctx.builder.block_params(entry_block)[0];
    let rhs = func_ctx.builder.block_params(entry_block)[1];
    let res = func_ctx.call_f64_add_wrapping(lhs, rhs);

    // call the imported f64 div_wrapping helper on res and an f64 constant
    let rhs = func_ctx.builder.ins().f64const(3.0);
    let res = func_ctx.call_f64_div_wrapping(res, rhs);

    // call the imported f64 lt helper, which takes its operands by reference,
    // so spill both values to a stack slot; two f64 values need 16 bytes
    let slot = func_ctx
        .builder
        .func
        .create_sized_stack_slot(StackSlotData::new(StackSlotKind::ExplicitSlot, 16));

    let res_ref = func_ctx.builder.ins().stack_addr(func_ctx.ptype, slot, 0);
    func_ctx
        .builder
        .ins()
        .store(MemFlags::new(), res, res_ref, 0);

    let rhs = func_ctx.builder.ins().f64const(4.0);
    let rhs_ref = func_ctx.builder.ins().stack_addr(func_ctx.ptype, slot, 8);
    func_ctx
        .builder
        .ins()
        .store(MemFlags::new(), rhs, rhs_ref, 0);

    let res = func_ctx.call_f64_lt(res_ref, rhs_ref);
    let func_id = func_ctx.finalize(res);
    let code = ctx.finalize(func_id);
    unsafe { mem::transmute::<_, fn(f64, f64) -> bool>(code) }
}
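
A quick sanity check of the generated function (illustrative only; (3.0 + 4.0) / 3.0 ≈ 2.33, which is less than 4.0, so the result should be true):

let op = create_jit_op();
assert!(op(3.0, 4.0));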
  2. Calling the imported functions:
pub fn call_f64_add_wrapping(&mut self, lhs: Value, rhs: Value) -> Value {
        let op = NativeOpCall::Float64AddWrapping;
        self.call_binary(op, lhs, rhs)
}

fn call_binary(&mut self, op: NativeOpCall, lhs: Value, rhs: Value) -> Value {
        let sig = op.signature(self.ptype);
        // FIXME: avoid declaring the function again on every call; cache the FuncId instead.
        let func_id = self
            .module
            .declare_function(op.name(), Linkage::Import, &sig)
            .unwrap();
        let func = self.module.declare_func_in_func(func_id, self.builder.func);
        let call = self.builder.ins().call(func, &[lhs, rhs]);
        let result = self.builder.inst_results(call);
        assert_eq!(result.len(), 1);
        result[0]
}
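
One possible fix for that FIXME, sketched under the assumption of a hypothetical func_ids: HashMap<&'static str, FuncId> field on the context and an op.name() that returns &'static str (note that cranelift-module already returns the same FuncId when the same name is declared again, so this mainly saves the repeated lookup and signature check):

fn func_id_for(&mut self, op: NativeOpCall) -> FuncId {
        // reuse the cached FuncId if this op was already declared
        if let Some(&id) = self.func_ids.get(op.name()) {
            return id;
        }
        let sig = op.signature(self.ptype);
        let id = self
            .module
            .declare_function(op.name(), Linkage::Import, &sig)
            .unwrap();
        self.func_ids.insert(op.name(), id);
        id
}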

pub(crate) fn signature(&self, pointer_type: Type) -> Signature {
        use NativeOpCall::*;
        match self {
            Float64AddWrapping | Float64DivWrapping => Signature {
                params: vec![AbiParam::new(types::F64), AbiParam::new(types::F64)],
                returns: vec![AbiParam::new(types::F64)],
                call_conv: CallConv::Fast,
            },
            Float64Lt => Signature {
                params: vec![AbiParam::new(pointer_type), AbiParam::new(pointer_type)],
                returns: vec![AbiParam::new(types::I8)],
                call_conv: CallConv::Fast,
            },
        }
    }

pub(crate) fn addr(&self) -> *const u8 {
        use NativeOpCall::*;
        match self {
            Float64AddWrapping => f64::add_wrapping as *const u8,
            Float64DivWrapping => f64::div_wrapping as *const u8,
            Float64Lt => f64::lt as *const u8,
        }
    }
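
For context, these addresses have to be registered with the JIT so the imported declarations resolve at link time. A minimal sketch assuming cranelift-jit's JITBuilder (the isa value comes from the flag setup below; default_libcall_names() is standard cranelift-module plumbing not shown in the thread):

// register each native helper under the name used by declare_function,
// so JITModule can resolve the imports
let mut jit_builder = JITBuilder::with_isa(isa, cranelift_module::default_libcall_names());
for op in [
    NativeOpCall::Float64AddWrapping,
    NativeOpCall::Float64DivWrapping,
    NativeOpCall::Float64Lt,
] {
    jit_builder.symbol(op.name(), op.addr());
}
let module = JITModule::new(jit_builder);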
  3. Flag settings:
fn build_flags() -> settings::Flags {
    let mut flag_builder = settings::builder();
    flag_builder.set("use_colocated_libcalls", "false").unwrap();
    flag_builder.set("is_pic", "false").unwrap();
    flag_builder
        .set("enable_llvm_abi_extensions", "true")
        .unwrap();
    flag_builder.set("opt_level", "speed").unwrap();
    let flags = settings::Flags::new(flag_builder);
    assert!(!flags.use_colocated_libcalls());
    assert!(!flags.is_pic());
    flags
}
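
For completeness, a sketch of how these flags would typically feed into a TargetIsa for the JIT (assuming the cranelift-native crate; this step is not shown in the thread):

// look up the host ISA and apply the flags built above
let isa = cranelift_native::builder()
    .unwrap()
    .finish(build_flags())
    .unwrap();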

bjorn3 (Feb 18 2024 at 10:27):

LLVM will optimize hardcode_calc and the arrow_native_calc call to constant values. You need to black_box the constants in the calculation to prevent this. I'm not sure if Cranelift optimizes float operations with known operands to a constant.

Jeremy Mei(梅杰) (Feb 18 2024 at 10:57):

@bjorn3 It's still ~5x slower.

bench result.

Running benches/scalar_calc.rs (target/release/deps/scalar_calc-a5b82b9d208b7531)
arrow_native_calc       time:   [1.4768 ns 1.4820 ns 1.4890 ns]
                        change: [-0.6881% -0.2297% +0.1460%] (p = 0.29 > 0.05)
                        No change in performance detected.
Found 9 outliers among 100 measurements (9.00%)
  1 (1.00%) low severe
  1 (1.00%) low mild
  2 (2.00%) high mild
  5 (5.00%) high severe

hardcode_calc           time:   [1.8864 ns 1.8950 ns 1.9124 ns]
                        change: [-0.6743% -0.0642% +0.6151%] (p = 0.86 > 0.05)
                        No change in performance detected.
Found 14 outliers among 100 measurements (14.00%)
  1 (1.00%) low severe
  6 (6.00%) high mild
  7 (7.00%) high severe

jit_calc                time:   [7.4061 ns 7.4375 ns 7.4785 ns]
                        change: [-99.119% -99.081% -99.043%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 12 outliers among 100 measurements (12.00%)
  2 (2.00%) high mild
  10 (10.00%) high severe

the bench code.

fn arrow_native_calc<T: ArrowNativeTypeOp>(a: T, b: T, c: T, d: T) -> bool {
    return a.add_wrapping(b).div_wrapping(c).lt(&d);
}

fn hardcode_calc(a: f64, b: f64, c: f64, d: f64) -> bool {
    (a + b) / c < d
}

fn add_benchmark(c: &mut Criterion) {
    let op = create_jit_op();
    c.bench_function("arrow_native_calc", |b| {
        b.iter(|| {
            arrow_native_calc(
                black_box(3.0_f64),
                black_box(4.0_f64),
                black_box(3.0_f64),
                black_box(4.0_f64),
            )
        })
    });
    c.bench_function("hardcode_calc", |b| {
        b.iter(|| {
            criterion::black_box(hardcode_calc(
                black_box(3.0_f64),
                black_box(4.0_f64),
                black_box(3.0_f64),
                black_box(4.0_f64),
            ))
        })
    });
    c.bench_function("jit_calc", |b| {
        b.iter(|| op(black_box(3.0_f64), black_box(4.0_f64)))
    });
}

criterion_group!(benches, add_benchmark);
criterion_main!(benches);

bjorn3 (Feb 18 2024 at 12:23):

Could you try replacing hardcode_calc(...) with criterion::black_box(hardcode_calc as fn(f64, f64, f64, f64) -> bool)(...) to avoid inlining the hardcode_calc function itself?

bjorn3 (Feb 18 2024 at 12:24):

Also, it looks like you are calling an external function for every float operation. The Rust version would inline these too.
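
For reference, the same expression could be emitted as native Cranelift instructions instead of calls to imported helpers, which removes the per-operation call overhead entirely. A minimal sketch reusing the poster's func_ctx (fadd, fdiv, and fcmp are standard Cranelift IR instructions, and for f64 the wrapping variants are plain float ops anyway):

// emit (a + b) / 3.0 < 4.0 inline; with no calls, Cranelift can keep
// everything in registers (FloatCC is cranelift_codegen::ir::condcodes::FloatCC)
let lhs = func_ctx.builder.block_params(entry_block)[0];
let rhs = func_ctx.builder.block_params(entry_block)[1];
let sum = func_ctx.builder.ins().fadd(lhs, rhs);
let c = func_ctx.builder.ins().f64const(3.0);
let quot = func_ctx.builder.ins().fdiv(sum, c);
let d = func_ctx.builder.ins().f64const(4.0);
let res = func_ctx.builder.ins().fcmp(FloatCC::LessThan, quot, d); // returns I8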

Jeremy Mei(梅杰) (Feb 18 2024 at 12:57):

bjorn3 said:

Could you try replacing hardcode_calc(...) with criterion::black_box(hardcode_calc as fn(f64, f64, f64, f64) -> bool)(...) to avoid inlining the hardcode_calc function itself?

Yes, all of those add_wrapping, div_wrapping, and lt functions are inlined on the Rust side, but it seems the JIT-generated function does not inline them.

  1. The new bench result after this change:
    Running benches/scalar_calc.rs (target/release/deps/scalar_calc-a5b82b9d208b7531)
arrow_native_calc       time:   [1.4803 ns 1.4847 ns 1.4919 ns]
                        change: [+0.2458% +0.9349% +2.0967%] (p = 0.03 < 0.05)
                        Change within noise threshold.
Found 6 outliers among 100 measurements (6.00%)
  2 (2.00%) low severe
  1 (1.00%) high mild
  3 (3.00%) high severe

hardcode_calc           time:   [2.0588 ns 2.0776 ns 2.1101 ns]
                        change: [+38.601% +39.948% +41.544%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 17 outliers among 100 measurements (17.00%)
  1 (1.00%) low severe
  3 (3.00%) low mild
  4 (4.00%) high mild
  9 (9.00%) high severe

jit_calc                time:   [5.4364 ns 5.4583 ns 5.4957 ns]
                        change: [-99.014% -99.003% -98.994%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 2 outliers among 100 measurements (2.00%)
  2 (2.00%) high severe
  2. Disassembly of the function generated by create_jit_op, captured from a debug test:
"  pushq   %rbp
  unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
  movq    %rsp, %rbp
  unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 16 }
  subq    %rsp, $16, %rsp
  movq    %r13, 0(%rsp)
  unwind SaveReg { clobber_offset: 0, reg: p13i }
block0:
  movq    %rdi, %r13
  load_ext_name userextname0+0, %rax
  call    *%rax
  movabsq $4613937818241073152, %rdi
  vmovq   %rdi, %xmm1
  load_ext_name userextname1+0, %rax
  call    *%rax
  load_ext_name userextname2+0, %rax
  movq    %r13, %rdi
  call    *%rax
  movq    0(%rsp), %r13
  addq    %rsp, $16, %rsp
  movq    %rbp, %rsp
  popq    %rbp
  ret
"

bjorn3 (Feb 18 2024 at 13:00):

but it seems the JIT-generated function does not inline them.

Correct. There is no way to inline between LLVM and Cranelift generated functions, just like there is no way to inline between LLVM and GCC generated functions.

Notification Bot (Feb 18 2024 at 13:02):

Jeremy Mei(梅杰) has marked this topic as resolved.

