Stream: cranelift

Topic: ✔ jit-generated simple expression 10x slower than hardcoded


Jeremy Mei(梅杰) (Feb 18 2024 at 07:51):

Hi, everyone.

summary

After comparing performance, I found that a JIT-generated function for the simple expression (3.0 + 4.0) / 3.0 < 4.0 is about 10x slower than the hardcoded version. So my questions are:

  1. Is this expected?
  2. If not, how can I improve the performance?

the bench

The create_jit_op function generates a function equivalent to arrow_native_calc.

fn arrow_native_calc<T: ArrowNativeTypeOp>(a: T, b: T, c: T, d: T) -> bool {
    return a.add_wrapping(b).div_wrapping(c).lt(&d);
}

fn hardcode_calc() -> bool {
    (3.0_f64 + 4.0_f64) / 3.0_f64 < 4.0_f64
}

fn add_benchmark(c: &mut Criterion) {
    let op = create_jit_op();
    c.bench_function("arrow_native_calc", |b| {
        b.iter(|| criterion::black_box(arrow_native_calc(3.0_f64, 4.0_f64, 3.0_f64, 4.0_f64)))
    });
    c.bench_function("hardcode_calc", |b| {
        b.iter(|| criterion::black_box(hardcode_calc()))
    });
    c.bench_function("jit_calc", |b| {
        b.iter(|| criterion::black_box(op(3.0_f64, 4.0_f64)))
    });
}

criterion_group!(benches, add_benchmark);
criterion_main!(benches);

bench result

The results show jit_calc is over 10x slower than arrow_native_calc and hardcode_calc:

arrow_native_calc       time:   [543.67 ps 546.12 ps 549.82 ps]
                        change: [-0.9606% +0.4079% +2.0981%] (p = 0.68 > 0.05)
                        No change in performance detected.
Found 5 outliers among 100 measurements (5.00%)
  1 (1.00%) low mild
  2 (2.00%) high mild
  2 (2.00%) high severe

hardcode_calc           time:   [541.32 ps 542.14 ps 542.98 ps]
                        change: [-1.1323% -0.7533% -0.4321%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 2 outliers among 100 measurements (2.00%)
  1 (1.00%) high mild
  1 (1.00%) high severe

jit_calc                time:   [8.4618 ns 8.5140 ns 8.5938 ns]
                        change: [-98.873% -98.850% -98.831%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 14 outliers among 100 measurements (14.00%)
  4 (4.00%) high mild
  10 (10.00%) high severe

the detail of create_jit_op

  1. How the function is generated:
pub fn create_jit_op() -> fn(f64, f64) -> bool {
    let mut ctx = CodegenContext::default();
    // call_conv is CallConv::Fast,
    let mut func_ctx = ctx.create_func_gen_ctx(
        "op",
        vec![AbiParam::new(types::F64), AbiParam::new(types::F64)],
        vec![AbiParam::new(types::I8)],
    );

    let entry_block = func_ctx.builder.create_block();
    func_ctx.builder.switch_to_block(entry_block);
    func_ctx
        .builder
        .append_block_params_for_function_params(entry_block);

    // call the imported f64 add_wrapping helper (declared with CallConv::Fast)
    let lhs = func_ctx.builder.block_params(entry_block)[0];
    let rhs = func_ctx.builder.block_params(entry_block)[1];
    let res = func_ctx.call_f64_add_wrapping(lhs, rhs);

    // call the imported f64 div_wrapping helper on res and an f64 constant
    let rhs = func_ctx.builder.ins().f64const(3.0);
    let res = func_ctx.call_f64_div_wrapping(res, rhs);

    // call the imported f64 lt helper, which takes its operands by reference,
    // so spill both values to a stack slot; two f64 values need 16 bytes
    let slot = func_ctx
        .builder
        .func
        .create_sized_stack_slot(StackSlotData::new(StackSlotKind::ExplicitSlot, 16));

    let res_ref = func_ctx.builder.ins().stack_addr(func_ctx.ptype, slot, 0);
    func_ctx
        .builder
        .ins()
        .store(MemFlags::new(), res, res_ref, 0);

    let rhs = func_ctx.builder.ins().f64const(4.0);
    let rhs_ref = func_ctx.builder.ins().stack_addr(func_ctx.ptype, slot, 8);
    func_ctx
        .builder
        .ins()
        .store(MemFlags::new(), rhs, rhs_ref, 0);

    let res = func_ctx.call_f64_lt(res_ref, rhs_ref);
    let func_id = func_ctx.finalize(res);
    let code = ctx.finalize(func_id);
    unsafe { mem::transmute::<_, fn(f64, f64) -> bool>(code) }
}
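
A quick sanity check of the generated function (illustrative only; (3.0 + 4.0) / 3.0 ≈ 2.33, which is less than 4.0, so the result should be true):

let op = create_jit_op();
assert!(op(3.0, 4.0));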
  2. Calling the imported functions:
pub fn call_f64_add_wrapping(&mut self, lhs: Value, rhs: Value) -> Value {
        let op = NativeOpCall::Float64AddWrapping;
        self.call_binary(op, lhs, rhs)
}

fn call_binary(&mut self, op: NativeOpCall, lhs: Value, rhs: Value) -> Value {
        let sig = op.signature(self.ptype);
        // FIXME: avoid declaring the function again on every call; cache the FuncId instead.
        let func_id = self
            .module
            .declare_function(op.name(), Linkage::Import, &sig)
            .unwrap();
        let func = self.module.declare_func_in_func(func_id, self.builder.func);
        let call = self.builder.ins().call(func, &[lhs, rhs]);
        let result = self.builder.inst_results(call);
        assert_eq!(result.len(), 1);
        result[0]
}
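
One possible fix for that FIXME, sketched under the assumption of a hypothetical func_ids: HashMap<&'static str, FuncId> field on the context and an op.name() that returns &'static str (note that cranelift-module already returns the same FuncId when the same name is declared again, so this mainly saves the repeated lookup and signature check):

fn func_id_for(&mut self, op: NativeOpCall) -> FuncId {
        // reuse the cached FuncId if this op was already declared
        if let Some(&id) = self.func_ids.get(op.name()) {
            return id;
        }
        let sig = op.signature(self.ptype);
        let id = self
            .module
            .declare_function(op.name(), Linkage::Import, &sig)
            .unwrap();
        self.func_ids.insert(op.name(), id);
        id
}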

pub(crate) fn signature(&self, pointer_type: Type) -> Signature {
        use NativeOpCall::*;
        match self {
            Float64AddWrapping | Float64DivWrapping => Signature {
                params: vec![AbiParam::new(types::F64), AbiParam::new(types::F64)],
                returns: vec![AbiParam::new(types::F64)],
                call_conv: CallConv::Fast,
            },
            Float64Lt => Signature {
                params: vec![AbiParam::new(pointer_type), AbiParam::new(pointer_type)],
                returns: vec![AbiParam::new(types::I8)],
                call_conv: CallConv::Fast,
            },
        }
    }

pub(crate) fn addr(&self) -> *const u8 {
        use NativeOpCall::*;
        match self {
            Float64AddWrapping => f64::add_wrapping as *const u8,
            Float64DivWrapping => f64::div_wrapping as *const u8,
            Float64Lt => f64::lt as *const u8,
        }
    }
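
For context, these addresses have to be registered with the JIT so the imported declarations resolve at link time. A minimal sketch assuming cranelift-jit's JITBuilder (the isa value comes from the flag setup below; default_libcall_names() is standard cranelift-module plumbing not shown in the thread):

// register each native helper under the name used by declare_function,
// so JITModule can resolve the imports
let mut jit_builder = JITBuilder::with_isa(isa, cranelift_module::default_libcall_names());
for op in [
    NativeOpCall::Float64AddWrapping,
    NativeOpCall::Float64DivWrapping,
    NativeOpCall::Float64Lt,
] {
    jit_builder.symbol(op.name(), op.addr());
}
let module = JITModule::new(jit_builder);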
  3. Flag settings:
fn build_flags() -> settings::Flags {
    let mut flag_builder = settings::builder();
    flag_builder.set("use_colocated_libcalls", "false").unwrap();
    flag_builder.set("is_pic", "false").unwrap();
    flag_builder
        .set("enable_llvm_abi_extensions", "true")
        .unwrap();
    flag_builder.set("opt_level", "speed").unwrap();
    let flags = settings::Flags::new(flag_builder);
    assert!(!flags.use_colocated_libcalls());
    assert!(!flags.is_pic());
    flags
}
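
For completeness, a sketch of how these flags would typically feed into a TargetIsa for the JIT (assuming the cranelift-native crate; this step is not shown in the thread):

// look up the host ISA and apply the flags built above
let isa = cranelift_native::builder()
    .unwrap()
    .finish(build_flags())
    .unwrap();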

bjorn3 (Feb 18 2024 at 10:27):

LLVM will optimize hardcode_calc and the arrow_native_calc call to constant values. You need to black_box the constants in the calculation to prevent this. I'm not sure if Cranelift optimizes float operations with known operands to a constant.

Jeremy Mei(梅杰) (Feb 18 2024 at 10:57):

@bjorn3 It's still ~5x slower.

bench result.

Running benches/scalar_calc.rs (target/release/deps/scalar_calc-a5b82b9d208b7531)
arrow_native_calc       time:   [1.4768 ns 1.4820 ns 1.4890 ns]
                        change: [-0.6881% -0.2297% +0.1460%] (p = 0.29 > 0.05)
                        No change in performance detected.
Found 9 outliers among 100 measurements (9.00%)
  1 (1.00%) low severe
  1 (1.00%) low mild
  2 (2.00%) high mild
  5 (5.00%) high severe

hardcode_calc           time:   [1.8864 ns 1.8950 ns 1.9124 ns]
                        change: [-0.6743% -0.0642% +0.6151%] (p = 0.86 > 0.05)
                        No change in performance detected.
Found 14 outliers among 100 measurements (14.00%)
  1 (1.00%) low severe
  6 (6.00%) high mild
  7 (7.00%) high severe

jit_calc                time:   [7.4061 ns 7.4375 ns 7.4785 ns]
                        change: [-99.119% -99.081% -99.043%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 12 outliers among 100 measurements (12.00%)
  2 (2.00%) high mild
  10 (10.00%) high severe

the bench code.

fn arrow_native_calc<T: ArrowNativeTypeOp>(a: T, b: T, c: T, d: T) -> bool {
    return a.add_wrapping(b).div_wrapping(c).lt(&d);
}

fn hardcode_calc(a: f64, b: f64, c: f64, d: f64) -> bool {
    (a + b) / c < d
}

fn add_benchmark(c: &mut Criterion) {
    let op = create_jit_op();
    c.bench_function("arrow_native_calc", |b| {
        b.iter(|| {
            arrow_native_calc(
                black_box(3.0_f64),
                black_box(4.0_f64),
                black_box(3.0_f64),
                black_box(4.0_f64),
            )
        })
    });
    c.bench_function("hardcode_calc", |b| {
        b.iter(|| {
            criterion::black_box(hardcode_calc(
                black_box(3.0_f64),
                black_box(4.0_f64),
                black_box(3.0_f64),
                black_box(4.0_f64),
            ))
        })
    });
    c.bench_function("jit_calc", |b| {
        b.iter(|| op(black_box(3.0_f64), black_box(4.0_f64)))
    });
}

criterion_group!(benches, add_benchmark);
criterion_main!(benches);

bjorn3 (Feb 18 2024 at 12:23):

Could you try replacing hardcode_calc(...) with criterion::black_box(hardcode_calc as fn(f64, f64, f64, f64) -> bool)(...) to avoid inlining the hardcode_calc function itself?

bjorn3 (Feb 18 2024 at 12:24):

Also, it looks like you are calling an external function for every float operation. The Rust version would inline these too.
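
For reference, the same expression could be emitted as native Cranelift instructions instead of calls to imported helpers, which removes the per-operation call overhead entirely. A minimal sketch reusing the poster's func_ctx (fadd, fdiv, and fcmp are standard Cranelift IR instructions, and for f64 the wrapping variants are plain float ops anyway):

// emit (a + b) / 3.0 < 4.0 inline; with no calls, Cranelift can keep
// everything in registers (FloatCC is cranelift_codegen::ir::condcodes::FloatCC)
let lhs = func_ctx.builder.block_params(entry_block)[0];
let rhs = func_ctx.builder.block_params(entry_block)[1];
let sum = func_ctx.builder.ins().fadd(lhs, rhs);
let c = func_ctx.builder.ins().f64const(3.0);
let quot = func_ctx.builder.ins().fdiv(sum, c);
let d = func_ctx.builder.ins().f64const(4.0);
let res = func_ctx.builder.ins().fcmp(FloatCC::LessThan, quot, d); // returns I8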

Jeremy Mei(梅杰) (Feb 18 2024 at 12:57):

bjorn3 said:

Could you try replacing hardcode_calc(...) with criterion::black_box(hardcode_calc as fn(f64, f64, f64, f64) -> bool)(...) to avoid inlining the hardcode_calc function itself?

Yes, all of those add_wrapping, div_wrapping, and lt functions are inlined on the Rust side, but it seems the JIT-generated function does not inline them.

  1. The new bench result after this change:
    Running benches/scalar_calc.rs (target/release/deps/scalar_calc-a5b82b9d208b7531)
arrow_native_calc       time:   [1.4803 ns 1.4847 ns 1.4919 ns]
                        change: [+0.2458% +0.9349% +2.0967%] (p = 0.03 < 0.05)
                        Change within noise threshold.
Found 6 outliers among 100 measurements (6.00%)
  2 (2.00%) low severe
  1 (1.00%) high mild
  3 (3.00%) high severe

hardcode_calc           time:   [2.0588 ns 2.0776 ns 2.1101 ns]
                        change: [+38.601% +39.948% +41.544%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 17 outliers among 100 measurements (17.00%)
  1 (1.00%) low severe
  3 (3.00%) low mild
  4 (4.00%) high mild
  9 (9.00%) high severe

jit_calc                time:   [5.4364 ns 5.4583 ns 5.4957 ns]
                        change: [-99.014% -99.003% -98.994%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 2 outliers among 100 measurements (2.00%)
  2 (2.00%) high severe
  2. Disassembly of the function generated by create_jit_op, captured from a debug test:
"  pushq   %rbp
  unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
  movq    %rsp, %rbp
  unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 16 }
  subq    %rsp, $16, %rsp
  movq    %r13, 0(%rsp)
  unwind SaveReg { clobber_offset: 0, reg: p13i }
block0:
  movq    %rdi, %r13
  load_ext_name userextname0+0, %rax
  call    *%rax
  movabsq $4613937818241073152, %rdi
  vmovq   %rdi, %xmm1
  load_ext_name userextname1+0, %rax
  call    *%rax
  load_ext_name userextname2+0, %rax
  movq    %r13, %rdi
  call    *%rax
  movq    0(%rsp), %r13
  addq    %rsp, $16, %rsp
  movq    %rbp, %rsp
  popq    %rbp
  ret
"

bjorn3 (Feb 18 2024 at 13:00):

but it seems the JIT-generated function does not inline them.

Correct. There is no way to inline between LLVM and Cranelift generated functions, just like there is no way to inline between LLVM and GCC generated functions.

Notification Bot (Feb 18 2024 at 13:02):

Jeremy Mei(梅杰) has marked this topic as resolved.

