alexcrichton opened issue #6159:
I did a bit of work recently to get a build of XNNPACK working with Wasmtime. My original motivation for doing this was that XNNPACK has been used as a benchmark to evaluate a number of changes to the simd proposal to WebAssembly (and relaxed-simd), so it seemed like a great candidate to test as baselines between WebAssembly engines and native performance. The
master
branch of that repository itself doesn't support WASI as a target, so I hacked up a bunch of commits and updated the intrinsics to the LLVM 16 versions. Using that I was able to produce a WebAssembly file with:./bazelisk-linux-arm64 build -c opt --config wasi --features wasm_relaxed_simd :end2end_bench
The output file is end2end.simd.relaxed.wasm.gz.
I'll be honest in that there's a ton of benchmarks listed in the
BUILD.bazel
file for that repository and I don't know if "end2end" is the "main" one or if there even is a "main" one. Nevertheless it sounded right and I figured it was at least one good starting point.Using this script:
<details>
<summary><code>run.sh</code></summary>
set -e wasmtime="../wasmtime/target/release/wasmtime run --disable-cache \ --wasm-features threads,relaxed-simd \ --wasi-modules experimental-wasi-threads \ --" wasmtime2="../wasmtime/target/release/wasmtime run --disable-cache \ --wasm-features threads,relaxed-simd \ --wasi-modules experimental-wasi-threads \ --cranelift-set use_egraphs=false \ --" node="node \ --experimental-wasm-relaxed-simd \ --experimental-wasi-unstable-preview1 \ ../meshoptimizer/run.mjs" spidermonkey="../gecko-dev/obj-opt-aarch64-unknown-linux-gnu/dist/bin/js \ ../wasmtime/spidermonkey.js" v8="../v8/out/arm64.release/d8 \ --experimental-wasm-relaxed-simd \ ../wasmtime/v8.js --" for t in `$wasmtime ./end2end.simd.relaxed.wasm --benchmark_list_tests | rg -v FP16`; do # echo ==================================================================== for file in end2end.simd.relaxed.wasm; do # echo == $file == echo -n "wasmtime " $wasmtime ./$file --benchmark_filter=$t 2>&1 | rg $t echo -n "wasmtime2 " $wasmtime2 ./$file --benchmark_filter=$t 2>&1 | rg $t echo -n "node " $node ./$file --benchmark_filter=$t 2>&1 | rg $t echo -n "spidermonkey " $spidermonkey ./$file --benchmark_filter=$t 2>&1 | rg $t echo -n "v8 " $v8 ./$file --benchmark_filter=$t 2>&1 | rg $t echo -n "native " ./end2end_bench --benchmark_filter=$t 2>&1 | rg $t done echo done
</details>
<details>
<summary><code>run.mjs</code></summary>
import { readFile } from 'node:fs/promises'; import { WASI } from 'wasi'; import { argv } from 'node:process'; const wasi = new WASI({ version: 'preview1', args: argv.slice(2), }); const m = process.argv[2]; const wasm = await WebAssembly.compile(await readFile(m)); const obj = wasi.getImportObject(); obj.env = { memory: new WebAssembly.Memory({shared: true, initial: 8000, maximum: 16000}), }; obj.wasi = { 'thread-spawn': function() { throw new Error('wasi thread spawn'); }, }; const instance = await WebAssembly.instantiate(wasm, obj); wasi.start(instance);
</details>
<details>
<summary><code>spidermonkey.js</code></summary>const ITERS = 1; //////////////////////////////////////////////////////////////////////////////// class TextEncoder { constructor(enc) { if (enc != "utf-8") { throw new Error("FITZGEN: unsupported encoding: " + enc); } } encode(s, n) { let buf = new Uint8Array(s.length); for (let i = 0; i < Math.min(s.length, n); i++) { buf[i] = s.charCodeAt(i); // lol } return buf; } }; class TextDecoder { constructor(enc) { if (enc != "utf-8") { throw new Error("FITZGEN: unsupported encoding: " + enc); } } decode(buf) { let buf8 = new Uint8Array(buf); let s = ""; for (let i = 0; i < buf8.length; i++) { s += String.fromCharCode(buf8[i]); // lol } return s; } }; //////////////////////////////////////////////////////////////////////////////// async function ionCompile(wasm) { let module = await WebAssembly.compile(wasm); while (!wasmHasTier2CompilationCompleted(module)) { sleep(1); } // wasmDis(module); return module; } function unimplemented(name) { return function (...args) { throw new Error(name + " is unimplemented! args = " + args); } } class ProcExitError extends Error { constructor(code) { this.code = code; this.message = "Program exited with code " + code; } toString() { return "ProcExitError: " + this.message } } function trace(obj) { let proxy = {}; for (let key of Object.keys(obj)) { if (typeof obj[key] != "function") { proxy[key] = obj[key]; continue; } proxy[key] = function (...args) { print("TRACE: " + key + "(" + args + ")"); let ret = obj[key](...args); print("TRACE: -> " + ret); return ret; }; } return proxy; } //////////////////////////////////////////////////////////////////////////////// async function main() { let smWasm = os.file.readFile(scriptArgs[0], "binary"); let smModule = await ionCompile(smWasm); for (let i = 0; i < ITERS; i++) { let args = scriptArgs; let mem = null; let start = null; let unread = true; let smInstance = await WebAssembly.instantiate(smModule, trace({ env: { memory: new WebAssembly.Memory({ shared: true, minimum: 8000, maximum: 16000, }), }, bench: { start: function () { start = monotonicNow() }, end: function () { let end = monotonicNow(); print("ITER: " + (end - start)); } }, wasi: { 'thread-spawn': function() { throw new Error('thread-spawn'); }, }, wasi_snapshot_preview1: { fd_pread: function() { throw new Error('fd_pread'); }, poll_oneoff: function(in_, out, n) { console.log(n); // throw new Error('poll_oneoff'); return 0; }, random_get: function(a, b) { while (b > 0) { (new DataView(mem.buffer)).setInt8(a, 1, true); b -= 1; a += 1; } return 0; }, args_get: function(a, b) { for (let arg of args) { (new DataView(mem.buffer)).setInt32(a, b, true); a += 4; for (let c in arg) { (new DataView(mem.buffer)).setInt8(b, arg.charCodeAt(c), true); b += 1; } (new DataView(mem.buffer)).setInt8(b, 0, true); b += 1; } return 0; }, args_sizes_get: function(a, b) { let len = 0; for (let arg of args) { len += arg.length + 1; } (new DataView(mem.buffer)).setInt32(a, args.length, true); (new DataView(mem.buffer)).setInt32(b, len, true); return 0; }, clock_res_get: function () { return 1; }, clock_time_get: function(a, b, c) { const now = Math.round(performance.now() * 1000000); (new DataView(mem.buffer)).setBigInt64(c, BigInt(now), true); return 0; }, fd_filestat_get: function() { throw new Error('fd_filestat_get'); }, fd_read: function(fd, iovecs_ptr, iovecs_len, out_ptr) { let mem8 = new Uint8Array(mem.buffer); switch (fd) { case 4: let data = os.file.readFile("/home/nick/sightglass/benchmarks/spidermonkey/default.input.md"); let k = 0; for (let i = 0; i < iovecs_len && k < data.length && unread; i++) { let ptr = (new DataView(mem.buffer)).getUint32(iovecs_ptr + i * 8, true); let len = (new DataView(mem.buffer)).getUint32(iovecs_ptr + i * 8 + 4, true); for (let j = 0; j < len && k < data.length; j++) { mem8[ptr + j] = data.charCodeAt(k++); } } unread = false; (new DataView(mem.buffer)).setUint32(out_ptr, k, true); return 0; default: return 8; } }, fd_seek: function(fd, offset, whence, out_ptr) { switch (fd) { case 4: let len = os.file.readFile("/home/nick/sightglass/benchmarks/spidermonkey/default.input.md").length; (new DataView(mem.buffer)).setBigUint64(out_ptr, BigInt(len), true); return 0; default: return 8; } }, fd_write: function(a, b, c, d) { let s = ''; let total = 0; while (c > 0) { let base = (new DataView(mem.buffer)).getInt32(b, true); let len = (new DataView(mem.buffer)).getInt32(b + 4, true); b += 8; c -= 1; while (len > 0) { let c = new Uint8Array(mem.buffer)[base] s += String.fromCharCode(c); len -= 1; base += 1; total += 1; } } //print("fd_write(" + a + "): " + s.trimEnd()); print(s.trimEnd()); (new DataView(mem.buffer)).setInt32(d, total, true); return 0; }, fd_fdstat_ [message truncated]
cfallin commented on issue #6159:
We talked about this a good amount at today's Cranelift meeting. The basic issue is that the egraph optimization approach throws out the instruction order in the original code (for pure ops at least), and then recomputes an instruction order during elaboration. For most code this doesn't really matter much, and in fact this also opens the door to various sorts of code-motion, like our approach to rematerialization. But in carefully-interleaved pipelined SIMD inner loops, it does indeed hurt!
A few approaches we could take:
- We could try to somehow preserve the original order, if other factors (GVN, LICM, remat) don't force movement. Perhaps this is as simple as putting an "original sequence number" on nodes and then, within each basic block when we're done, sorting by that number. The tricky question here is what to do about newly-created nodes. Perhaps they get a number that is "root of current rule evaluation, minus epsilon" with care around how multiple nested invocations pick appropriate epsilons. Or perhaps the sort also respects data dependencies, and the sequence number is optional, so that the sequence numbers plus deps induce a partial order (and then we pick arbitrarily).
- We could try to build better heuristics for instruction scheduling. One approach we could take is to track a notion of register pressure, or at least "liverange pressure", within a block. When elaboration requires an op and it hasn't been computed yet, we have a range in which we can insert it: as high as just below the lowest def of an arg, down to just below the first use. We could try to greedily binpack liveranges to minimize cost.
- We could try to integrate instruction scheduling with regalloc. This is unfortunately not simple (it's the subject of research papers and at least a few theses!) but is sort of the holy grail at least for register pressure-related instruction scheduling issues.
In the above I'm assuming, based on what Alex as indicated, that the main issue is increased register pressure leading to extra spills. Scheduling for microarchitectural properties is a different issue and we could in theory try to do something about it one day as well, but it seems less important in the age of large out-of-order windows.
I'm kind of inclined to suggest trying the first: it's actually pretty simple and it might yield "good enough" results on the common case of one inner loop body where we don't have much effect on already-optimized code. Thoughts?
Last updated: Dec 23 2024 at 12:05 UTC