Speed up Websock receive queue reads via DataView#2024
Speed up Websock receive queue reads via DataView #2024 — PekingSpades wants to merge 4 commits into novnc:master from their branch
Conversation
|
|
This looks interesting. It looks like a lot of testing was involved, I'm curious about the process here. How was this improvement discovered? And how did you go about testing? |
|
Hi! Happy to share the background and testing process. @samhed

**How this improvement was discovered:** I noticed a small detail while reading `core/websock.js`. That TODO caught my attention because my team is also building a high-performance remote control product, and our controller side is Web-based as well. I’ve been digging into VNC and noVNC to learn how the high-throughput message buffering is done.

**How I went about testing:** The benchmarking approach was straightforward:
Unfortunately I can’t locate the exact script anymore (I didn’t preserve it at the time), but the structure was roughly:
**Why so many devices/browsers?** The main reason I tested across so many machines/browsers was to rule out “this TODO exists for a reason” scenarios — e.g. historical browser compatibility issues, engine-specific slow paths, etc. In practice I didn’t see compatibility problems, and the improvement was consistently measurable. For older Safari versions, I didn’t have an old macOS machine available, so I used LambdaTest (their free quota) to cover those versions.

**Outcome:** Across the devices/browsers I tested, the change showed a clear performance improvement and I didn’t observe regressions or compatibility issues in the test matrix I ran. If it would help, I can recreate a new minimal benchmark script and share it in the PR so others can reproduce/extend the testing going forward. |
|
It sounds like you only tested JS code snippets out of context from the rest of the noVNC code? Or am I misunderstanding? Did you do any manual "real-life" testing as well? Yes, please share a benchmarking script, preferably similar to the one you used. |
(() => {
// Benchmark configuration: buffer size, repetitions per case, and the
// integer widths noVNC's receive queue reads (1, 2 and 4 bytes, big-endian).
const SIZE = 64 * 1024 * 1024; // 64MB
const ROUNDS = 10;
const BYTES_LIST = [1, 2, 4];
// Prefer the high-resolution monotonic timer; fall back to Date.now()
// in environments without the Performance API.
const hasPerf = typeof performance !== "undefined" && typeof performance.now === "function";
const now = hasPerf ? () => performance.now() : () => Date.now();
const timerName = hasPerf ? "performance.now()" : "Date.now()";
// One shared backing buffer, viewed both as a Uint8Array (byte-loop
// variant) and a DataView (fast-path variant), filled with a
// deterministic repeating byte pattern.
const buf = new ArrayBuffer(SIZE);
const u8 = new Uint8Array(buf);
const dv = new DataView(buf);
for (let i = 0; i < u8.length; i++) {
u8[i] = i & 0xFF;
}
// Simulated receive queue and read index, mirroring core/websock.js.
let _rQ = u8;
let _rQi = 0;
// Reference implementation (current noVNC behavior): assemble a
// big-endian unsigned integer by shifting successive queue bytes into
// place, advancing the shared read index `_rQi` as it goes.
function _rQshift_loop(bytes) {
    let value = 0;
    let shift = (bytes - 1) * 8;
    while (shift >= 0) {
        value += _rQ[_rQi++] << shift;
        shift -= 8;
    }
    // >>> 0 coerces any negative intermediate (from a set top bit shifted
    // by 24) back to an unsigned 32-bit result.
    return value >>> 0;
}
// Read index for the DataView variant, kept separate from _rQi so both
// implementations can walk the same buffer independently.
let _rQiDV = 0;
// Proposed fast path: one typed big-endian read per value instead of a
// byte-assembly loop. The explicit `false` selects big-endian, matching
// the RFB wire format.
function _rQshift_dataview(bytes) {
    let value;
    switch (bytes) {
        case 1:
            value = dv.getUint8(_rQiDV);
            break;
        case 2:
            value = dv.getUint16(_rQiDV, false);
            break;
        case 4:
            value = dv.getUint32(_rQiDV, false);
            break;
        default:
            throw new Error("only support 1/2/4 bytes");
    }
    _rQiDV += bytes;
    return value >>> 0;
}
// ===== benchmark =====
// Raw per-round timings; summarized into avg/min/max after all cases run.
const results = []; // { method, bytes, round, timeMs }
function bench(bytes) {
const iterations = (SIZE / bytes) | 0;
let dummy = 0;
for (let round = 1; round <= ROUNDS; round++) {
// loop
_rQi = 0;
let t0 = now();
for (let i = 0; i < iterations; i++) {
dummy ^= _rQshift_loop(bytes);
}
let t1 = now();
results.push({ method: "loop", bytes, round, timeMs: t1 - t0 });
// DataView
_rQiDV = 0;
t0 = now();
for (let i = 0; i < iterations; i++) {
dummy ^= _rQshift_dataview(bytes);
}
t1 = now();
results.push({ method: "DataView", bytes, round, timeMs: t1 - t0 });
}
globalThis.__benchmarkDummy = dummy;
}
// Run every configured read width through both implementations.
BYTES_LIST.forEach(bench);
// Collapse the raw result rows for one (method, bytes) case into a
// single summary record with average, minimum and maximum times.
function summarize(method, bytes) {
    const times = [];
    for (const row of results) {
        if (row.method === method && row.bytes === bytes) {
            times.push(row.timeMs);
        }
    }
    let total = 0;
    for (const t of times) {
        total += t;
    }
    return {
        method,
        bytes,
        rounds: times.length,
        avg: total / times.length,
        min: Math.min(...times),
        max: Math.max(...times),
    };
}
// One summary record per (method, read width) combination.
const summaries = [];
["loop", "DataView"].forEach(method => {
BYTES_LIST.forEach(bytes => {
summaries.push(summarize(method, bytes));
});
});
// Decide the faster method per read width by comparing average times;
// differences below 1e-6 ms are treated as a tie.
const winners = {}; // { [bytes]: "loop" | "DataView" | "tie" }
BYTES_LIST.forEach(bytes => {
const sLoop = summaries.find(s => s.bytes === bytes && s.method === "loop");
const sDV = summaries.find(s => s.bytes === bytes && s.method === "DataView");
if (!sLoop || !sDV) return;
if (Math.abs(sLoop.avg - sDV.avg) < 1e-6) {
winners[bytes] = "tie";
} else if (sLoop.avg < sDV.avg) {
winners[bytes] = "loop";
} else {
winners[bytes] = "DataView";
}
});
// Environment metadata rows for the report table. Pipe characters in
// values are escaped so they cannot break the Markdown column layout.
const envPairs = [];
function addEnvPair(key, value) {
    // `== null` deliberately matches both null and undefined; other
    // falsy values (0, "", false) are still recorded.
    if (value == null) {
        return;
    }
    const escaped = String(value).replace(/\|/g, "\\|");
    envPairs.push({ key, value: escaped });
}
// Config
addEnvPair("Buffer size (bytes)", SIZE);
addEnvPair("Buffer size (MB)", (SIZE / (1024 * 1024)).toFixed(1));
addEnvPair("Rounds per case", ROUNDS);
addEnvPair("Bytes tested", BYTES_LIST.join(", "));
addEnvPair("Timer", timerName);
// Client Info
// Each group below touches browser-only APIs (navigator, screen,
// performance.memory); try/catch lets the report degrade gracefully in
// Node or older engines — missing fields are simply skipped.
try {
addEnvPair("User agent", navigator.userAgent);
addEnvPair("Platform", navigator.platform);
addEnvPair("HW concurrency", navigator.hardwareConcurrency);
addEnvPair("Device memory (GB)", navigator.deviceMemory);
addEnvPair("Language", navigator.language);
addEnvPair("Languages", navigator.languages && navigator.languages.join(", "));
} catch (e) {}
try {
addEnvPair("Screen resolution", `${screen.width}x${screen.height}`);
addEnvPair("Screen pixel depth", screen.pixelDepth);
} catch (e) {}
try {
// performance.memory is a non-standard Chrome extension — hence the guard.
if (hasPerf && performance && performance.memory) {
addEnvPair("JS heap size limit (MB)", (performance.memory.jsHeapSizeLimit / (1024 * 1024)).toFixed(1));
addEnvPair("Total JS heap (MB)", (performance.memory.totalJSHeapSize / (1024 * 1024)).toFixed(1));
addEnvPair("Used JS heap (MB)", (performance.memory.usedJSHeapSize / (1024 * 1024)).toFixed(1));
}
if (hasPerf && performance && typeof performance.timeOrigin === "number") {
addEnvPair("Performance timeOrigin", performance.timeOrigin);
}
} catch (e) {}
// Build the Markdown report string, ready to paste into a PR comment.
let md = "";
// Config + Client Info
md += `## Config & Client Info\n\n`;
md += `| Key | Value | Key | Value |\n`;
md += `| --- | ----- | --- | ----- |\n`;
// Lay the env pairs out two per table row; an odd trailing pair leaves
// its partner cells empty.
for (let i = 0; i < envPairs.length; i += 2) {
const a = envPairs[i];
const b = envPairs[i + 1];
md += `| ${a.key} | ${a.value} | ${b ? b.key : ""} | ${b ? b.value : ""} |\n`;
}
md += `\n`;
md += `## Result\n\n`;
md += `| Bytes | Method | Rounds | Avg ms | Min ms | Max ms | Winner |\n`;
md += `| ----- | -------- | ------ | ------ | ------ | ------ | ------ |\n`;
BYTES_LIST.forEach(bytes => {
const sLoop = summaries.find(s => s.bytes === bytes && s.method === "loop");
const sDV = summaries.find(s => s.bytes === bytes && s.method === "DataView");
const winner = winners[bytes];
// Trophy marks the faster method; scales mark a tie (shown on both rows).
const loopWinEmoji =
winner === "loop" ? "🏆" :
winner === "tie" ? "⚖️" : "";
const dvWinEmoji =
winner === "DataView" ? "🏆" :
winner === "tie" ? "⚖️" : "";
if (sLoop) {
md += `| ${bytes} | loop | ${sLoop.rounds} | ${sLoop.avg.toFixed(3)} | ${sLoop.min.toFixed(3)} | ${sLoop.max.toFixed(3)} | ${loopWinEmoji} |\n`;
}
if (sDV) {
md += `| ${bytes} | DataView | ${sDV.rounds} | ${sDV.avg.toFixed(3)} | ${sDV.min.toFixed(3)} | ${sDV.max.toFixed(3)} | ${dvWinEmoji} |\n`;
}
});
md += `\n`;
console.log(md);
})(); |
|
|
My earlier numbers were mostly from isolated JS benchmarks, not a full live VNC-session benchmark. I’ve now added two browser-level checks to the PR that exercise noVNC itself rather than standalone snippets:
In the parser-focused configuration of that benchmark (display work stubbed so the measurement stays attributable to the receive path changed by this PR), I’m seeing about 30-34% improvement versus current master on repeated runs on my machine. I also ran the same protocol stream with display work enabled. There the total time was essentially unchanged, which is why I think this particular optimization is hard to validate with end-to-end “real-life session” timing alone: once rendering is included, the receive-path signal gets swamped by display cost. So the short answer is: I had not originally done a good live-session benchmark, but I have now added browser-level smoke/perf scripts to the PR that run noVNC in context rather than just isolated snippets. They can be rerun locally with |
Summary
Replaces the byte-assembly loop in `core/websock.js` `_rQshift()` with a DataView-backed fast path for 1/2/4 byte reads to cut CPU time in the receive queue.

Performance Summary
Average speed-up = mean reduction in the 1/2/4-byte benchmark cases (higher is better).
Testing
Benchmark Results
Windows Chrome 142
Windows Chrome 142(Machine 2)
Windows Chrome 101
Windows Chrome 92.0
Windows Chrome 83.0
Windows Chrome 71.0
Windows Edge 142
Windows Edge 142(Machine 2)
Windows Firefox 113
Windows Firefox 142
Windows Firefox 145.0
Safari 18
Karma Test
Can I use
https://caniuse.com/mdn-javascript_builtins_dataview