Skip to content

Commit 698cbd0

Browse files
authored
benchmark: add calibrate-n script
This script should help identify the best N when creating/updating benchmarks Signed-off-by: RafaelGSS <[email protected]> PR-URL: #59186 Reviewed-By: Vinícius Lourenço Claro Cardoso <[email protected]> Reviewed-By: James M Snell <[email protected]>
1 parent af77e4b commit 698cbd0

File tree

2 files changed

+333
-0
lines changed

2 files changed

+333
-0
lines changed

benchmark/calibrate-n.js

Lines changed: 292 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,292 @@
1+
'use strict';

const path = require('node:path');
const { fork } = require('node:child_process');
const fs = require('node:fs');
const { styleText } = require('node:util');

// Tunable defaults — each can be overridden via a CLI flag (see usage below).
const DEFAULT_RUNS = 30; // Number of runs for each n value
const CV_THRESHOLD = 0.05; // 5% coefficient of variation threshold
const MAX_N_INCREASE = 6; // Maximum number of times to increase n (10**6)
const INCREASE_FACTOR = 10; // Factor by which to increase n
12+
13+
// CLI arguments (everything after the script name). With no arguments at
// all there is nothing to calibrate: print usage and exit non-zero.
const args = process.argv.slice(2);
if (args.length === 0) {
  // NOTE: the --start-n default was previously described as "autodetect",
  // but the script always falls back to a fixed 10 — say so honestly.
  console.log(`
Usage: node calibrate-n.js [options] <benchmark_path>

Options:
  --runs=N           Number of runs for each n value (default: ${DEFAULT_RUNS})
  --cv-threshold=N   Target coefficient of variation threshold (default: ${CV_THRESHOLD})
  --max-increases=N  Maximum number of n increases to try (default: ${MAX_N_INCREASE})
  --start-n=N        Initial n value to start with (default: 10)
  --increase=N       Factor by which to increase n (default: ${INCREASE_FACTOR})

Example:
  node calibrate-n.js buffers/buffer-compare.js
  node calibrate-n.js --runs=10 --cv-threshold=0.02 buffers/buffer-compare.js
`);
  process.exit(1);
}
31+
32+
// Extract options. Values are sliced off at the flag prefix's own length —
// the previous hand-counted offsets were off by one for --cv-threshold
// (substring(14) kept the '=' and parseFloat produced NaN) and for
// --max-increases (substring(15) kept the '=' so the flag always fell back
// to its default). Every numeric flag now also validates against NaN.
let benchmarkPath;
let runs = DEFAULT_RUNS;
let cvThreshold = CV_THRESHOLD;
let maxIncreases = MAX_N_INCREASE;
let startN = 10;
let increaseFactor = INCREASE_FACTOR;

for (const arg of args) {
  if (arg.startsWith('--runs=')) {
    runs = parseInt(arg.substring('--runs='.length), 10);
    if (isNaN(runs)) {
      console.error(`Error: Invalid value for --runs. Using default: ${DEFAULT_RUNS}`);
      runs = DEFAULT_RUNS;
    }
  } else if (arg.startsWith('--cv-threshold=')) {
    cvThreshold = parseFloat(arg.substring('--cv-threshold='.length));
    if (isNaN(cvThreshold)) {
      console.error(`Error: Invalid value for --cv-threshold. Using default: ${CV_THRESHOLD}`);
      cvThreshold = CV_THRESHOLD;
    }
  } else if (arg.startsWith('--max-increases=')) {
    maxIncreases = parseInt(arg.substring('--max-increases='.length), 10);
    if (isNaN(maxIncreases)) {
      console.error(`Error: Invalid value for --max-increases. Using default: ${MAX_N_INCREASE}`);
      maxIncreases = MAX_N_INCREASE;
    }
  } else if (arg.startsWith('--start-n=')) {
    startN = parseInt(arg.substring('--start-n='.length), 10);
    if (isNaN(startN)) {
      console.error(`Error: Invalid value for --start-n. Using default: 10`);
      startN = 10;
    }
  } else if (arg.startsWith('--increase=')) {
    increaseFactor = parseInt(arg.substring('--increase='.length), 10);
    if (isNaN(increaseFactor)) {
      console.error(`Error: Invalid value for --increase. Using default: ${INCREASE_FACTOR}`);
      increaseFactor = INCREASE_FACTOR;
    }
  } else {
    // Anything that is not a recognized flag is treated as the benchmark path.
    benchmarkPath = arg;
  }
}
67+
68+
// A benchmark path is required; fail fast before doing any work.
if (!benchmarkPath) {
  console.error('Error: No benchmark path specified');
  process.exit(1);
}

// Resolve relative to the current working directory and make sure the
// benchmark script actually exists before forking it repeatedly.
const fullBenchmarkPath = path.resolve(benchmarkPath);
if (!fs.existsSync(fullBenchmarkPath)) {
  console.error(`Error: Benchmark file not found: ${fullBenchmarkPath}`);
  process.exit(1);
}
78+
79+
/**
 * Compute summary statistics over a list of benchmark rates.
 *
 * @param {number[]} values - Observed rates (assumed non-empty).
 * @returns {{mean: number, stdDev: number, cv: number, variance: number}}
 *   Population statistics; `cv` (coefficient of variation) is stdDev / mean,
 *   a scale-free measure of spread comparable across configurations.
 */
function calculateStats(values) {
  let total = 0;
  for (const value of values) {
    total += value;
  }
  const mean = total / values.length;

  let squaredError = 0;
  for (const value of values) {
    squaredError += (value - mean) ** 2;
  }

  // Population variance: divide by N, not N - 1.
  const variance = squaredError / values.length;
  const stdDev = Math.sqrt(variance);
  const cv = stdDev / mean;

  return { mean, stdDev, cv, variance };
}
94+
95+
/**
 * Fork the benchmark script once with a fixed iteration count and collect
 * every 'report' message it sends back over the IPC channel.
 *
 * @param {number} n - Iteration count, passed to the child as `n=<value>`.
 * @returns {Promise<Array<{rate: number, conf: object}>>} One entry per
 *   configuration the benchmark reported. Rejects if the child exits non-zero.
 */
function runBenchmark(n) {
  return new Promise((resolve, reject) => {
    // stdout is piped (and ignored) so the child's own output does not
    // interleave with our progress dots; stdin/stderr are inherited and the
    // 'ipc' slot carries the report messages.
    const child = fork(
      fullBenchmarkPath,
      [`n=${n}`],
      { stdio: ['inherit', 'pipe', 'inherit', 'ipc'] },
    );

    const reports = [];
    child.on('message', (message) => {
      const isReport = message.type === 'report' && message.rate && message.conf;
      if (isReport) {
        reports.push({ rate: message.rate, conf: message.conf });
      }
    });

    child.on('close', (code) => {
      if (code === 0) {
        resolve(reports);
      } else {
        reject(new Error(`Benchmark exited with code ${code}`));
      }
    });
  });
}
122+
123+
/**
 * Repeatedly run the benchmark, multiplying `n` by `increaseFactor` each
 * round, until results are stable or `maxIncreases` rounds have been tried.
 *
 * Stability criteria (both must hold to stop early):
 *   1. Average CV across all configurations is below `cvThreshold`.
 *   2. No single configuration has a CV above VERY_UNSTABLE_CV (10%).
 *
 * Fix vs. the original: the stop decision previously compared against the
 * CV_THRESHOLD constant instead of the user-configurable `cvThreshold`,
 * which made the --cv-threshold flag ineffective; messages hardcoding "5%"
 * now interpolate the actual threshold.
 *
 * @param {number} [n=startN] - Initial iteration count to test.
 * @returns {Promise<number>} The optimal n, or the best n found.
 */
async function main(n = startN) {
  // Hard ceiling: any configuration noisier than this is rejected outright,
  // no matter how good the overall average looks.
  const VERY_UNSTABLE_CV = 0.10;

  let increaseCount = 0;
  let bestN = n;
  let bestCV = Infinity;
  let bestGroupStats = null;

  console.log(`
--------------------------------------------------------
Benchmark: ${benchmarkPath}
--------------------------------------------------------
What we are trying to find: The optimal number of iterations (n)
that produces consistent benchmark results without wasting time.

How it works:
1. Run the benchmark multiple times with a specific n value
2. Group results by configuration
3. If overall CV is above 5% or any configuration has CV above 10%, increase n and try again
4. Stop when we have stable results (overall CV < 5% and all configs CV < 10%) or max increases reached

Configuration:
- Starting n: ${n.toLocaleString()} iterations
- Runs per n value: ${runs}
- Target CV threshold: ${cvThreshold * 100}% (lower CV = more stable results)
- Max increases: ${maxIncreases}
- Increase factor: ${increaseFactor}x`);

  while (increaseCount < maxIncreases) {
    console.log(`\nTesting with n=${n}:`);

    const resultsData = [];
    for (let i = 0; i < runs; i++) {
      const results = await runBenchmark(n);
      // Each run might return multiple results (one per configuration).
      if (Array.isArray(results) && results.length > 0) {
        resultsData.push(...results);
      } else if (results) {
        resultsData.push(results);
      }
      process.stdout.write('.'); // progress indicator, one dot per run
    }
    process.stdout.write('\n');

    // Group observed rates by benchmark configuration (keyed by its JSON).
    const groupedResults = {};
    resultsData.forEach((result) => {
      if (!result || !result.conf) return;

      const confKey = JSON.stringify(result.conf);
      groupedResults[confKey] ||= {
        conf: result.conf,
        rates: [],
      };

      groupedResults[confKey].rates.push(result.rate);
    });

    // Per-configuration stability report.
    const groupStats = [];
    for (const [confKey, group] of Object.entries(groupedResults)) {
      console.log(`\nConfiguration: ${JSON.stringify(group.conf)}`);

      const stats = calculateStats(group.rates);
      console.log(`  CV: ${(stats.cv * 100).toFixed(2)}% (lower values mean more stable results)`);

      const isStable = stats.cv <= cvThreshold;
      console.log(`  Stability: ${isStable ?
        styleText(['bold', 'green'], '✓ Stable') :
        styleText(['bold', 'red'], '✗ Unstable')}`);

      groupStats.push({
        confKey,
        stats,
        isStable,
      });
    }

    // Overall stability for this round. Uses the user-configurable
    // `cvThreshold` (not the CV_THRESHOLD constant) so --cv-threshold works.
    const avgCV = groupStats.length > 0 ?
      groupStats.reduce((sum, g) => sum + g.stats.cv, 0) / groupStats.length : Infinity;
    const tooUnstableConfigs = groupStats.filter((g) => g.stats.cv > VERY_UNSTABLE_CV);
    const hasVeryUnstableConfigs = tooUnstableConfigs.length > 0;
    const isOverallStable = avgCV < cvThreshold;

    if (groupStats.length > 0) {
      console.log(`\nOverall average CV: ${(avgCV * 100).toFixed(2)}%`);

      if (isOverallStable && !hasVeryUnstableConfigs) {
        console.log(styleText(['bold', 'green'], `  ✓ Overall CV is below ${cvThreshold * 100}% and no configuration has CV above 10%`));
      } else {
        if (!isOverallStable) {
          console.log(styleText(['bold', 'red'], `  ✗ Overall CV (${(avgCV * 100).toFixed(2)}%) is above ${cvThreshold * 100}%`));
        }
        if (hasVeryUnstableConfigs) {
          console.log(styleText(['bold', 'red'], `  ✗ ${tooUnstableConfigs.length} configuration(s) have CV above 10%`));
        }
      }

      // Remember the most stable n seen so far as a fallback answer.
      if (avgCV < bestCV || !bestGroupStats) {
        bestN = n;
        bestCV = avgCV;

        bestGroupStats = [];
        for (const group of Object.values(groupedResults)) {
          // Need at least 3 samples for the statistics to be meaningful.
          if (group.rates.length >= 3) {
            const stats = calculateStats(group.rates);
            bestGroupStats.push({
              conf: group.conf,
              stats: stats,
              isStable: stats.cv <= VERY_UNSTABLE_CV,
            });
          }
        }
        console.log(`  → New best n: ${n} with average CV: ${(avgCV * 100).toFixed(2)}%`);
      } else {
        console.log(`  → Current best n remains: ${bestN} with average CV: ${(bestCV * 100).toFixed(2)}%`);
      }
    }

    // Stop as soon as both stability criteria hold.
    if (isOverallStable && !hasVeryUnstableConfigs) {
      console.log(`\n✓ Found optimal n=${n} (Overall CV=${(avgCV * 100).toFixed(2)}% < ${cvThreshold * 100}% and no configuration has CV > 10%)`);
      console.log('\nFinal CV for each configuration:');
      groupStats.forEach((g) => {
        console.log(`  ${JSON.stringify(groupedResults[g.confKey].conf)}: ${(g.stats.cv * 100).toFixed(2)}%`);
      });

      return n;
    }

    increaseCount++;
    n *= increaseFactor;
  }

  // Budget exhausted without hitting the stability target: report the best
  // n encountered along the way and how each configuration fared there.
  if (increaseCount >= maxIncreases) {
    const finalAvgCV = bestGroupStats && bestGroupStats.length > 0 ?
      bestGroupStats.reduce((sum, g) => sum + g.stats.cv, 0) / bestGroupStats.length : Infinity;

    console.log(`Maximum number of increases (${maxIncreases}) reached without achieving target stability`);
    console.log(`Best n found: ${bestN} with average CV=${(finalAvgCV * 100).toFixed(2)}%`);
    console.log(`\nCV by configuration at best n:`);

    if (bestGroupStats) {
      bestGroupStats.forEach((g) => {
        if (g.conf) {
          console.log(`  ${JSON.stringify(g.conf)}: ${(g.stats.cv * 100).toFixed(2)}%`);
          if (g.stats.cv > cvThreshold) {
            console.log(`    ⚠️ This configuration is above the target threshold of ${cvThreshold * 100}%`);
          }
        }
      });
    }
  }

  console.log(`
Recommendation: You might want to try increasing --max-increases to
continue testing with larger n values, or adjust --cv-threshold to
accept the current best result, or investigate if specific configurations
are contributing to instability.`);
  return bestN;
}
288+
289+
// Entry point: run the calibration and exit non-zero on any failure
// (e.g. a benchmark child exiting with an error).
main().catch((err) => {
  console.error('Error:', err);
  process.exit(1);
});

doc/contributing/writing-and-running-benchmarks.md

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
* [Benchmark analysis requirements](#benchmark-analysis-requirements)
1010
* [Running benchmarks](#running-benchmarks)
1111
* [Running individual benchmarks](#running-individual-benchmarks)
12+
* [Calibrating the number of iterations with calibrate-n.js](#calibrating-the-number-of-iterations-with-calibrate-njs)
1213
* [Running all benchmarks](#running-all-benchmarks)
1314
* [Specifying CPU Cores for Benchmarks with run.js](#specifying-cpu-cores-for-benchmarks-with-runjs)
1415
* [Filtering benchmarks](#filtering-benchmarks)
@@ -142,6 +143,46 @@ buffers/buffer-tostring.js n=10000000 len=1024 arg=true: 3498295.68561504
142143
buffers/buffer-tostring.js n=10000000 len=1024 arg=false: 3783071.1678948295
143144
```
144145

146+
### Calibrating the number of iterations with calibrate-n.js
147+
148+
Before running benchmarks, it's often useful to determine the optimal number of iterations (`n`)
149+
that provides statistically stable results. The `calibrate-n.js` tool helps find this value by
150+
running a benchmark multiple times with increasing `n` values until the coefficient of variation (CV)
151+
falls below a target threshold.
152+
153+
```console
154+
$ node benchmark/calibrate-n.js benchmark/buffers/buffer-compare.js
155+
156+
--------------------------------------------------------
157+
Benchmark: buffers/buffer-compare.js
158+
--------------------------------------------------------
159+
What we are trying to find: The optimal number of iterations (n)
160+
that produces consistent benchmark results without wasting time.
161+
162+
How it works:
163+
1. Run the benchmark multiple times with a specific n value
164+
2. Group results by configuration
165+
3. If overall CV is above 5% or any configuration has CV above 10%, increase n and try again
166+
4. Stop when we have stable results (overall CV < 5% and all configs CV < 10%) or max increases reached
167+
168+
Configuration:
169+
- Starting n: 10 iterations
170+
- Runs per n value: 30
171+
- Target CV threshold: 5% (lower CV = more stable results)
172+
- Max increases: 6
173+
- Increase factor: 10x
174+
```
175+
176+
The tool accepts several options:
177+
178+
* `--runs=N`: Number of runs for each n value (default: 30)
179+
* `--cv-threshold=N`: Target coefficient of variation threshold (default: 0.05)
180+
* `--max-increases=N`: Maximum number of n increases to try (default: 6)
181+
* `--start-n=N`: Initial n value to start with (default: 10)
182+
* `--increase=N`: Factor by which to increase n (default: 10)
183+
184+
Once you've determined a stable `n` value, you can use it when running your benchmarks.
185+
145186
### Running all benchmarks
146187

147188
Similar to running individual benchmarks, a group of benchmarks can be executed

0 commit comments

Comments
 (0)