gfx-rs
diff --git a/‎Cargo.lock‎
Lines changed: 5 additions & 117 deletions b/‎Cargo.lock‎
Lines changed: 5 additions & 117 deletions
diff --git a/‎Cargo.toml‎
Lines changed: 1 addition & 1 deletion b/‎Cargo.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benches/Cargo.toml‎
Lines changed: 8 additions & 5 deletions b/‎benches/Cargo.toml‎
Lines changed: 8 additions & 5 deletions
diff --git a/‎benches/README.md‎
Lines changed: 47 additions & 49 deletions b/‎benches/README.md‎
Lines changed: 47 additions & 49 deletions
@@ -250,7 +250,7 @@ deno_webgpu = { version = "0.181.0", path = "./deno_webgpu" }
 deno_unsync = "0.4.4"
 deno_error = "0.7.0"
 tokio = "1.47"
-termcolor = "1.1.3"
+termcolor = "1.4.1"
 
 # android dependencies
 ndk-sys = "0.6"
 
@@ -16,19 +16,18 @@ name = "wgpu-benchmark"
 harness = false
 
 [features]
-# Uncomment these features to enable tracy and superluminal profiling.
-# tracy = ["dep:tracy-client", "profiling/profile-with-tracy"]
-# superluminal = ["profiling/profile-with-superluminal"]
+tracy = ["dep:tracy-client"]
 
 [lints.rust]
 unexpected_cfgs = { level = "warn", check-cfg = [
     'cfg(feature, values("tracy"))',
 ] }
 
-[dependencies]
+[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
+anyhow.workspace = true
 bincode = { workspace = true, features = ["serde"] }
 bytemuck.workspace = true
-criterion.workspace = true
+# criterion.workspace = true
 naga = { workspace = true, features = [
     "deserialize",
     "serialize",
@@ -43,8 +42,12 @@ naga = { workspace = true, features = [
 ] }
 naga-test = { workspace = true, features = [] }
 nanorand.workspace = true
+pico-args.workspace = true
 pollster.workspace = true
 profiling.workspace = true
 rayon.workspace = true
+serde = { workspace = true, features = ["derive"] }
+serde_json.workspace = true
+termcolor.workspace = true
 tracy-client = { workspace = true, optional = true }
 wgpu.workspace = true
@@ -1,9 +1,6 @@
 Collection of CPU benchmarks for `wgpu`.
 
 These benchmarks are designed as a first line of defence against performance regressions and generally approximate the performance for users.
-They all do very little GPU work and are testing the CPU performance of the API.
-
-Criterion will give you the end-to-end performance of the benchmark, but you can also use a profiler to get more detailed information about where time is being spent.
 
 ## Usage
 
@@ -14,73 +11,38 @@ cargo bench -p wgpu-benchmark
 cargo bench -p wgpu-benchmark -- "filter"
 ```
 
-## Benchmarks
-
-#### `Renderpass`
-
-This benchmark measures the performance of recording and submitting a render pass with a large
-number of draw calls and resources, emulating an intense, more traditional graphics application. 
-By default it measures 10k draw calls, with 90k total resources.
-
-Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
-the render pass into multiple passes over multiple command buffers.
-If available, it also tests a bindless approach, binding all textures at once instead of switching
-the bind group for every draw call.
-
-#### `Computepass`
-
-This benchmark measures the performance of recording and submitting a compute pass with a large
-number of dispatches and resources.
-By default it measures 10k dispatch calls, with 60k total resources, emulating an unusually complex and sequential compute workload.
-
-Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
-the compute pass into multiple passes over multiple command buffers.
-If available, it also tests a bindless approach, binding all resources at once instead of switching
-the bind group for every draw call.
-TODO(https://github.com/gfx-rs/wgpu/issues/5766): The bindless version uses only 1k dispatches with 6k resources since it would be too slow for a reasonable benchmarking time otherwise.
-
-
-#### `Resource Creation`
-
-This benchmark measures the performance of creating large resources. By default it makes buffers that are 256MB. It tests this over a range of thread counts.
-
-#### `Shader Compilation`
-
-This benchmark measures the performance of naga parsing, validating, and generating shaders. 
+Use `WGPU_BACKEND` and `WGPU_ADAPTER_NAME` to adjust which device the benchmarks use. [More info on env vars](../README.md#environment-variables).
 
 ## Comparing Against a Baseline
 
 To compare the current benchmarks against a baseline, you can use the `--save-baseline` and `--baseline` flags.
 
-For example, to compare v0.20 against trunk, you could run the following:
+For example, to compare v28 against trunk, you could run the following:
 
 ```sh
-git checkout v0.20
-
+git checkout v28
 # Run the baseline benchmarks
-cargo bench -p wgpu-benchmark -- --save-baseline "v0.20"
+cargo bench -p wgpu-benchmark -- --save-baseline "v28"
 
 git checkout trunk
-
 # Run the current benchmarks
-cargo bench -p wgpu-benchmark -- --baseline "v0.20"
+cargo bench -p wgpu-benchmark -- --baseline "v28"
 ```
 
-You can use this for any bits of code you want to compare.
+The current benchmarking framework was added before v28, so these comparisons only work between revisions that include it. On earlier revisions the same commands will still work, but the comparison will be done using `criterion`.
 
 ## Integration with Profilers
 
 The benchmarks can be run with a profiler to get more detailed information about where time is being spent.
-Integrations are available for `tracy` and `superluminal`. Due to some implementation details,
-you need to uncomment the features in the `Cargo.toml` to allow features to be used.
+Integrations are available for `tracy` and `superluminal`.
 
 #### Tracy
 
 Tracy is available prebuilt for Windows on [github](https://github.com/wolfpld/tracy/releases/latest/).
 
 ```sh
 # Once this is running, you can connect to it with the Tracy Profiler
-cargo bench -p wgpu-benchmark --features tracy
+cargo bench -p wgpu-benchmark --features tracy,profiling/profile-with-tracy
 ```
 
 #### Superluminal
@@ -89,10 +51,10 @@ Superluminal is a paid product for windows available [here](https://superluminal
 
 ```sh
 # This command will build the benchmarks, and display the path to the executable
-cargo bench -p wgpu-benchmark --features superluminal -- -h
+cargo bench -p wgpu-benchmark --features profiling/profile-with-superluminal -- -h
 
 # Have Superluminal run the following command (replacing with the path to the executable)
-./target/release/deps/root-2c45d61b38a65438.exe --bench "filter"
+<path_to_exe> --bench "filter"
 ```
 
 #### `perf` and others
@@ -105,6 +67,42 @@ For example, the command line tool `perf` can be used to profile the benchmarks.
 cargo bench -p wgpu-benchmark -- -h
 
 # Run the benchmarks with perf
-perf record ./target/release/deps/root-2c45d61b38a65438 --bench "filter"
+perf record <path_to_exe> --bench "filter"
 ```
 
+## Benchmarks
+
+#### `Renderpass Encoding`
+
+This benchmark measures the performance of recording and submitting a render pass with a large
+number of draw calls and resources, emulating an intense, more traditional graphics application. 
+By default it measures 10k draw calls, with 90k total resources.
+
+Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
+the render pass into multiple passes over multiple command buffers.
+If available, it also tests a bindless approach, binding all textures at once instead of switching
+the bind group for every draw call.
+
+#### `Computepass Encoding`
+
+This benchmark measures the performance of recording and submitting a compute pass with a large
+number of dispatches and resources.
+By default it measures 10k dispatch calls, with 60k total resources, emulating an unusually complex and sequential compute workload.
+
+Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
+the compute pass into multiple passes over multiple command buffers.
+If available, it also tests a bindless approach, binding all resources at once instead of switching
+the bind group for every dispatch call.
+TODO(https://github.com/gfx-rs/wgpu/issues/5766): The bindless version uses only 1k dispatches with 6k resources since it would be too slow for a reasonable benchmarking time otherwise.
+
+#### `Device::create_buffer`
+
+This benchmark measures the performance of creating large buffers.
+
+#### `Device::create_bind_group`
+
+This benchmark measures the performance of creating large bind groups of 5 to 50,000 resources.
+
+#### `naga::back`, `naga::compact`, `naga::front`, and `naga::valid`
+
+These benchmarks measure the performance of naga parsing, validating, and generating shaders.