2 changes: 1 addition & 1 deletion .github/workflows/linting.yml
@@ -17,4 +17,4 @@ on:
jobs:
call-workflow-passing-data:
name: Documentation
uses: ROCm/rocm-docs-core/.github/workflows/linting.yml@develop
uses: ROCm/rocm-docs-core/.github/workflows/linting.yml@vale_check
1 change: 0 additions & 1 deletion .gitignore
@@ -1,4 +1,3 @@
.*
!.gitignore
!.spellcheck.local.yaml
*.o
58 changes: 58 additions & 0 deletions .vale.ini
@@ -0,0 +1,58 @@
# ==========================================
# Vale configuration for Markdown + RST
# ==========================================

# Path to custom or downloaded style packages
# You can point to `.github/styles` or another shared directory
StylesPath = .github/styles

# The minimum alert level to display
# (suggestion, warning, or error)
MinAlertLevel = suggestion

# By default, Vale will lint all recognized file types.
# You can override or specify formats here.
[*.{md,rst}]
BasedOnStyles = Vale, Google, Microsoft

# ==========================================
# Markdown-specific rules
# ==========================================
[*.md]
# You can disable or tweak specific Markdown rules
# Examples:
TokenIgnores = (\{\{.*\}\}) # Ignore templating syntax
BlockIgnores = (?s)```.*?``` # Ignore fenced code blocks

# Enable or disable individual rules as needed
# Example: turn off first-person and passive-voice checks
Vale.Terms = YES
Google.Headings = YES
Google.FirstPerson = NO
Google.We = NO
Google.Passive = NO

# ==========================================
# RST-specific rules
# ==========================================
[*.rst]
# Ensure docutils is installed for parsing in CI
# Disable Markdown-specific rules if they trigger false positives
TokenIgnores = (:ref:`.*`|:doc:`.*`|``.*``)
BlockIgnores = (?s)\.\..*::.*\n(?:[ \t]+.*\n)*

BasedOnStyles = Vale, Google, Microsoft
Google.Headings = NO # RST doesn't use Markdown-style headings
Google.We = NO
Google.Passive = YES
Microsoft.Spacing = YES
Microsoft.Acronyms = YES

# ==========================================
# File-specific exclusions (optional)
# ==========================================
[CHANGELOG.md]
BasedOnStyles = Vale # Skip strict style rules for changelogs

[README.md]
BasedOnStyles = Vale, Google
2 changes: 2 additions & 0 deletions .wordlist.txt
@@ -39,6 +39,8 @@ Dereferencing
DFT
dll
DirectX
DPP
dst
EIGEN
enqueue
enqueues
110 changes: 16 additions & 94 deletions docs/how-to/hip_cpp_language_extensions.rst
@@ -103,66 +103,10 @@ The kernel arguments are listed after the configuration parameters.

.. code-block:: cpp

#include <hip/hip_runtime.h>
#include <iostream>

#define HIP_CHECK(expression) \
{ \
const hipError_t err = expression; \
if(err != hipSuccess){ \
std::cerr << "HIP error: " << hipGetErrorString(err) \
<< " at " << __LINE__ << "\n"; \
} \
}

// Performs a simple initialization of an array with the thread's index variables.
// This function is only available in device code.
__device__ void init_array(float * const a, const unsigned int arraySize){
// globalIdx uniquely identifies a thread in a 1D launch configuration.
const int globalIdx = threadIdx.x + blockIdx.x * blockDim.x;
// Each thread initializes a single element of the array.
if(globalIdx < arraySize){
a[globalIdx] = globalIdx;
}
}

// Rounds a value up to the next multiple.
// This function is available in host and device code.
__host__ __device__ constexpr int round_up_to_nearest_multiple(int number, int multiple){
return (number + multiple - 1)/multiple;
}

__global__ void example_kernel(float * const a, const unsigned int N)
{
// Initialize array.
init_array(a, N);
// Perform additional work:
// - work with the array
// - use the array in a different kernel
// - ...
}

int main()
{
constexpr int N = 100000000; // problem size
constexpr int blockSize = 256; //configurable block size

//needed number of blocks for the given problem size
constexpr int gridSize = round_up_to_nearest_multiple(N, blockSize);

float *a;
// allocate memory on the GPU
HIP_CHECK(hipMalloc(&a, sizeof(*a) * N));

std::cout << "Launching kernel." << std::endl;
example_kernel<<<dim3(gridSize), dim3(blockSize), 0/*example doesn't use shared memory*/, 0/*default stream*/>>>(a, N);
// make sure kernel execution is finished by synchronizing. The CPU can also
// execute other instructions during that time
HIP_CHECK(hipDeviceSynchronize());
std::cout << "Kernel execution finished." << std::endl;

HIP_CHECK(hipFree(a));
}
.. literalinclude:: ../tools/example_codes/calling_global_functions.hip
:start-after: // [sphinx-start]
:end-before: // [sphinx-end]
:language: cpp
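
To make the ordering concrete, the following is a minimal, self-contained sketch with a hypothetical ``scale`` kernel:
the configuration parameters (grid dimensions, block dimensions, dynamic shared memory size and stream) appear between
the triple angle brackets, and the kernel arguments follow in parentheses. Error checking is omitted for brevity.

.. code-block:: cpp

   #include <hip/hip_runtime.h>

   // Hypothetical kernel used only to illustrate the launch syntax.
   __global__ void scale(float* const data, const float factor, const unsigned int size)
   {
       const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
       if(idx < size)
       {
           data[idx] *= factor;
       }
   }

   int main()
   {
       constexpr unsigned int size = 1 << 20;
       constexpr unsigned int blockSize = 256;
       constexpr unsigned int gridSize = (size + blockSize - 1) / blockSize;

       float* data = nullptr;
       hipMalloc(&data, size * sizeof(*data));

       // Configuration parameters first: grid, block, dynamic shared memory bytes, stream.
       // The kernel arguments follow in the regular parentheses.
       scale<<<dim3(gridSize), dim3(blockSize), 0, 0>>>(data, 2.0f, size);
       hipDeviceSynchronize();

       hipFree(data);
   }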

Inline qualifiers
--------------------------------------------------------------------------------
@@ -321,28 +265,10 @@ launch has to specify the needed amount of ``extern`` shared memory in the launch
configuration. The statically allocated shared memory is allocated without this
parameter.

.. code-block:: cpp

#include <hip/hip_runtime.h>

extern __shared__ int shared_array[];

__global__ void kernel(){
// initialize shared memory
shared_array[threadIdx.x] = threadIdx.x;
// use shared memory - synchronize to make sure, that all threads of the
// block see all changes to shared memory
__syncthreads();
}

int main(){
//shared memory in this case depends on the configurable block size
constexpr int blockSize = 256;
constexpr int sharedMemSize = blockSize * sizeof(int);
constexpr int gridSize = 2;

kernel<<<dim3(gridSize), dim3(blockSize), sharedMemSize, 0>>>();
}
.. literalinclude:: ../tools/example_codes/extern_shared_memory.hip
:start-after: // [sphinx-start]
:end-before: // [sphinx-end]
:language: cpp
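
As a rough, self-contained sketch of the same pattern (illustrative only, not the contents of the included example
file), the following hypothetical kernel writes thread indices to dynamically sized shared memory and reads them back
reversed; the launch passes the required number of bytes as the third configuration parameter:

.. code-block:: cpp

   #include <hip/hip_runtime.h>

   // Dynamically sized shared memory; the byte count is supplied at launch time.
   extern __shared__ int dynamic_lds[];

   __global__ void reverse_indices(int* const out)
   {
       dynamic_lds[threadIdx.x] = threadIdx.x;
       // Synchronize so that every thread of the block sees all writes to shared memory.
       __syncthreads();
       out[threadIdx.x + blockIdx.x * blockDim.x] = dynamic_lds[blockDim.x - 1 - threadIdx.x];
   }

   int main()
   {
       constexpr int blockSize = 256;
       constexpr int gridSize = 2;
       // Size of the extern shared memory, passed as the third launch parameter.
       constexpr size_t sharedMemBytes = blockSize * sizeof(int);

       int* out = nullptr;
       hipMalloc(&out, gridSize * blockSize * sizeof(int));
       reverse_indices<<<dim3(gridSize), dim3(blockSize), sharedMemBytes, 0>>>(out);
       hipDeviceSynchronize();
       hipFree(out);
   }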

__managed__
--------------------------------------------------------------------------------
@@ -735,22 +661,18 @@ with the actual frequency.

The difference between the returned values represents the cycles used.

.. code-block:: cpp

__global__ void kernel(){
long long int start = clock64();
// kernel code
long long int stop = clock64();
long long int cycles = stop - start;
}
.. literalinclude:: ../tools/example_codes/timer.hip
:start-after: // [sphinx-kernel-start]
:end-before: // [sphinx-kernel-end]
:language: cpp

``long long int wall_clock64()`` returns the wall clock time on the device, which increments at a constant,
device-dependent frequency. The frequency can be queried using:

.. code-block:: cpp

int wallClkRate = 0; //in kilohertz
hipDeviceGetAttribute(&wallClkRate, hipDeviceAttributeWallClockRate, deviceId);
.. literalinclude:: ../tools/example_codes/timer.hip
:start-after: // [sphinx-query-start]
:end-before: // [sphinx-query-end]
:language: cpp
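
As a short, hedged sketch of using the queried rate, the following converts a measured tick count to milliseconds.
``startTicks``, ``stopTicks`` and ``deviceId`` are placeholders; the tick values are assumed to have been captured in a
kernel with ``wall_clock64()`` and copied back to the host.

.. code-block:: cpp

   int wallClkRate = 0; // in kilohertz
   hipDeviceGetAttribute(&wallClkRate, hipDeviceAttributeWallClockRate, deviceId);

   // wallClkRate is in kHz, so dividing the tick count by it yields milliseconds.
   const long long int elapsedTicks = stopTicks - startTicks;
   const double elapsedMs = static_cast<double>(elapsedTicks) / wallClkRate;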

.. _atomic functions:

45 changes: 28 additions & 17 deletions docs/how-to/hip_rtc.rst
@@ -319,31 +319,42 @@ using the bitcode APIs provided by HIPRTC.
vector<char> kernel_bitcode(bitCodeSize);
hiprtcGetBitcode(prog, kernel_bitcode.data());

CU Mode vs WGP mode
CU mode vs WGP mode
-------------------------------------------------------------------------------

AMD GPUs consist of an array of workgroup processors, each built with 2 compute
units (CUs) capable of executing SIMD32. All the CUs inside a workgroup
processor use local data share (LDS).
All :doc:`supported AMD GPUs <rocm-install-on-linux:reference/system-requirements>` are built around a data-parallel
processor (DPP) array.

gfx10+ support execution of wavefront in CU mode and work-group processor mode
(WGP). Please refer to section 2.3 of `RDNA3 ISA reference <https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/instruction-set-architectures/rdna3-shader-instruction-set-architecture-feb-2023_0.pdf>`_.
On CDNA GPUs, the DPP is organized as a set of compute unit (CU) pipelines, with each CU containing a single SIMD64
unit. Each CU has its own low-latency memory space called local data share (LDS), which threads from a warp running on
the CU can access.

gfx9 and below only supports CU mode.
On RDNA GPUs, the DPP is organized as a set of workgroup processor (WGP) pipelines. Each WGP contains two CUs, and each
CU contains two SIMD32 units. The LDS is attached to the WGP, so threads from different warps can access the same LDS if
they run on CUs within the same WGP.

In WGP mode, 4 warps of a block can simultaneously be executed on the workgroup
processor, where as in CU mode only 2 warps of a block can simultaneously
execute on a CU. In theory, WGP mode might help with occupancy and increase the
performance of certain HIP programs (if not bound to inter warp communication),
but might incur performance penalty on other HIP programs which rely on atomics
and inter warp communication. This also has effect of how the LDS is split
between warps, please refer to `RDNA3 ISA reference <https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/instruction-set-architectures/rdna3-shader-instruction-set-architecture-feb-2023_0.pdf>`_ for more information.
.. note::

Because CDNA GPUs do not use workgroup processors and have a different CU layout, the following information applies
only to RDNA GPUs.

Warps are dispatched in one of two modes, which controls whether the warps of a block are distributed across the two
SIMD32 units of a single CU (**CU mode**) or across all four SIMD32 units within a WGP (**WGP mode**).

In CU mode, only two warps of a block can execute simultaneously on a single CU, and those warps can use only half of
the LDS. The independence between CUs can improve performance for workloads that avoid inter-warp communication, but
the LDS capacity available to each CU is limited.

In WGP mode, four warps of a block can execute simultaneously on the WGP and share its full LDS. This can increase
occupancy and improve performance for workloads that are not bound by inter-warp communication, but it can degrade
performance for programs that rely heavily on atomics or inter-warp communication.

For more information on the differences between CU and WGP modes, please refer to the appropriate ISA reference under
`AMD RDNA architecture <https://gpuopen.com/amd-gpu-architecture-programming-documentation/>`__.

.. note::

HIPRTC assumes **WGP mode by default** for gfx10+. This can be overridden by
passing ``-mcumode`` to HIPRTC compile options in
:cpp:func:`hiprtcCompileProgram`.
HIPRTC assumes **WGP mode by default** for RDNA GPUs. This can be overridden by passing ``-mcumode`` as a compile
option in :cpp:func:`hiprtcCompileProgram`.
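
A minimal sketch of overriding the default, assuming ``prog`` is a valid ``hiprtcProgram`` created earlier with
``hiprtcCreateProgram``:

.. code-block:: cpp

   // Force CU mode instead of the default WGP mode on RDNA GPUs.
   const char* options[] = {"-mcumode"};
   hiprtcResult result = hiprtcCompileProgram(prog, 1, options);
   if(result != HIPRTC_SUCCESS)
   {
       // Handle the failure, for example by inspecting the program log.
   }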

Linker APIs
===============================================================================