2 changes: 1 addition & 1 deletion .github/workflows/linting.yml
@@ -17,4 +17,4 @@ on:
jobs:
call-workflow-passing-data:
name: Documentation
uses: ROCm/rocm-docs-core/.github/workflows/linting.yml@develop
uses: ROCm/rocm-docs-core/.github/workflows/linting.yml@vale_check
1 change: 0 additions & 1 deletion .gitignore
@@ -1,4 +1,3 @@
.*
!.gitignore
!.spellcheck.local.yaml
*.o
58 changes: 58 additions & 0 deletions .vale.ini
@@ -0,0 +1,58 @@
# ==========================================
# Vale configuration for Markdown + RST
# ==========================================

# Path to custom or downloaded style packages
# You can point to `.github/styles` or another shared directory
StylesPath = .github/styles

# The minimum alert level to display
# (suggestion, warning, or error)
MinAlertLevel = suggestion

# By default, Vale will lint all recognized file types.
# You can override or specify formats here.
[*.{md,rst}]
BasedOnStyles = Vale, Google, Microsoft

# ==========================================
# Markdown-specific rules
# ==========================================
[*.md]
# You can disable or tweak specific Markdown rules
# Examples:
TokenIgnores = (\{\{.*\}\}) # Ignore templating syntax
BlockIgnores = (?s)```.*?``` # Ignore fenced code blocks

# Enable or disable individual rules as needed
# Example: turn off first-person and passive-voice checks
Vale.Terms = YES
Google.Headings = YES
Google.FirstPerson = NO
Google.We = NO
Google.Passive = NO

# ==========================================
# RST-specific rules
# ==========================================
[*.rst]
# Ensure docutils is installed for parsing in CI
# Disable Markdown-specific rules if they trigger false positives
TokenIgnores = (:ref:`.*`|:doc:`.*`|``.*``)
BlockIgnores = (?s)\.\..*::.*\n(?:[ \t]+.*\n)*

BasedOnStyles = Vale, Google, Microsoft
Google.Headings = NO # RST doesn't use Markdown-style headings
Google.We = NO
Google.Passive = YES
Microsoft.Spacing = YES
Microsoft.Acronyms = YES

# ==========================================
# File-specific exclusions (optional)
# ==========================================
[CHANGELOG.md]
BasedOnStyles = Vale # Skip strict style rules for changelogs

[README.md]
BasedOnStyles = Vale, Google
2 changes: 2 additions & 0 deletions .wordlist.txt
@@ -39,6 +39,8 @@ Dereferencing
DFT
dll
DirectX
DPP
dst
EIGEN
enqueue
enqueues
110 changes: 16 additions & 94 deletions docs/how-to/hip_cpp_language_extensions.rst
@@ -103,66 +103,10 @@ The kernel arguments are listed after the configuration parameters.

.. code-block:: cpp

#include <hip/hip_runtime.h>
#include <iostream>

#define HIP_CHECK(expression) \
{ \
const hipError_t err = expression; \
if(err != hipSuccess){ \
std::cerr << "HIP error: " << hipGetErrorString(err) \
<< " at " << __LINE__ << "\n"; \
} \
}

// Performs a simple initialization of an array with the thread's index variables.
// This function is only available in device code.
__device__ void init_array(float * const a, const unsigned int arraySize){
// globalIdx uniquely identifies a thread in a 1D launch configuration.
const int globalIdx = threadIdx.x + blockIdx.x * blockDim.x;
// Each thread initializes a single element of the array.
if(globalIdx < arraySize){
a[globalIdx] = globalIdx;
}
}

// Rounds a value up to the next multiple.
// This function is available in host and device code.
__host__ __device__ constexpr int round_up_to_nearest_multiple(int number, int multiple){
return (number + multiple - 1)/multiple;
}

__global__ void example_kernel(float * const a, const unsigned int N)
{
// Initialize array.
init_array(a, N);
// Perform additional work:
// - work with the array
// - use the array in a different kernel
// - ...
}

int main()
{
constexpr int N = 100000000; // problem size
constexpr int blockSize = 256; //configurable block size

//needed number of blocks for the given problem size
constexpr int gridSize = round_up_to_nearest_multiple(N, blockSize);

float *a;
// allocate memory on the GPU
HIP_CHECK(hipMalloc(&a, sizeof(*a) * N));

std::cout << "Launching kernel." << std::endl;
example_kernel<<<dim3(gridSize), dim3(blockSize), 0/*example doesn't use shared memory*/, 0/*default stream*/>>>(a, N);
// make sure kernel execution is finished by synchronizing. The CPU can also
// execute other instructions during that time
HIP_CHECK(hipDeviceSynchronize());
std::cout << "Kernel execution finished." << std::endl;

HIP_CHECK(hipFree(a));
}
.. literalinclude:: ../tools/example_codes/calling_global_functions.hip
:start-after: // [sphinx-start]
:end-before: // [sphinx-end]
:language: cpp
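
To make the ordering concrete, the following is a minimal, self-contained sketch with a hypothetical ``scale`` kernel:
the configuration parameters (grid dimensions, block dimensions, dynamic shared memory size and stream) appear between
the triple angle brackets, and the kernel arguments follow in parentheses. Error checking is omitted for brevity.

.. code-block:: cpp

   #include <hip/hip_runtime.h>

   // Hypothetical kernel used only to illustrate the launch syntax.
   __global__ void scale(float* const data, const float factor, const unsigned int size)
   {
       const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
       if(idx < size)
       {
           data[idx] *= factor;
       }
   }

   int main()
   {
       constexpr unsigned int size = 1 << 20;
       constexpr unsigned int blockSize = 256;
       constexpr unsigned int gridSize = (size + blockSize - 1) / blockSize;

       float* data = nullptr;
       hipMalloc(&data, size * sizeof(*data));

       // Configuration parameters first: grid, block, dynamic shared memory bytes, stream.
       // The kernel arguments follow in the regular parentheses.
       scale<<<dim3(gridSize), dim3(blockSize), 0, 0>>>(data, 2.0f, size);
       hipDeviceSynchronize();

       hipFree(data);
   }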

Inline qualifiers
--------------------------------------------------------------------------------
@@ -321,28 +265,10 @@ launch has to specify the needed amount of ``extern`` shared memory in the launch
configuration. The statically allocated shared memory is allocated without this
parameter.

.. code-block:: cpp

#include <hip/hip_runtime.h>

extern __shared__ int shared_array[];

__global__ void kernel(){
// initialize shared memory
shared_array[threadIdx.x] = threadIdx.x;
// use shared memory - synchronize to make sure, that all threads of the
// block see all changes to shared memory
__syncthreads();
}

int main(){
//shared memory in this case depends on the configurable block size
constexpr int blockSize = 256;
constexpr int sharedMemSize = blockSize * sizeof(int);
constexpr int gridSize = 2;

kernel<<<dim3(gridSize), dim3(blockSize), sharedMemSize, 0>>>();
}
.. literalinclude:: ../tools/example_codes/extern_shared_memory.hip
:start-after: // [sphinx-start]
:end-before: // [sphinx-end]
:language: cpp
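
As a rough, self-contained sketch of the same pattern (illustrative only, not the contents of the included example
file), the following hypothetical kernel writes thread indices to dynamically sized shared memory and reads them back
reversed; the launch passes the required number of bytes as the third configuration parameter:

.. code-block:: cpp

   #include <hip/hip_runtime.h>

   // Dynamically sized shared memory; the byte count is supplied at launch time.
   extern __shared__ int dynamic_lds[];

   __global__ void reverse_indices(int* const out)
   {
       dynamic_lds[threadIdx.x] = threadIdx.x;
       // Synchronize so that every thread of the block sees all writes to shared memory.
       __syncthreads();
       out[threadIdx.x + blockIdx.x * blockDim.x] = dynamic_lds[blockDim.x - 1 - threadIdx.x];
   }

   int main()
   {
       constexpr int blockSize = 256;
       constexpr int gridSize = 2;
       // Size of the extern shared memory, passed as the third launch parameter.
       constexpr size_t sharedMemBytes = blockSize * sizeof(int);

       int* out = nullptr;
       hipMalloc(&out, gridSize * blockSize * sizeof(int));
       reverse_indices<<<dim3(gridSize), dim3(blockSize), sharedMemBytes, 0>>>(out);
       hipDeviceSynchronize();
       hipFree(out);
   }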

__managed__
--------------------------------------------------------------------------------
@@ -735,22 +661,18 @@ with the actual frequency.

The difference between the returned values represents the cycles used.

.. code-block:: cpp

__global__ void kernel(){
long long int start = clock64();
// kernel code
long long int stop = clock64();
long long int cycles = stop - start;
}
.. literalinclude:: ../tools/example_codes/timer.hip
:start-after: // [sphinx-kernel-start]
:end-before: // [sphinx-kernel-end]
:language: cpp

``long long int wall_clock64()`` returns the wall clock time on the device, which increments at a constant,
device-dependent frequency. The frequency can be queried using:

.. code-block:: cpp

int wallClkRate = 0; //in kilohertz
hipDeviceGetAttribute(&wallClkRate, hipDeviceAttributeWallClockRate, deviceId);
.. literalinclude:: ../tools/example_codes/timer.hip
:start-after: // [sphinx-query-start]
:end-before: // [sphinx-query-end]
:language: cpp
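
As a short, hedged sketch of using the queried rate, the following converts a measured tick count to milliseconds.
``startTicks``, ``stopTicks`` and ``deviceId`` are placeholders; the tick values are assumed to have been captured in a
kernel with ``wall_clock64()`` and copied back to the host.

.. code-block:: cpp

   int wallClkRate = 0; // in kilohertz
   hipDeviceGetAttribute(&wallClkRate, hipDeviceAttributeWallClockRate, deviceId);

   // wallClkRate is in kHz, so dividing the tick count by it yields milliseconds.
   const long long int elapsedTicks = stopTicks - startTicks;
   const double elapsedMs = static_cast<double>(elapsedTicks) / wallClkRate;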

.. _atomic functions:

45 changes: 28 additions & 17 deletions docs/how-to/hip_rtc.rst
@@ -319,31 +319,42 @@ using the bitcode APIs provided by HIPRTC.
vector<char> kernel_bitcode(bitCodeSize);
hiprtcGetBitcode(prog, kernel_bitcode.data());

CU Mode vs WGP mode
CU mode vs WGP mode
-------------------------------------------------------------------------------

AMD GPUs consist of an array of workgroup processors, each built with 2 compute
units (CUs) capable of executing SIMD32. All the CUs inside a workgroup
processor use local data share (LDS).
All :doc:`supported AMD GPUs <rocm-install-on-linux:reference/system-requirements>` are built around a data-parallel
processor (DPP) array.

gfx10+ support execution of wavefront in CU mode and work-group processor mode
(WGP). Please refer to section 2.3 of `RDNA3 ISA reference <https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/instruction-set-architectures/rdna3-shader-instruction-set-architecture-feb-2023_0.pdf>`_.
On CDNA GPUs, the DPP is organized as a set of compute unit (CU) pipelines, with each CU containing a single SIMD64
unit. Each CU has its own low-latency memory space called local data share (LDS), which threads from a warp running on
the CU can access.

gfx9 and below only supports CU mode.
On RDNA GPUs, the DPP is organized as a set of workgroup processor (WGP) pipelines. Each WGP contains two CUs, and each
CU contains two SIMD32 units. The LDS is attached to the WGP, so threads from different warps can access the same LDS if
they run on CUs within the same WGP.

In WGP mode, 4 warps of a block can simultaneously be executed on the workgroup
processor, where as in CU mode only 2 warps of a block can simultaneously
execute on a CU. In theory, WGP mode might help with occupancy and increase the
performance of certain HIP programs (if not bound to inter warp communication),
but might incur performance penalty on other HIP programs which rely on atomics
and inter warp communication. This also has effect of how the LDS is split
between warps, please refer to `RDNA3 ISA reference <https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/instruction-set-architectures/rdna3-shader-instruction-set-architecture-feb-2023_0.pdf>`_ for more information.
.. note::

Because CDNA GPUs do not use workgroup processors and have a different CU layout, the following information applies
only to RDNA GPUs.

Warps are dispatched in one of two modes, which controls whether the warps of a block are distributed across the two
SIMD32 units of a single CU (**CU mode**) or across all four SIMD32 units within a WGP (**WGP mode**).

In CU mode, only two warps of a block can execute simultaneously on a single CU, and those warps can use only half of
the LDS. The independence between CUs can improve performance for workloads that avoid inter-warp communication, but
the LDS capacity available to each CU is limited.

In WGP mode, four warps of a block can execute simultaneously on the WGP and share its full LDS. This can increase
occupancy and improve performance for workloads that are not bound by inter-warp communication, but it can degrade
performance for programs that rely heavily on atomics or inter-warp communication.

For more information on the differences between CU and WGP modes, please refer to the appropriate ISA reference under
`AMD RDNA architecture <https://gpuopen.com/amd-gpu-architecture-programming-documentation/>`__.

.. note::

HIPRTC assumes **WGP mode by default** for gfx10+. This can be overridden by
passing ``-mcumode`` to HIPRTC compile options in
:cpp:func:`hiprtcCompileProgram`.
HIPRTC assumes **WGP mode by default** for RDNA GPUs. This can be overridden by passing ``-mcumode`` as a compile
option in :cpp:func:`hiprtcCompileProgram`.
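
A minimal sketch of overriding the default, assuming ``prog`` is a valid ``hiprtcProgram`` created earlier with
``hiprtcCreateProgram``:

.. code-block:: cpp

   // Force CU mode instead of the default WGP mode on RDNA GPUs.
   const char* options[] = {"-mcumode"};
   hiprtcResult result = hiprtcCompileProgram(prog, 1, options);
   if(result != HIPRTC_SUCCESS)
   {
       // Handle the failure, for example by inspecting the program log.
   }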

Linker APIs
===============================================================================