NVIDIA
diff --git a/‎python/README.md‎
Lines changed: 14 additions & 7 deletions b/‎python/README.md‎
Lines changed: 14 additions & 7 deletions
diff --git a/‎python/cuquantum/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎python/cuquantum/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎python/cuquantum/_version.py‎
Lines changed: 1 addition & 1 deletion b/‎python/cuquantum/_version.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎python/cuquantum/custatevec/custatevec.pxd‎
Lines changed: 10 additions & 10 deletions b/‎python/cuquantum/custatevec/custatevec.pxd‎
Lines changed: 10 additions & 10 deletions
diff --git a/‎python/cuquantum/custatevec/custatevec.pyx‎
Lines changed: 139 additions & 19 deletions b/‎python/cuquantum/custatevec/custatevec.pyx‎
Lines changed: 139 additions & 19 deletions
@@ -4,6 +4,7 @@
 
 Please visit the [NVIDIA cuQuantum Python documentation](https://docs.nvidia.com/cuda/cuquantum/python).
 
+
 ## Building
 
 ### Requirements
@@ -12,8 +13,9 @@ Build-time dependencies of the cuQuantum Python package and some versions that
 are known to work are as follows:
 
 * CUDA Toolkit 11.x
-* cuQuantum 22.03+
+* cuQuantum 22.07+
 * cuTENSOR 1.5.0+
+* Python 3.8+
 * Cython - e.g. 0.29.21
 * [packaging](https://packaging.pypa.io/en/latest/)
 
@@ -62,14 +64,14 @@ Runtime dependencies of the cuQuantum Python package include:
 * An NVIDIA GPU with compute capability 7.0+
 * Driver: Linux (450.80.02+)
 * CUDA Toolkit 11.x
-* cuQuantum 22.03+
+* cuQuantum 22.07+
 * cuTENSOR 1.5.0+
-* NumPy v1.17+
+* Python 3.8+
+* NumPy v1.19+
 * CuPy v9.5.0+ (see [installation guide](https://docs.cupy.dev/en/stable/install.html))
 * PyTorch v1.10+ (optional, see [installation guide](https://pytorch.org/get-started/locally/))
-* qiskit v0.24.0+ (optional, see [installation guide](https://qiskit.org/documentation/getting_started.html))
-* cirq v0.6.0+ (optional, see [installation guide](https://quantumai.google/cirq/install))
-* [typing-extensions](https://pypi.org/project/typing-extensions/)
+* Qiskit v0.24.0+ (optional, see [installation guide](https://qiskit.org/documentation/getting_started.html))
+* Cirq v0.6.0+ (optional, see [installation guide](https://quantumai.google/cirq/install))
 
 If you install everything from conda-forge, the dependencies are taken care of for you (except for the driver).
 
@@ -93,6 +95,11 @@ library in Python.
 
 ## Testing
 
-If pytest is installed, run `pytest tests` in the Python source root directory would
+If pytest is installed, typing `pytest tests` at the command prompt in the Python source root directory will
 run all tests. Some tests will be skipped if `cffi` is not installed or if the environment
 variable `CUDA_PATH` is not set.
+
+
+## Citing cuQuantum
+
+Please click this Zenodo badge to see the citation format: [![DOI](https://zenodo.org/badge/435003852.svg)](https://zenodo.org/badge/latestdoi/435003852)
@@ -18,13 +18,15 @@
         custatevec.MatrixType,
         custatevec.Collapse,
         custatevec.SamplerOutput,
+        custatevec.DeviceNetworkType,
         cutensornet.ContractionOptimizerInfoAttribute,
         cutensornet.ContractionOptimizerConfigAttribute,
         cutensornet.ContractionAutotunePreferenceAttribute,
         cutensornet.WorksizePref,
         cutensornet.Memspace,
         cutensornet.GraphAlgo,
         cutensornet.MemoryModel,
+        cutensornet.OptimizerCost,
         ):
     cutensornet._internal.enum_utils.add_enum_class_doc(enum, chomp="_ATTRIBUTE|_PREFERENCE_ATTRIBUTE")
 
 
@@ -5,4 +5,4 @@
 # Note: cuQuantum Python follows the cuQuantum SDK version, which is now
 # switched to YY.MM and is different from individual libraries' (semantic)
 # versioning scheme.
-__version__ = '22.07a0'  # the last digit is for cuQuantum Python only
+__version__ = '22.07.0'  # the last digit is for cuQuantum Python only
@@ -2,20 +2,15 @@
 #
 # SPDX-License-Identifier: BSD-3-Clause
 
-from libc.stdint cimport intptr_t, int32_t, uint32_t, int64_t
-
-from cuquantum.utils cimport Stream
-
-
 # The C types are prefixed with an underscore because we are not
 # yet protected by the module namespaces as done in CUDA Python.
 # Once we switch over the names would be prettier (in the Cython
 # layer).
 
+from libc.stdint cimport intptr_t, int32_t, uint32_t, int64_t
 
-# Cython limitation: need standalone typedef if we wanna use it for casting
-ctypedef int (*DeviceAllocType)(void*, void**, size_t, Stream)
-ctypedef int (*DeviceFreeType)(void*, void*, size_t, Stream)
+from cuquantum.utils cimport (DataType, DeviceAllocType, DeviceFreeType, int2,
+                              LibPropType, Stream)
 
 
 cdef extern from '<custatevec.h>' nogil:
@@ -25,8 +20,6 @@ cdef extern from '<custatevec.h>' nogil:
     ctypedef int _Status 'custatevecStatus_t'
     ctypedef void* _SamplerDescriptor 'custatevecSamplerDescriptor_t'
     ctypedef void* _AccessorDescriptor 'custatevecAccessorDescriptor_t'
-    ctypedef enum _ComputeType 'custatevecComputeType_t':
-        pass
     ctypedef struct _DeviceMemHandler 'custatevecDeviceMemHandler_t':
         void* ctx
         DeviceAllocType device_alloc
@@ -42,6 +35,9 @@ cdef extern from '<custatevec.h>' nogil:
         void* userData)
 
     # cuStateVec enums
+    ctypedef enum _ComputeType 'custatevecComputeType_t':
+        pass
+
     ctypedef enum _Pauli 'custatevecPauli_t':
         CUSTATEVEC_PAULI_I
         CUSTATEVEC_PAULI_X
@@ -65,6 +61,10 @@ cdef extern from '<custatevec.h>' nogil:
         CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER
         CUSTATEVEC_SAMPLER_OUTPUT_ASCENDING_ORDER
 
+    ctypedef enum _DeviceNetworkType 'custatevecDeviceNetworkType_t':
+        CUSTATEVEC_DEVICE_NETWORK_TYPE_SWITCH
+        CUSTATEVEC_DEVICE_NETWORK_TYPE_FULLMESH
+
     # cuStateVec consts
     int CUSTATEVEC_VER_MAJOR
     int CUSTATEVEC_VER_MINOR
 
@@ -20,13 +20,6 @@ import numpy as _numpy
 
 
 cdef extern from * nogil:
-    # from CUDA
-    ctypedef enum DataType 'cudaDataType_t':
-        pass
-    ctypedef enum LibPropType 'libraryPropertyType':
-        pass
-    ctypedef struct int2 'int2':
-        pass
 
     # cuStateVec functions
     int custatevecCreate(_Handle*)
@@ -127,6 +120,11 @@ cdef extern from * nogil:
     int custatevecSwapIndexBits(
         _Handle, void*, DataType, const uint32_t, const int2*, const uint32_t,
         const int32_t*, const int32_t*, const uint32_t)
+    int custatevecMultiDeviceSwapIndexBits(
+        _Handle*, const uint32_t, void**, const DataType, const uint32_t,
+        const uint32_t, const int2*, const uint32_t,
+        const int32_t*, const int32_t*, const uint32_t,
+        const _DeviceNetworkType)
     int custatevecTestMatrixTypeGetWorkspaceSize(
         _Handle, _MatrixType, const void*, DataType, _MatrixLayout,
         const uint32_t, const int32_t, _ComputeType, size_t*)
@@ -377,11 +375,10 @@ cpdef abs2sum_array(
             - a Python sequence of index bit ordering
 
         bit_ordering_len (uint32_t): The length of ``bit_ordering``.
-        mask_bit_string: A host array for a bit string to specify mask. It can
-            be
+        mask_bit_string: A host array for specifying mask values. It can be
 
             - an :class:`int` as the pointer address to the array
-            - a Python sequence of index bit ordering
+            - a Python sequence of mask values
 
         mask_ordering: A host array of mask ordering. It can be
 
@@ -1622,11 +1619,10 @@ cpdef swap_index_bits(
             - a nested Python sequence of swapped index bits
 
         n_swapped_bits (uint32_t): The number of pairs of swapped index bits.
-        mask_bit_string: A host array for a bit string to specify mask. It can
-            be
+        mask_bit_string: A host array for specifying mask values. It can be
 
             - an :class:`int` as the pointer address to the array
-            - a Python sequence of index bit ordering
+            - a Python sequence of mask values
 
         mask_ordering: A host array of mask ordering. It can be
 
@@ -1690,6 +1686,125 @@ cpdef swap_index_bits(
     check_status(status)
 
 
+cpdef multi_device_swap_index_bits(
+        handles, uint32_t n_handles, sub_svs, int sv_data_type,
+        uint32_t n_global_index_bits, uint32_t n_local_index_bits,
+        swapped_bits, uint32_t n_swapped_bits,
+        mask_bit_string, mask_ordering, uint32_t mask_len,
+        int device_network_type):
+    """Swap index bits and reorder statevector elements on multiple devices.
+
+    Args:
+        handles: A host array of the library handles. It can be
+
+            - an :class:`int` as the pointer address to the array
+            - a Python sequence of :class:`int`, each of which is a valid
+              library handle
+
+        n_handles (uint32_t): The number of handles.
+        sub_svs: A host array of the sub-statevector pointers. It can be
+
+            - an :class:`int` as the pointer address to the array
+            - a Python sequence of :class:`int`, each of which is a valid
+              sub-statevector pointer (on device)
+
+        sv_data_type (cuquantum.cudaDataType): The data type of the statevectors.
+        n_global_index_bits (uint32_t): The number of the global index bits.
+        n_local_index_bits (uint32_t): The number of the local index bits.
+        swapped_bits: A host array of pairs of swapped index bits. It can be
+
+            - an :class:`int` as the pointer address to the nested sequence
+            - a nested Python sequence of swapped index bits
+
+        n_swapped_bits (uint32_t): The number of pairs of swapped index bits.
+        mask_bit_string: A host array for specifying mask values. It can be
+
+            - an :class:`int` as the pointer address to the array
+            - a Python sequence of mask values
+
+        mask_ordering: A host array of mask ordering. It can be
+
+            - an :class:`int` as the pointer address to the array
+            - a Python sequence of index bit ordering
+
+        mask_len (uint32_t): The length of ``mask_ordering``.
+        device_network_type (DeviceNetworkType): The device network topology.
+
+    Raises:
+        ValueError: If ``swapped_bits`` is neither a nested Python sequence
+            nor an :class:`int`.
+        AssertionError: If a nested ``swapped_bits`` does not hold exactly
+            ``2 * n_swapped_bits`` integers.
+
+    .. seealso:: `custatevecMultiDeviceSwapIndexBits`
+    """
+    # For every array-like argument below, a Python sequence is copied into a
+    # function-local container whose buffer is handed to the C API; the
+    # container must therefore stay alive until the C call has returned.
+
+    # handles can be a pointer address, or a Python sequence
+    cdef vector[intptr_t] handlesData
+    cdef _Handle* handlesPtr
+    if cpython.PySequence_Check(handles):
+        handlesData = handles
+        handlesPtr = <_Handle*>handlesData.data()
+    else:  # a pointer address
+        handlesPtr = <_Handle*><intptr_t>handles
+
+    # sub_svs can be a pointer address, or a Python sequence
+    cdef vector[intptr_t] subSVsData
+    cdef void** subSVsPtr
+    if cpython.PySequence_Check(sub_svs):
+        subSVsData = sub_svs
+        subSVsPtr = <void**>subSVsData.data()
+    else:  # a pointer address
+        subSVsPtr = <void**><intptr_t>sub_svs
+
+    # swapped_bits can be:
+    #   - a plain pointer address
+    #   - a nested Python sequence (ex: a list of 2-tuples)
+    # Note: it cannot be a mix of sequences and ints. It also cannot be a
+    # 1D sequence (of ints), because it's inefficient.
+    cdef int2* swappedBitsPtr
+    if is_nested_sequence(swapped_bits):
+        try:
+            # direct conversion: flatten the pairs into one int32 buffer
+            data = _numpy.asarray(swapped_bits, dtype=_numpy.int32)
+            data = data.reshape(-1)
+        except Exception:
+            # Unlikely, but fall back to copying pair by pair. Only ordinary
+            # errors are handled here so that KeyboardInterrupt/SystemExit
+            # still propagate.
+            data = _numpy.empty(2*n_swapped_bits, dtype=_numpy.int32)
+            for i, (first, second) in enumerate(swapped_bits):
+                data[2*i] = first
+                data[2*i+1] = second
+        # The C API is told to read n_swapped_bits pairs from this buffer, so
+        # the buffer must hold exactly that many.
+        assert data.size == 2*n_swapped_bits
+        # `data` is kept as a local so that the buffer behind this pointer
+        # outlives the C call below.
+        swappedBitsPtr = <int2*>(<intptr_t>data.ctypes.data)
+    elif isinstance(swapped_bits, int):
+        # a pointer address, take it as is
+        swappedBitsPtr = <int2*><intptr_t>swapped_bits
+    else:
+        raise ValueError("swapped_bits is provided in an "
+                         "un-recognized format")
+
+    # mask_bit_string can be a pointer address, or a Python sequence
+    cdef vector[int32_t] maskBitStringData
+    cdef int32_t* maskBitStringPtr
+    if cpython.PySequence_Check(mask_bit_string):
+        maskBitStringData = mask_bit_string
+        maskBitStringPtr = maskBitStringData.data()
+    else:  # a pointer address
+        maskBitStringPtr = <int32_t*><intptr_t>mask_bit_string
+
+    # mask_ordering can be a pointer address, or a Python sequence
+    cdef vector[int32_t] maskOrderingData
+    cdef int32_t* maskOrderingPtr
+    if cpython.PySequence_Check(mask_ordering):
+        maskOrderingData = mask_ordering
+        maskOrderingPtr = maskOrderingData.data()
+    else:  # a pointer address
+        maskOrderingPtr = <int32_t*><intptr_t>mask_ordering
+
+    # Only C-level values are touched from here on, so the GIL is released
+    # for the duration of the library call.
+    with nogil:
+        status = custatevecMultiDeviceSwapIndexBits(
+            handlesPtr, n_handles, subSVsPtr, <DataType>sv_data_type,
+            n_global_index_bits, n_local_index_bits,
+            swappedBitsPtr, n_swapped_bits,
+            maskBitStringPtr, maskOrderingPtr, mask_len,
+            <_DeviceNetworkType>device_network_type)
+    check_status(status)
+
+
 cpdef size_t test_matrix_type_get_workspace_size(
         intptr_t handle, int matrix_type,
         intptr_t matrix, int matrix_data_type, int layout, uint32_t n_targets,
@@ -1698,9 +1813,9 @@ cpdef size_t test_matrix_type_get_workspace_size(
 
     Args:
         handle (intptr_t): The library handle.
-        matrix_type (cuquantum.MatrixType): The matrix type of the gate matrix.
-        matrix (intptr_t): The pointer address (as Python :class:`int`) to a matrix
-            (on either host or device).
+        matrix_type (MatrixType): The matrix type of the gate matrix.
+        matrix (intptr_t): The pointer address (as Python :class:`int`) to a
+            matrix (on either host or device).
         matrix_data_type (cuquantum.cudaDataType): The data type of the matrix.
         layout (MatrixLayout): The memory layout of the matrix.
         n_targets (uint32_t): The length of ``targets``.
@@ -1733,9 +1848,9 @@ cpdef double test_matrix_type(
 
     Args:
         handle (intptr_t): The library handle.
-        matrix_type (cuquantum.MatrixType): The matrix type of the gate matrix.
-        matrix (intptr_t): The pointer address (as Python :class:`int`) to a matrix
-            (on either host or device).
+        matrix_type (MatrixType): The matrix type of the gate matrix.
+        matrix (intptr_t): The pointer address (as Python :class:`int`) to a
+            matrix (on either host or device).
         matrix_data_type (cuquantum.cudaDataType): The data type of the matrix.
         layout (MatrixLayout): The memory layout of the matrix.
         n_targets (uint32_t): The length of ``targets``.
@@ -1977,6 +2092,11 @@ class SamplerOutput(IntEnum):
     RANDNUM_ORDER = CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER
     ASCENDING_ORDER = CUSTATEVEC_SAMPLER_OUTPUT_ASCENDING_ORDER
 
+class DeviceNetworkType(IntEnum):
+    """See `custatevecDeviceNetworkType_t`."""
+    # Python-level mirror of the C enum `custatevecDeviceNetworkType_t`; each
+    # member takes the value of the corresponding
+    # CUSTATEVEC_DEVICE_NETWORK_TYPE_* constant declared in custatevec.pxd.
+    # Members are what multi_device_swap_index_bits() documents for its
+    # `device_network_type` argument (the device network topology).
+    SWITCH = CUSTATEVEC_DEVICE_NETWORK_TYPE_SWITCH
+    FULLMESH = CUSTATEVEC_DEVICE_NETWORK_TYPE_FULLMESH
+
 
 del IntEnum