
Commit 7251ce6

Merge remote-tracking branch 'origin/main' into fix_header
2 parents 95a9855 + 0be6fa9

File tree

4 files changed: +33, -10 lines

python/README.md

Lines changed: 2 additions & 1 deletion
@@ -71,7 +71,8 @@ Runtime dependencies of the cuQuantum Python package include:
 
 If you install everything from conda-forge, the dependencies are taken care for you (except for the driver).
 
-If you install the pip wheels, cuTENSOR and cuQuantum are installed for you (but not CUDA Toolkit or the driver).
+If you install the pip wheels, cuTENSOR and cuQuantum (but not CUDA Toolkit or the driver,
+please make sure the CUDA libraries are discoverable through your `LD_LIBRARY_PATH`) are installed for you.
 
 If you build cuQuantum Python from source, please make sure the paths to the cuQuantum and cuTENSOR libraries are added
 to your `LD_LIBRARY_PATH` environment variable.
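Editor's note: since the wheels leave CUDA library discovery to `LD_LIBRARY_PATH`, a quick sanity check is easy to script. The following is a minimal sketch, not part of this commit; the SONAMEs are assumptions for a CUDA 11.x / cuTENSOR 1.x setup, so adjust them to your installation.

# Sketch (not from this commit): check that the shared libraries the
# cuQuantum wheels dlopen are discoverable via LD_LIBRARY_PATH.
# The SONAMEs below are assumptions for CUDA 11.x / cuTENSOR 1.x.
import ctypes

for soname in ("libcudart.so.11.0", "libcutensor.so.1", "libcutensornet.so"):
    try:
        ctypes.CDLL(soname)  # dlopen honors LD_LIBRARY_PATH
        print(f"{soname}: loadable")
    except OSError as exc:
        print(f"{soname}: NOT loadable ({exc})")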

python/samples/tensornet_example.py

Lines changed: 10 additions & 0 deletions
@@ -208,6 +208,16 @@
 
 print("Contract the network, each slice uses the same contraction plan.")
 
+# recall that we set strides to null (0), so the data are in F-contiguous layout
+A_d = A_d.reshape(extentA, order='F')
+B_d = B_d.reshape(extentB, order='F')
+C_d = C_d.reshape(extentC, order='F')
+D_d = D_d.reshape(extentD, order='F')
+out = cp.einsum("mhkn,ukh,xuy->mxny", A_d, B_d, C_d)
+if not cp.allclose(out, D_d):
+    raise RuntimeError("result is incorrect")
+print("Check cuTensorNet result against that of cupy.einsum().")
+
 #######################################################
 
 flops_dtype = cutn.contraction_optimizer_info_get_attribute_dtype(
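Editor's note: the key subtlety in the added check is that a flat buffer written with null (0) strides is column-major, so it must be reshaped with order='F' before comparing against einsum. Here is the same pattern in isolation as a CPU-only NumPy sketch (toy extents of my choosing, not the sample's):

# CPU-only sketch of the verification pattern added above.
# Toy extents; the sample's real extents live in extentA..extentD.
import numpy as np

extentA, extentB, extentC = (2, 3, 4, 5), (6, 4, 3), (7, 6, 8)
A = np.random.rand(*extentA)
B = np.random.rand(*extentB)
C = np.random.rand(*extentC)
# Same network as the sample: D_{m,x,n,y} = A_{m,h,k,n} B_{u,k,h} C_{x,u,y}
D = np.einsum("mhkn,ukh,xuy->mxny", A, B, C)

# Emulate a library that wrote D into a flat buffer in column-major
# ("null strides") order...
flat = D.ravel(order='F')
# ...then recover it the way the sample does: reshape with order='F'.
D_rt = flat.reshape(D.shape, order='F')
assert np.allclose(D_rt, D)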

python/setup.py

Lines changed: 1 addition & 0 deletions
@@ -96,6 +96,7 @@
     'numpy',
     # 'cupy', # <-- can't be listed here as on PyPI this is the name for source build, not for wheel
     # 'torch', # <-- PyTorch is optional; also, it does not live on PyPI...
+    'typing_extensions',
 ]
 ignore_cuquantum_dep = bool(os.environ.get('CUQUANTUM_IGNORE_SOLVER', False))
 if not ignore_cuquantum_dep:
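Editor's note on the `CUQUANTUM_IGNORE_SOLVER` context line (unchanged by this commit): `bool()` over `os.environ.get()` treats any non-empty string as true, so setting the variable to '0' or 'false' still skips the solver dependency. A quick illustration:

import os

os.environ['CUQUANTUM_IGNORE_SOLVER'] = '0'
# Any non-empty string is truthy, including '0' and 'false'.
print(bool(os.environ.get('CUQUANTUM_IGNORE_SOLVER', False)))  # True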

samples/cutensornet/tensornet_example.cu

Lines changed: 20 additions & 9 deletions
@@ -4,6 +4,7 @@
  * SPDX-License-Identifier: BSD-3-Clause
  */
 
+// Sphinx: #1
 #include <stdlib.h>
 #include <stdio.h>
 
@@ -29,11 +30,10 @@ if( err != CUTENSORNET_STATUS_SUCCESS ) \
 
 struct GPUTimer
 {
-   GPUTimer()
+   GPUTimer(cudaStream_t stream): stream_(stream)
    {
       cudaEventCreate(&start_);
       cudaEventCreate(&stop_);
-      cudaEventRecord(start_, 0);
    }
 
    ~GPUTimer()
@@ -44,19 +44,21 @@ struct GPUTimer
 
    void start()
    {
-      cudaEventRecord(start_, 0);
+      cudaEventRecord(start_, stream_);
    }
 
    float seconds()
    {
-      cudaEventRecord(stop_, 0);
+      cudaEventRecord(stop_, stream_);
       cudaEventSynchronize(stop_);
       float time;
       cudaEventElapsedTime(&time, start_, stop_);
       return time * 1e-3;
    }
+
    private:
    cudaEvent_t start_, stop_;
+   cudaStream_t stream_;
 };
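Editor's note: the GPUTimer rework records the start/stop events on an explicit stream instead of the legacy default stream 0, so the timer measures only work submitted to that stream (and the constructor no longer records a stray start event). For readers following along from the Python side, a rough CuPy equivalent of the patched timer (a sketch of mine, not part of this commit):

import cupy as cp

class GPUTimer:
    """Sketch of the patched C++ GPUTimer: events recorded on one stream."""
    def __init__(self, stream):
        self.stream = stream
        self.start_ = cp.cuda.Event()
        self.stop_ = cp.cuda.Event()

    def start(self):
        self.start_.record(self.stream)

    def seconds(self):
        self.stop_.record(self.stream)
        self.stop_.synchronize()
        # get_elapsed_time returns milliseconds, like cudaEventElapsedTime
        return cp.cuda.get_elapsed_time(self.start_, self.stop_) * 1e-3

Usage mirrors the C++ side: construct with a stream, call start() before launching work on that stream, and seconds() afterwards.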

@@ -80,12 +82,12 @@ int main()
    printf("========================\n");
 
    typedef float floatType;
-
    cudaDataType_t typeData = CUDA_R_32F;
    cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;
 
    printf("Include headers and define data types\n");
 
+   // Sphinx: #2
    /**********************
    * Computing: D_{m,x,n,y} = A_{m,h,k,n} B_{u,k,h} C_{x,u,y}
    **********************/
@@ -124,6 +126,7 @@ int main()
 
    printf("Define network, modes, and extents\n");
 
+   // Sphinx: #3
    /**********************
    * Allocating data
    **********************/
@@ -182,18 +185,20 @@ int main()
    *******************/
 
    for (uint64_t i = 0; i < elementsA; i++)
-      A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
+      A[i] = ((float) rand())/RAND_MAX;
    for (uint64_t i = 0; i < elementsB; i++)
-      B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
+      B[i] = ((float) rand())/RAND_MAX;
    for (uint64_t i = 0; i < elementsC; i++)
-      C[i] = (((float) rand())/RAND_MAX - 0.5)*100;
+      C[i] = ((float) rand())/RAND_MAX;
+   memset(D, 0, sizeof(floatType) * elementsD);
 
    HANDLE_CUDA_ERROR(cudaMemcpy(rawDataIn_d[0], A, sizeA, cudaMemcpyHostToDevice));
    HANDLE_CUDA_ERROR(cudaMemcpy(rawDataIn_d[1], B, sizeB, cudaMemcpyHostToDevice));
    HANDLE_CUDA_ERROR(cudaMemcpy(rawDataIn_d[2], C, sizeC, cudaMemcpyHostToDevice));
 
    printf("Allocate memory for data and workspace, and initialize data.\n");
 
+   // Sphinx: #4
    /*************************
    * cuTensorNet
    *************************/
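Editor's note: shrinking the random inputs from roughly [-50, 50) to [0, 1) presumably supports the correctness check added to the Python sample above; with O(1), all-positive inputs the contraction suffers no catastrophic cancellation, so a single-precision result stays within allclose()-style tolerances. A toy NumPy demonstration of the effect (my example, not from the commit):

# Mixed-sign, large-magnitude inputs produce near-zero outputs whose
# relative error blows up; [0, 1) inputs do not.
import numpy as np

rng = np.random.default_rng(0)
for lo, hi in ((-50.0, 50.0), (0.0, 1.0)):
    a = rng.uniform(lo, hi, (64, 64)).astype(np.float32)
    b = rng.uniform(lo, hi, (64, 64)).astype(np.float32)
    exact = a.astype(np.float64) @ b.astype(np.float64)
    approx = (a @ b).astype(np.float64)
    rel = np.abs(approx - exact) / np.abs(exact)
    print((lo, hi), "max relative error:", rel.max())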
@@ -247,6 +252,7 @@ int main()
 
    printf("Initialize the cuTensorNet library and create a network descriptor.\n");
 
+   // Sphinx: #5
    /*******************************
    * Find "optimal" contraction order and slicing
    *******************************/
@@ -284,6 +290,7 @@ int main()
 
    printf("Find an optimized contraction path with cuTensorNet optimizer.\n");
 
+   // Sphinx: #6
    /*******************************
    * Initialize all pair-wise contraction plans (for cuTENSOR)
    *******************************/
@@ -349,10 +356,11 @@ int main()
 
    printf("Create a contraction plan for cuTENSOR and optionally auto-tune it.\n");
 
+   // Sphinx: #7
    /**********************
    * Run
    **********************/
-   GPUTimer timer;
+   GPUTimer timer{stream};
    double minTimeCUTENSOR = 1e100;
    const int numRuns = 3; // to get stable perf results
    for (int i=0; i < numRuns; ++i)
@@ -364,6 +372,9 @@ int main()
       * Contract over all slices.
       *
       * A user may choose to parallelize this loop across multiple devices.
+      * (Note, however, that as of cuTensorNet v1.0.0 the contraction must
+      * start from slice 0, see the cutensornetContraction documentation at
+      * https://docs.nvidia.com/cuda/cuquantum/cutensornet/api/functions.html#cutensornetcontraction )
       */
       for(int64_t sliceId=0; sliceId < numSlices; ++sliceId)
       {
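Editor's note: the new comment records a real constraint on parallelizing the slice loop: as of cuTensorNet v1.0.0, the contraction must start from slice 0. A sketch of a contiguous partitioning consistent with that constraint, where device 0 owns the block beginning at slice 0 (the function and its names are illustrative, not from the sample):

# Sketch: split num_slices into contiguous per-device blocks so that the
# block containing slice 0 is processed first, per the v1.0.0 note above.
def partition_slices(num_slices, num_devices):
    base, rem = divmod(num_slices, num_devices)
    ranges, start = [], 0
    for d in range(num_devices):
        count = base + (1 if d < rem else 0)
        ranges.append(range(start, start + count))
        start += count
    return ranges

print(partition_slices(10, 3))  # [range(0, 4), range(4, 7), range(7, 10)]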
