examples/cython/sycl_buffer/README.md (6 changes: 3 additions & 3 deletions)
@@ -21,9 +21,9 @@ CC=clang CXX=dpcpp python setup.py build_ext --inplace
 #2 Running
 
 ```
-# SYCL_BE=PI_OPENCL sets SYCL backend to OpenCL to avoid a
+# SYCL_DEVICE_FILTER=opencl sets SYCL backend to OpenCL to avoid a
 # transient issue with MKL's using the default Level-0 backend
-(idp) [08:16:12 ansatnuc04 simple]$ SYCL_BE=PI_OPENCL ipython
+(idp) [08:16:12 ansatnuc04 simple]$ SYCL_DEVICE_FILTER=opencl ipython
 Python 3.7.7 (default, Jul 14 2020, 22:02:37)
 Type 'copyright', 'credits' or 'license' for more information
 IPython 7.17.0 -- An enhanced Interactive Python. Type '?' for help.
@@ -67,7 +67,7 @@ Times for NumPy
 Running run.py:
 
 ```
-(idp) [09:14:53 ansatnuc04 sycl_buffer]$ SYCL_BE=PI_OPENCL python run.py
+(idp) [09:14:53 ansatnuc04 sycl_buffer]$ SYCL_DEVICE_FILTER=opencl python run.py
 Result computed by NumPy
 [ 0.27170187 -23.36798583 7.31326489 -1.95121928]
 Result computed by SYCL extension
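
The session above selects the backend through `SYCL_DEVICE_FILTER`; with this PR the extension can also be handed a queue explicitly. A minimal sketch, assuming the rebuilt `syclbuffer` extension and a dpctl build that provides `SyclQueue` (the `"opencl:cpu"` filter string is an assumption):

```python
# Sketch (not part of the diff): exercise the updated example with an explicit
# queue instead of relying on SYCL_DEVICE_FILTER alone. The "opencl:cpu"
# filter string is an assumption; use any device available on the machine.
import numpy as np

import dpctl
import syclbuffer as sb

X = np.random.randn(100, 4)
q = dpctl.SyclQueue("opencl:cpu")
print(sb.columnwise_total(X, queue=q))  # expected to agree with X.sum(axis=0)
```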
examples/cython/sycl_buffer/_buffer_example.pyx (9 changes: 7 additions & 2 deletions)
@@ -24,14 +24,19 @@ cdef extern from "use_sycl_buffer.h":
     int c_columnwise_total(c_dpctl.DPCTLSyclQueueRef q, size_t n, size_t m, double *m, double *ct) nogil
     int c_columnwise_total_no_mkl(c_dpctl.DPCTLSyclQueueRef q, size_t n, size_t m, double *m, double *ct) nogil
 
-def columnwise_total(double[:, ::1] v, method='mkl'):
+def columnwise_total(double[:, ::1] v, method='mkl', queue=None):
     cdef cnp.ndarray res_array = np.empty((v.shape[1],), dtype='d')
     cdef double[::1] res_memslice = res_array
     cdef int ret_status
     cdef c_dpctl.SyclQueue q
     cdef c_dpctl.DPCTLSyclQueueRef q_ref
 
-    q = c_dpctl.get_current_queue()
+    if (queue is None):
+        q = c_dpctl.SyclQueue()
+    elif isinstance(queue, dpctl.SyclQueue):
+        q = <c_dpctl.SyclQueue> queue
+    else:
+        q = c_dpctl.SyclQueue(queue)
     q_ref = q.get_queue_ref()
 
     if method == 'mkl':
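
For reference, the rewritten `columnwise_total` accepts the new `queue` keyword in three forms. A short usage sketch, assuming the extension has been rebuilt from this `.pyx` (the `"opencl:gpu"` filter string is an assumption):

```python
# Sketch of the three accepted forms of the new `queue` keyword; the
# "opencl:gpu" filter string is an assumption and may need adjusting.
import numpy as np

import dpctl
import syclbuffer as sb

X = np.random.randn(10, 4)

sb.columnwise_total(X)                           # queue=None: a default-constructed SyclQueue
sb.columnwise_total(X, queue=dpctl.SyclQueue())  # an existing dpctl.SyclQueue is used as-is
sb.columnwise_total(X, queue="opencl:gpu")       # anything else is forwarded to dpctl.SyclQueue(...)
```

Passing the queue explicitly replaces the previous reliance on dpctl's global current queue (`c_dpctl.get_current_queue()`), which this PR removes from the example.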
examples/cython/sycl_buffer/bench.py (22 changes: 11 additions & 11 deletions)
@@ -24,19 +24,19 @@
 print("=" * 10 + " Executing warm-up " + "=" * 10)
 print("NumPy result: ", X.sum(axis=0))
 
-dpctl.set_global_queue("opencl:cpu")
+q = dpctl.SyclQueue("opencl:cpu")
 print(
     "SYCL({}) result: {}".format(
-        dpctl.get_current_queue().sycl_device.name,
-        sb.columnwise_total(X),
+        q.sycl_device.name,
+        sb.columnwise_total(X, queue=q),
     )
 )
 
-dpctl.set_default_queue("opencl:gpu")
+q = dpctl.SyclQueue("opencl:gpu")
 print(
     "SYCL({}) result: {}".format(
-        dpctl.get_current_queue().sycl_device.name,
-        sb.columnwise_total(X),
+        q.sycl_device.name,
+        sb.columnwise_total(X, queue=q),
     )
 )
 
@@ -45,9 +45,9 @@
 print("Times for 'opencl:cpu'")
 print(
     timeit.repeat(
-        stmt="sb.columnwise_total(X)",
-        setup='dpctl.set_global_queue("opencl:cpu"); '
-        "sb.columnwise_total(X)",  # ensure JIT compilation is not counted
+        stmt="sb.columnwise_total(X, queue=q)",
+        setup='q = dpctl.SyclQueue("opencl:cpu"); '
+        "sb.columnwise_total(X, queue=q)",  # ensure JIT compilation is not counted
         number=100,
         globals=globals(),
     )
@@ -56,8 +56,8 @@
 print("Times for 'opencl:gpu'")
 print(
     timeit.repeat(
-        stmt="sb.columnwise_total(X)",
-        setup='dpctl.set_default_queue("opencl:gpu"); sb.columnwise_total(X)',
+        stmt="sb.columnwise_total(X, queue=q)",
+        setup='q = dpctl.SyclQueue("opencl:gpu"); sb.columnwise_total(X, queue=q)',
         number=100,
         globals=globals(),
     )
examples/cython/sycl_buffer/run.py (17 changes: 9 additions & 8 deletions)
@@ -16,23 +16,24 @@
 
 import syclbuffer as sb
 import numpy as np
+import dpctl
 
 X = np.random.randn(100, 4)
 
 print("Result computed by NumPy")
 print(X.sum(axis=0))
-print("Result computed by SYCL extension")
+print("Result computed by SYCL extension using default offloading target")
 print(sb.columnwise_total(X))
 
 
 print("")
 
 # controlling where to offload
-import dpctl
 
-with dpctl.device_context("opencl:gpu"):
-    print("Running on: ", dpctl.get_current_queue().sycl_device.name)
-    print(sb.columnwise_total(X))
+q = dpctl.SyclQueue("opencl:gpu")
+print("Running on: ", q.sycl_device.name)
+print(sb.columnwise_total(X, queue=q))
 
-with dpctl.device_context("opencl:cpu"):
-    print("Running on: ", dpctl.get_current_queue().sycl_device.name)
-    print(sb.columnwise_total(X))
+q = dpctl.SyclQueue("opencl:cpu")
+print("Running on: ", q.sycl_device.name)
+print(sb.columnwise_total(X, queue=q))
examples/cython/sycl_direct_linkage/README.md (4 changes: 2 additions & 2 deletions)
@@ -26,7 +26,7 @@ To illustrate the queue creation overhead in each call, compare execution of def
 which is Intel Gen9 GPU on OpenCL backend:
 
 ```
-(idp) [11:24:38 ansatnuc04 sycl_direct_linkage]$ SYCL_BE=PI_OPENCL python bench.py
+(idp) [11:24:38 ansatnuc04 sycl_direct_linkage]$ SYCL_DEVICE_FILTER=opencl:gpu python bench.py
 ========== Executing warm-up ==========
 NumPy result: [1. 1. 1. ... 1. 1. 1.]
 SYCL(default_device) result: [1. 1. 1. ... 1. 1. 1.]
@@ -37,7 +37,7 @@ Times for NumPy
 [3.5394036192446947, 3.498957809060812, 3.4925728561356664, 3.5036555202677846, 3.493739523924887]
 ```
 
-vs. timing when `dpctl`'s current queue is being reused:
+vs. timing when `dpctl`'s queue is being reused:
 
 ```
 (idp) [11:29:14 ansatnuc04 sycl_buffer]$ python bench.py
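
The overhead this README is comparing is the cost of constructing a SYCL queue inside every call versus reusing a queue created once by the caller. A quick way to see that construction cost in isolation; this is a sketch, not part of the example's sources:

```python
# Sketch: time SyclQueue construction by itself to see the per-call overhead
# the comparison above is pointing at. Absolute numbers depend on the system,
# driver, and backend; nothing here is taken from the PR.
import timeit

import dpctl

n_calls = 100
total = timeit.timeit("dpctl.SyclQueue()", globals=globals(), number=n_calls)
print("Average SyclQueue construction time: {:.6f} s".format(total / n_calls))
```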
examples/cython/usm_memory/blackscholes.pyx (23 changes: 17 additions & 6 deletions)
@@ -29,7 +29,18 @@ cdef extern from "sycl_blackscholes.hpp":
     cdef void cpp_blackscholes[T](c_dpctl.DPCTLSyclQueueRef, size_t n_opts, T* option_params, T* callput) except +
     cdef void cpp_populate_params[T](c_dpctl.DPCTLSyclQueueRef, size_t n_opts, T* option_params, T pl, T ph, T sl, T sh, T tl, T th, T rl, T rh, T vl, T vh, int seed) except +
 
-def black_scholes_price(floating[:, ::1] option_params):
+cdef c_dpctl.SyclQueue from_queue_keyword(queue):
+    if (queue is None):
+        return c_dpctl.SyclQueue()
+    elif isinstance(queue, dpctl.SyclQueue):
+        return <c_dpctl.SyclQueue> queue
+    else:
+        return c_dpctl.SyclQueue(queue)
+    # use default
+    return c_dpctl.SyclQueue()
+
+
+def black_scholes_price(floating[:, ::1] option_params, queue=None):
     cdef size_t n_opts = option_params.shape[0]
     cdef size_t n_params = option_params.shape[1]
     cdef size_t n_bytes = 0
@@ -49,19 +60,19 @@ def black_scholes_price(floating[:, ::1] option_params):
             "Each row must specify (current_price, strike_price, maturity, interest_rate, volatility)."
         ).format(n_params))
 
-    q = c_dpctl.get_current_queue()
+    q = from_queue_keyword(queue)
     q_ptr = q.get_queue_ref()
     if (floating is double):
         n_bytes = 2*n_opts * sizeof(double)
-        mobj = c_dpctl_mem.MemoryUSMShared(n_bytes)
+        mobj = c_dpctl_mem.MemoryUSMShared(n_bytes, queue=q)
         callput_arr = np.ndarray((n_opts, 2), buffer=mobj, dtype='d')
         call_put_prices = callput_arr
         dp1 = &option_params[0,0]
         dp2 = &call_put_prices[0,0];
         cpp_blackscholes[double](q_ptr, n_opts, dp1, dp2)
     elif (floating is float):
         n_bytes = 2*n_opts * sizeof(float)
-        mobj = c_dpctl_mem.MemoryUSMShared(n_bytes)
+        mobj = c_dpctl_mem.MemoryUSMShared(n_bytes, queue=q)
         callput_arr = np.ndarray((n_opts, 2), buffer=mobj, dtype='f')
         call_put_prices = callput_arr
         fp1 = &option_params[0,0]
@@ -70,7 +81,7 @@
 
     return callput_arr
 
-def populate_params(floating[:, ::1] option_params, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, int seed):
+def populate_params(floating[:, ::1] option_params, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, int seed, queue=None):
     cdef size_t n_opts = option_params.shape[0]
     cdef size_t n_params = option_params.shape[1]
 
@@ -85,7 +96,7 @@ def populate_params(floating[:, ::1] option_params, pl, ph, sl, sh, tl, th, rl,
             "Each row must specify (current_price, strike_price, maturity, interest_rate, volatility)."
         ).format(n_params))
 
-    q = c_dpctl.get_current_queue()
+    q = from_queue_keyword(queue)
     q_ptr = q.get_queue_ref()
     if (floating is double):
         dp = &option_params[0,0]
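
A short usage sketch for the updated entry points. The extension module name, the problem size, and the `"opencl:gpu"` filter string are assumptions; the extension needs to be rebuilt from this `.pyx` first:

```python
# Sketch: drive the updated Cython entry points with an explicit queue.
import numpy as np

import dpctl
import dpctl.memory as dpctl_mem
import blackscholes_usm as bs  # assumed name of the built extension

q = dpctl.SyclQueue("opencl:gpu")
n_opts = 1000

# allocate USM-shared memory for the (n_opts, 5) option parameters on the chosen queue
usm_mem = dpctl_mem.MemoryUSMShared(n_opts * 5 * np.dtype("d").itemsize, queue=q)
opts = np.ndarray((n_opts, 5), buffer=usm_mem, dtype="d")

# populate (price, strike, maturity, rate, volatility) in place, then price on the same queue
bs.populate_params(opts, 20.0, 30.0, 22.0, 29.0, 18.0, 24.0, 0.01, 0.05, 0.01, 0.05, 1234, queue=q)
prices = bs.black_scholes_price(opts, queue=q)  # one (call, put) pair per option
```

As in the updated `run.py`, allocating the USM buffer and launching the kernels on the same queue keeps the data and the computation on one device.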
examples/cython/usm_memory/run.py (54 changes: 32 additions & 22 deletions)
@@ -21,12 +21,16 @@
 from reference_black_scholes import ref_python_black_scholes
 
 
-def gen_option_params(n_opts, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, dtype):
-    usm_mem = dpctl_mem.MemoryUSMShared(n_opts * 5 * np.dtype(dtype).itemsize)
-    # usm_mem2 = dpctl_mem.MemoryUSMDevice(n_opts * 5 * np.dtype(dtype).itemsize)
+def gen_option_params(
+    n_opts, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, dtype, queue=None
+):
+    nbytes = n_opts * 5 * np.dtype(dtype).itemsize
+    usm_mem = dpctl_mem.MemoryUSMShared(nbytes, queue=queue)
     params = np.ndarray(shape=(n_opts, 5), buffer=usm_mem, dtype=dtype)
     seed = 1234
-    bs.populate_params(params, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed)
+    bs.populate_params(
+        params, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed, queue=queue
+    )
     return params
 
 
@@ -47,38 +51,44 @@ def gen_option_params(n_opts, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, dtype):
 # compute prices in CPython
 X_ref = np.array([ref_python_black_scholes(*opt) for opt in opts], dtype="d")
 
-print(np.allclose(Xgpu, X_ref, atol=1e-5))
+print(
+    "Correctness check: allclose(Xgpu, Xref) == ", np.allclose(Xgpu, X_ref, atol=1e-5)
+)
 
 n_opts = 3 * 10 ** 6
 
 # compute on CPU sycl device
 import timeit
 
-for _ in range(3):
+cpu_q = dpctl.SyclQueue("opencl:cpu:0")
+opts1 = gen_option_params(
+    n_opts, 20.0, 30.0, 22.0, 29.0, 18.0, 24.0, 0.01, 0.05, 0.01, 0.05, "d", queue=cpu_q
+)
+
+gpu_q = dpctl.SyclQueue("level_zero:gpu:0")
+opts2 = gen_option_params(
+    n_opts, 20.0, 30.0, 22.0, 29.0, 18.0, 24.0, 0.01, 0.05, 0.01, 0.05, "d", queue=gpu_q
+)
 
-    dpctl.set_global_queue("opencl:cpu:0")
-    print("Using : {}".format(dpctl.get_current_queue().sycl_device.name))
+cpu_times = []
+gpu_times = []
+for _ in range(5):
 
     t0 = timeit.default_timer()
-    opts1 = gen_option_params(
-        n_opts, 20.0, 30.0, 22.0, 29.0, 18.0, 24.0, 0.01, 0.05, 0.01, 0.05, "d"
-    )
-    X1 = bs.black_scholes_price(opts1)
+    X1 = bs.black_scholes_price(opts1, queue=cpu_q)
     t1 = timeit.default_timer()
 
-    print("Elapsed: {}".format(t1 - t0))
+    cpu_times.append(t1 - t0)
 
     # compute on GPU sycl device
-    dpctl.set_global_queue("level_zero:gpu:0")
-    print("Using : {}".format(dpctl.get_current_queue().sycl_device.name))
 
     t0 = timeit.default_timer()
-    opts2 = gen_option_params(
-        n_opts, 20.0, 30.0, 22.0, 29.0, 18.0, 24.0, 0.01, 0.05, 0.01, 0.05, "d"
-    )
-    X2 = bs.black_scholes_price(opts2)
+    X2 = bs.black_scholes_price(opts2, queue=gpu_q)
     t1 = timeit.default_timer()
-    print("Elapsed: {}".format(t1 - t0))
+    gpu_times.append(t1 - t0)
 
-print(np.abs(opts1 - opts2).max())
-print(np.abs(X2 - X1).max())
+print("Using : {}".format(cpu_q.sycl_device.name))
+print("Wall times : {}".format(cpu_times))
+
+print("Using : {}".format(gpu_q.sycl_device.name))
+print("Wall times : {}".format(gpu_times))
examples/python/_runner.py (83 changes: 83 additions & 0 deletions, new file)
@@ -0,0 +1,83 @@
# Data Parallel Control (dpctl)
#
# Copyright 2020-2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import inspect


def has_nondefault_params(sgn):
    for v in sgn.parameters.values():
        if v.default is inspect._empty:
            return True
    return False


def run_examples(example_description, glbls_dict):
    parser = argparse.ArgumentParser(
        description=example_description,
    )
    parser.add_argument(
        "-r",
        "--run",
        type=str,
        help="Functions to execute. Use --run all to run all of them.",
    )
    parser.add_argument(
        "-l", "--list", action="store_true", help="List available function names to run"
    )
    parser.add_argument(
        "-q", "--quiet", action="store_true", help="Do not echo example name."
    )
    args = parser.parse_args()

    if args.list or not args.run:
        fns = []
        for n in glbls_dict:
            if inspect.isfunction(glbls_dict.get(n)):
                fns.append(n)
        if fns:
            print("Available examples:")
            print(", ".join(fns))
        else:
print("No examples are availble.")
        exit(0)
    if args.run == "all":
        fns = []
        for n in glbls_dict:
            if inspect.isfunction(glbls_dict.get(n)):
                fns.append(n)
        args.run = fns
    else:
        args.run = args.run.split()

    if args.run:
        for fn in args.run:
            if fn in glbls_dict:
                clbl = glbls_dict.get(fn)
                sgn = inspect.signature(clbl)
                print("")
                if has_nondefault_params(sgn):
                    if not args.quiet:
print(f"INFO: Skip exectution of {fn} as it requires arguments")
                else:
                    if not args.quiet:
                        print(f"INFO: Executing example {fn}")
                    clbl()
                    if not args.quiet:
                        print("INFO: ===========================")

    else:
        raise ValueError("No function to run was specified")
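
This helper is meant to be imported by the scripts under `examples/python` so each of them exposes the same `--list`/`--run`/`--quiet` CLI. A minimal sketch of how an example module might use it; the module name and the example function are illustrative, not part of this PR:

```python
# file: examples/python/some_example.py (illustrative name, not part of this PR)
# Example functions take no required arguments; _runner turns them into a CLI
# driven by --list / --run / --quiet.
import dpctl


def create_default_queue():
    "Creates a queue on the default-selected device and prints its name."
    q = dpctl.SyclQueue()
    print("Default device:", q.sycl_device.name)


if __name__ == "__main__":
    import _runner as runner

    runner.run_examples("Queue creation examples for dpctl.", globals())
```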