examples/cython/sycl_buffer/README.md (6 changes: 3 additions & 3 deletions)
@@ -21,9 +21,9 @@ CC=clang CXX=dpcpp python setup.py build_ext --inplace
 #2 Running
 
 ```
-# SYCL_BE=PI_OPENCL sets SYCL backend to OpenCL to avoid a
+# SYCL_DEVICE_FILTER=opencl sets SYCL backend to OpenCL to avoid a
 # transient issue with MKL's using the default Level-0 backend
-(idp) [08:16:12 ansatnuc04 simple]$ SYCL_BE=PI_OPENCL ipython
+(idp) [08:16:12 ansatnuc04 simple]$ SYCL_DEVICE_FILTER=opencl ipython
 Python 3.7.7 (default, Jul 14 2020, 22:02:37)
 Type 'copyright', 'credits' or 'license' for more information
 IPython 7.17.0 -- An enhanced Interactive Python. Type '?' for help.
@@ -67,7 +67,7 @@ Times for NumPy
 Running run.py:
 
 ```
-(idp) [09:14:53 ansatnuc04 sycl_buffer]$ SYCL_BE=PI_OPENCL python run.py
+(idp) [09:14:53 ansatnuc04 sycl_buffer]$ SYCL_DEVICE_FILTER=opencl python run.py
 Result computed by NumPy
 [ 0.27170187 -23.36798583 7.31326489 -1.95121928]
 Result computed by SYCL extension
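
The session above selects the backend through `SYCL_DEVICE_FILTER`; with this PR the extension can also be handed a queue explicitly. A minimal sketch, assuming the rebuilt `syclbuffer` extension and a dpctl build that provides `SyclQueue` (the `"opencl:cpu"` filter string is an assumption):

```python
# Sketch (not part of the diff): exercise the updated example with an explicit
# queue instead of relying on SYCL_DEVICE_FILTER alone. The "opencl:cpu"
# filter string is an assumption; use any device available on the machine.
import numpy as np

import dpctl
import syclbuffer as sb

X = np.random.randn(100, 4)
q = dpctl.SyclQueue("opencl:cpu")
print(sb.columnwise_total(X, queue=q))  # expected to agree with X.sum(axis=0)
```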
examples/cython/sycl_buffer/_buffer_example.pyx (9 changes: 7 additions & 2 deletions)
@@ -24,14 +24,19 @@ cdef extern from "use_sycl_buffer.h":
     int c_columnwise_total(c_dpctl.DPCTLSyclQueueRef q, size_t n, size_t m, double *m, double *ct) nogil
     int c_columnwise_total_no_mkl(c_dpctl.DPCTLSyclQueueRef q, size_t n, size_t m, double *m, double *ct) nogil
 
-def columnwise_total(double[:, ::1] v, method='mkl'):
+def columnwise_total(double[:, ::1] v, method='mkl', queue=None):
     cdef cnp.ndarray res_array = np.empty((v.shape[1],), dtype='d')
     cdef double[::1] res_memslice = res_array
     cdef int ret_status
     cdef c_dpctl.SyclQueue q
     cdef c_dpctl.DPCTLSyclQueueRef q_ref
 
-    q = c_dpctl.get_current_queue()
+    if (queue is None):
+        q = c_dpctl.SyclQueue()
+    elif isinstance(queue, dpctl.SyclQueue):
+        q = <c_dpctl.SyclQueue> queue
+    else:
+        q = c_dpctl.SyclQueue(queue)
     q_ref = q.get_queue_ref()
 
     if method == 'mkl':
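
For reference, the rewritten `columnwise_total` accepts the new `queue` keyword in three forms. A short usage sketch, assuming the extension has been rebuilt from this `.pyx` (the `"opencl:gpu"` filter string is an assumption):

```python
# Sketch of the three accepted forms of the new `queue` keyword; the
# "opencl:gpu" filter string is an assumption and may need adjusting.
import numpy as np

import dpctl
import syclbuffer as sb

X = np.random.randn(10, 4)

sb.columnwise_total(X)                           # queue=None: a default-constructed SyclQueue
sb.columnwise_total(X, queue=dpctl.SyclQueue())  # an existing dpctl.SyclQueue is used as-is
sb.columnwise_total(X, queue="opencl:gpu")       # anything else is forwarded to dpctl.SyclQueue(...)
```

Passing the queue explicitly replaces the previous reliance on dpctl's global current queue (`c_dpctl.get_current_queue()`), which this PR removes from the example.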
examples/cython/sycl_buffer/bench.py (22 changes: 11 additions & 11 deletions)
@@ -24,19 +24,19 @@
 print("=" * 10 + " Executing warm-up " + "=" * 10)
 print("NumPy result: ", X.sum(axis=0))
 
-dpctl.set_global_queue("opencl:cpu")
+q = dpctl.SyclQueue("opencl:cpu")
 print(
     "SYCL({}) result: {}".format(
-        dpctl.get_current_queue().sycl_device.name,
-        sb.columnwise_total(X),
+        q.sycl_device.name,
+        sb.columnwise_total(X, queue=q),
     )
 )
 
-dpctl.set_default_queue("opencl:gpu")
+q = dpctl.SyclQueue("opencl:gpu")
 print(
     "SYCL({}) result: {}".format(
-        dpctl.get_current_queue().sycl_device.name,
-        sb.columnwise_total(X),
+        q.sycl_device.name,
+        sb.columnwise_total(X, queue=q),
     )
 )
 
@@ -45,9 +45,9 @@
 print("Times for 'opencl:cpu'")
 print(
     timeit.repeat(
-        stmt="sb.columnwise_total(X)",
-        setup='dpctl.set_global_queue("opencl:cpu"); '
-        "sb.columnwise_total(X)",  # ensure JIT compilation is not counted
+        stmt="sb.columnwise_total(X, queue=q)",
+        setup='q = dpctl.SyclQueue("opencl:cpu"); '
+        "sb.columnwise_total(X, queue=q)",  # ensure JIT compilation is not counted
         number=100,
         globals=globals(),
     )
@@ -56,8 +56,8 @@
 print("Times for 'opencl:gpu'")
 print(
     timeit.repeat(
-        stmt="sb.columnwise_total(X)",
-        setup='dpctl.set_default_queue("opencl:gpu"); sb.columnwise_total(X)',
+        stmt="sb.columnwise_total(X, queue=q)",
+        setup='q = dpctl.SyclQueue("opencl:gpu"); sb.columnwise_total(X, queue=q)',
         number=100,
         globals=globals(),
     )
examples/cython/sycl_buffer/run.py (17 changes: 9 additions & 8 deletions)
@@ -16,23 +16,24 @@
 
 import syclbuffer as sb
 import numpy as np
+import dpctl
 
 X = np.random.randn(100, 4)
 
 print("Result computed by NumPy")
 print(X.sum(axis=0))
-print("Result computed by SYCL extension")
+print("Result computed by SYCL extension using default offloading target")
 print(sb.columnwise_total(X))
 
 
 print("")
 
 # controlling where to offload
-import dpctl
 
-with dpctl.device_context("opencl:gpu"):
-    print("Running on: ", dpctl.get_current_queue().sycl_device.name)
-    print(sb.columnwise_total(X))
+q = dpctl.SyclQueue("opencl:gpu")
+print("Running on: ", q.sycl_device.name)
+print(sb.columnwise_total(X, queue=q))
 
-with dpctl.device_context("opencl:cpu"):
-    print("Running on: ", dpctl.get_current_queue().sycl_device.name)
-    print(sb.columnwise_total(X))
+q = dpctl.SyclQueue("opencl:cpu")
+print("Running on: ", q.sycl_device.name)
+print(sb.columnwise_total(X, queue=q))
examples/cython/sycl_direct_linkage/README.md (4 changes: 2 additions & 2 deletions)
@@ -26,7 +26,7 @@ To illustrate the queue creation overhead in each call, compare execution of def
 which is Intel Gen9 GPU on OpenCL backend:
 
 ```
-(idp) [11:24:38 ansatnuc04 sycl_direct_linkage]$ SYCL_BE=PI_OPENCL python bench.py
+(idp) [11:24:38 ansatnuc04 sycl_direct_linkage]$ SYCL_DEVICE_FILTER=opencl:gpu python bench.py
 ========== Executing warm-up ==========
 NumPy result: [1. 1. 1. ... 1. 1. 1.]
 SYCL(default_device) result: [1. 1. 1. ... 1. 1. 1.]
@@ -37,7 +37,7 @@ Times for NumPy
 [3.5394036192446947, 3.498957809060812, 3.4925728561356664, 3.5036555202677846, 3.493739523924887]
 ```
 
-vs. timing when `dpctl`'s current queue is being reused:
+vs. timing when `dpctl`'s queue is being reused:
 
 ```
 (idp) [11:29:14 ansatnuc04 sycl_buffer]$ python bench.py
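
The overhead this README is comparing is the cost of constructing a SYCL queue inside every call versus reusing a queue created once by the caller. A quick way to see that construction cost in isolation; this is a sketch, not part of the example's sources:

```python
# Sketch: time SyclQueue construction by itself to see the per-call overhead
# the comparison above is pointing at. Absolute numbers depend on the system,
# driver, and backend; nothing here is taken from the PR.
import timeit

import dpctl

n_calls = 100
total = timeit.timeit("dpctl.SyclQueue()", globals=globals(), number=n_calls)
print("Average SyclQueue construction time: {:.6f} s".format(total / n_calls))
```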
examples/cython/usm_memory/blackscholes.pyx (23 changes: 17 additions & 6 deletions)
@@ -29,7 +29,18 @@ cdef extern from "sycl_blackscholes.hpp":
     cdef void cpp_blackscholes[T](c_dpctl.DPCTLSyclQueueRef, size_t n_opts, T* option_params, T* callput) except +
     cdef void cpp_populate_params[T](c_dpctl.DPCTLSyclQueueRef, size_t n_opts, T* option_params, T pl, T ph, T sl, T sh, T tl, T th, T rl, T rh, T vl, T vh, int seed) except +
 
-def black_scholes_price(floating[:, ::1] option_params):
+cdef c_dpctl.SyclQueue from_queue_keyword(queue):
+    if (queue is None):
+        return c_dpctl.SyclQueue()
+    elif isinstance(queue, dpctl.SyclQueue):
+        return <c_dpctl.SyclQueue> queue
+    else:
+        return c_dpctl.SyclQueue(queue)
+    # use default
+    return c_dpctl.SyclQueue()
+
+
+def black_scholes_price(floating[:, ::1] option_params, queue=None):
     cdef size_t n_opts = option_params.shape[0]
     cdef size_t n_params = option_params.shape[1]
     cdef size_t n_bytes = 0
@@ -49,19 +60,19 @@ def black_scholes_price(floating[:, ::1] option_params):
             "Each row must specify (current_price, strike_price, maturity, interest_rate, volatility)."
         ).format(n_params))
 
-    q = c_dpctl.get_current_queue()
+    q = from_queue_keyword(queue)
     q_ptr = q.get_queue_ref()
     if (floating is double):
         n_bytes = 2*n_opts * sizeof(double)
-        mobj = c_dpctl_mem.MemoryUSMShared(n_bytes)
+        mobj = c_dpctl_mem.MemoryUSMShared(n_bytes, queue=q)
         callput_arr = np.ndarray((n_opts, 2), buffer=mobj, dtype='d')
         call_put_prices = callput_arr
         dp1 = &option_params[0,0]
         dp2 = &call_put_prices[0,0];
         cpp_blackscholes[double](q_ptr, n_opts, dp1, dp2)
     elif (floating is float):
         n_bytes = 2*n_opts * sizeof(float)
-        mobj = c_dpctl_mem.MemoryUSMShared(n_bytes)
+        mobj = c_dpctl_mem.MemoryUSMShared(n_bytes, queue=q)
         callput_arr = np.ndarray((n_opts, 2), buffer=mobj, dtype='f')
         call_put_prices = callput_arr
         fp1 = &option_params[0,0]
@@ -70,7 +81,7 @@
 
     return callput_arr
 
-def populate_params(floating[:, ::1] option_params, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, int seed):
+def populate_params(floating[:, ::1] option_params, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, int seed, queue=None):
     cdef size_t n_opts = option_params.shape[0]
     cdef size_t n_params = option_params.shape[1]
 
@@ -85,7 +96,7 @@ def populate_params(floating[:, ::1] option_params, pl, ph, sl, sh, tl, th, rl,
             "Each row must specify (current_price, strike_price, maturity, interest_rate, volatility)."
         ).format(n_params))
 
-    q = c_dpctl.get_current_queue()
+    q = from_queue_keyword(queue)
     q_ptr = q.get_queue_ref()
     if (floating is double):
         dp = &option_params[0,0]
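
A short usage sketch for the updated entry points. The extension module name, the problem size, and the `"opencl:gpu"` filter string are assumptions; the extension needs to be rebuilt from this `.pyx` first:

```python
# Sketch: drive the updated Cython entry points with an explicit queue.
import numpy as np

import dpctl
import dpctl.memory as dpctl_mem
import blackscholes_usm as bs  # assumed name of the built extension

q = dpctl.SyclQueue("opencl:gpu")
n_opts = 1000

# allocate USM-shared memory for the (n_opts, 5) option parameters on the chosen queue
usm_mem = dpctl_mem.MemoryUSMShared(n_opts * 5 * np.dtype("d").itemsize, queue=q)
opts = np.ndarray((n_opts, 5), buffer=usm_mem, dtype="d")

# populate (price, strike, maturity, rate, volatility) in place, then price on the same queue
bs.populate_params(opts, 20.0, 30.0, 22.0, 29.0, 18.0, 24.0, 0.01, 0.05, 0.01, 0.05, 1234, queue=q)
prices = bs.black_scholes_price(opts, queue=q)  # one (call, put) pair per option
```

As in the updated `run.py`, allocating the USM buffer and launching the kernels on the same queue keeps the data and the computation on one device.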
examples/cython/usm_memory/run.py (54 changes: 32 additions & 22 deletions)
@@ -21,12 +21,16 @@
 from reference_black_scholes import ref_python_black_scholes
 
 
-def gen_option_params(n_opts, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, dtype):
-    usm_mem = dpctl_mem.MemoryUSMShared(n_opts * 5 * np.dtype(dtype).itemsize)
-    # usm_mem2 = dpctl_mem.MemoryUSMDevice(n_opts * 5 * np.dtype(dtype).itemsize)
+def gen_option_params(
+    n_opts, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, dtype, queue=None
+):
+    nbytes = n_opts * 5 * np.dtype(dtype).itemsize
+    usm_mem = dpctl_mem.MemoryUSMShared(nbytes, queue=queue)
     params = np.ndarray(shape=(n_opts, 5), buffer=usm_mem, dtype=dtype)
     seed = 1234
-    bs.populate_params(params, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed)
+    bs.populate_params(
+        params, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed, queue=queue
+    )
     return params
 
 
@@ -47,38 +51,44 @@ def gen_option_params(n_opts, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, dtype):
 # compute prices in CPython
 X_ref = np.array([ref_python_black_scholes(*opt) for opt in opts], dtype="d")
 
-print(np.allclose(Xgpu, X_ref, atol=1e-5))
+print(
+    "Correctness check: allclose(Xgpu, Xref) == ", np.allclose(Xgpu, X_ref, atol=1e-5)
+)
 
 n_opts = 3 * 10 ** 6
 
 # compute on CPU sycl device
 import timeit
 
-for _ in range(3):
+cpu_q = dpctl.SyclQueue("opencl:cpu:0")
+opts1 = gen_option_params(
+    n_opts, 20.0, 30.0, 22.0, 29.0, 18.0, 24.0, 0.01, 0.05, 0.01, 0.05, "d", queue=cpu_q
+)
+
+gpu_q = dpctl.SyclQueue("level_zero:gpu:0")
+opts2 = gen_option_params(
+    n_opts, 20.0, 30.0, 22.0, 29.0, 18.0, 24.0, 0.01, 0.05, 0.01, 0.05, "d", queue=gpu_q
+)
 
-    dpctl.set_global_queue("opencl:cpu:0")
-    print("Using : {}".format(dpctl.get_current_queue().sycl_device.name))
+cpu_times = []
+gpu_times = []
+for _ in range(5):
 
     t0 = timeit.default_timer()
-    opts1 = gen_option_params(
-        n_opts, 20.0, 30.0, 22.0, 29.0, 18.0, 24.0, 0.01, 0.05, 0.01, 0.05, "d"
-    )
-    X1 = bs.black_scholes_price(opts1)
+    X1 = bs.black_scholes_price(opts1, queue=cpu_q)
     t1 = timeit.default_timer()
 
-    print("Elapsed: {}".format(t1 - t0))
+    cpu_times.append(t1 - t0)
 
     # compute on GPU sycl device
-    dpctl.set_global_queue("level_zero:gpu:0")
-    print("Using : {}".format(dpctl.get_current_queue().sycl_device.name))
 
     t0 = timeit.default_timer()
-    opts2 = gen_option_params(
-        n_opts, 20.0, 30.0, 22.0, 29.0, 18.0, 24.0, 0.01, 0.05, 0.01, 0.05, "d"
-    )
-    X2 = bs.black_scholes_price(opts2)
+    X2 = bs.black_scholes_price(opts2, queue=gpu_q)
     t1 = timeit.default_timer()
-    print("Elapsed: {}".format(t1 - t0))
+    gpu_times.append(t1 - t0)
 
-print(np.abs(opts1 - opts2).max())
-print(np.abs(X2 - X1).max())
+print("Using : {}".format(cpu_q.sycl_device.name))
+print("Wall times : {}".format(cpu_times))
+
+print("Using : {}".format(gpu_q.sycl_device.name))
+print("Wall times : {}".format(gpu_times))
examples/python/_runner.py (83 changes: 83 additions & 0 deletions, new file)
@@ -0,0 +1,83 @@
# Data Parallel Control (dpctl)
#
# Copyright 2020-2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import inspect


def has_nondefault_params(sgn):
    for v in sgn.parameters.values():
        if v.default is inspect._empty:
            return True
    return False


def run_examples(example_description, glbls_dict):
    parser = argparse.ArgumentParser(
        description=example_description,
    )
    parser.add_argument(
        "-r",
        "--run",
        type=str,
        help="Functions to execute. Use --run all to run all of them.",
    )
    parser.add_argument(
        "-l", "--list", action="store_true", help="List available function names to run"
    )
    parser.add_argument(
        "-q", "--quiet", action="store_true", help="Do not echo example name."
    )
    args = parser.parse_args()

    if args.list or not args.run:
        fns = []
        for n in glbls_dict:
            if inspect.isfunction(glbls_dict.get(n)):
                fns.append(n)
        if fns:
            print("Available examples:")
            print(", ".join(fns))
        else:
print("No examples are availble.")
        exit(0)
    if args.run == "all":
        fns = []
        for n in glbls_dict:
            if inspect.isfunction(glbls_dict.get(n)):
                fns.append(n)
        args.run = fns
    else:
        args.run = args.run.split()

    if args.run:
        for fn in args.run:
            if fn in glbls_dict:
                clbl = glbls_dict.get(fn)
                sgn = inspect.signature(clbl)
                print("")
                if has_nondefault_params(sgn):
                    if not args.quiet:
print(f"INFO: Skip exectution of {fn} as it requires arguments")
                else:
                    if not args.quiet:
                        print(f"INFO: Executing example {fn}")
                    clbl()
                    if not args.quiet:
                        print("INFO: ===========================")

    else:
        raise ValueError("No function to run was specified")
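
This helper is meant to be imported by the scripts under `examples/python` so each of them exposes the same `--list`/`--run`/`--quiet` CLI. A minimal sketch of how an example module might use it; the module name and the example function are illustrative, not part of this PR:

```python
# file: examples/python/some_example.py (illustrative name, not part of this PR)
# Example functions take no required arguments; _runner turns them into a CLI
# driven by --list / --run / --quiet.
import dpctl


def create_default_queue():
    "Creates a queue on the default-selected device and prints its name."
    q = dpctl.SyclQueue()
    print("Default device:", q.sycl_device.name)


if __name__ == "__main__":
    import _runner as runner

    runner.run_examples("Queue creation examples for dpctl.", globals())
```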