Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cuda_bindings/benchmarks/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,6 @@ __pycache__/

# Override root .gitignore *.cpp rule (which targets Cython-generated files)
!benchmarks/cpp/*.cpp

results-python.json
results-cpp.json
37 changes: 27 additions & 10 deletions cuda_bindings/benchmarks/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,17 @@
# cuda.bindings Benchmarks
# cuda.bindings benchmarks

These benchmarks are intended to measure the latency overhead of calling CUDA
Driver APIs through cuda.bindings, relative to a similar C++ baseline.

The goal is to measure how much overhead the Python layer adds when calling
CUDA APIs, and to identify which operations exceed our target of less than
1us of overhead.

Each Python benchmark has a C++ counterpart, which is used to compare the
operations. We try to keep both implementations small, performing as nearly
the same work as possible, and run them under similar conditions.

These are **not** throughput benchmarks to measure the overall performance
of kernels and applications.

## Usage

Expand Down Expand Up @@ -32,26 +45,30 @@ sudo $(pixi run -e wheel -- which python) -m pyperf system tune
To run the benchmarks combine the environment and task:

```bash

# Run the Python benchmarks in the wheel environment
pixi run -e wheel bench

# Run the Python benchmarks in the source environment
pixi run -e source bench

# Run the C++ benchmarks (environment is irrelavant here)
# Run the C++ benchmarks
pixi run -e wheel bench-cpp
```

## pyperf JSON
Both runners automatically save results to JSON files in the benchmarks
directory: `results-python.json` and `results-cpp.json`.

The benchmarks are run using [pyperf](https://pyperf.readthedocs.io/en/latest/).
The results are written to a JSON file in the format expected by pyperf.
## Output JSON and analysis

The C++ benchmarks also generate a valid JSON file, in the same format.
The benchmarks are run using [pyperf](https://pyperf.readthedocs.io/en/latest/).
Both Python and C++ results are saved in pyperf-compatible JSON format,
which can be analyzed with pyperf commands:

```
pixi run -e wheel bench-cpp -0 cpp.json
```bash
# Show results and statistics
pixi run -e wheel -- python -m pyperf stats results-python.json
pixi run -e wheel -- python -m pyperf stats results-cpp.json

pixi run -e wheel pyperf stats cpp.json
# Compare C++ vs Python results
pixi run -e wheel -- python -m pyperf compare_to results-cpp.json results-python.json
```
62 changes: 62 additions & 0 deletions cuda_bindings/benchmarks/benchmarks/bench_ctx_device.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import time

from runner.runtime import ensure_context

from cuda.bindings import driver as cuda

# Module-level fixtures shared by every benchmark in this file.  The context
# is created (or reused) once at import time so the timed loops below measure
# only the driver API call itself.
CTX = ensure_context()

# Error code deliberately discarded: ensure_context() above presumably
# guarantees a usable driver — TODO(review) confirm against runner.runtime.
# DEVICE is the CUdevice handle for ordinal 0; ATTRIBUTE is a fixed, cheap
# attribute used by bench_device_get_attribute.
_, DEVICE = cuda.cuDeviceGet(0)
ATTRIBUTE = cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR


def bench_ctx_get_current(loops: int) -> float:
    """Time ``loops`` calls to cuCtxGetCurrent; return elapsed seconds."""
    # Bind the driver function to a local so the timed loop pays no repeated
    # module-attribute lookups.
    get_current = cuda.cuCtxGetCurrent

    start = time.perf_counter()
    for _ in range(loops):
        get_current()
    return time.perf_counter() - start


def bench_ctx_set_current(loops: int) -> float:
    """Time ``loops`` calls to cuCtxSetCurrent with the shared context."""
    # Hoist the driver function and the context handle into locals so the
    # loop body is pure call overhead.
    set_current = cuda.cuCtxSetCurrent
    ctx = CTX

    start = time.perf_counter()
    for _ in range(loops):
        set_current(ctx)
    return time.perf_counter() - start


def bench_ctx_get_device(loops: int) -> float:
    """Time ``loops`` calls to cuCtxGetDevice; return elapsed seconds."""
    get_device = cuda.cuCtxGetDevice  # local binding keeps the loop lean

    start = time.perf_counter()
    for _ in range(loops):
        get_device()
    return time.perf_counter() - start


def bench_device_get(loops: int) -> float:
    """Time ``loops`` calls to cuDeviceGet for device ordinal 0."""
    device_get = cuda.cuDeviceGet  # local binding keeps the loop lean

    start = time.perf_counter()
    for _ in range(loops):
        device_get(0)
    return time.perf_counter() - start


def bench_device_get_attribute(loops: int) -> float:
    """Time ``loops`` calls to cuDeviceGetAttribute for one fixed attribute."""
    # All loop-invariant lookups hoisted into locals.
    get_attribute = cuda.cuDeviceGetAttribute
    attribute = ATTRIBUTE
    device = DEVICE

    start = time.perf_counter()
    for _ in range(loops):
        get_attribute(attribute, device)
    return time.perf_counter() - start
62 changes: 62 additions & 0 deletions cuda_bindings/benchmarks/benchmarks/bench_event.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import time

from runner.runtime import ensure_context

from cuda.bindings import driver as cuda

# Create (or reuse) a CUDA context once at import time so the timed loops
# below measure only the driver API call itself.
ensure_context()

# Shared fixtures: one non-blocking stream and one timing-disabled event.
# NOTE(review): error codes are discarded here; consider checking them the
# way bench_launch.py does with assert_drv.
_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
_err, EVENT = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value)

# Record once and drain the stream so EVENT is already completed before the
# query/synchronize benchmarks run.
cuda.cuEventRecord(EVENT, STREAM)
cuda.cuStreamSynchronize(STREAM)

# Flags reused per-iteration by bench_event_create_destroy.
EVENT_FLAGS = cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value


def bench_event_create_destroy(loops: int) -> float:
    """Time one cuEventCreate/cuEventDestroy pair per iteration."""
    # Hoist loop-invariant lookups so the loop measures only the API calls.
    create = cuda.cuEventCreate
    destroy = cuda.cuEventDestroy
    flags = EVENT_FLAGS

    start = time.perf_counter()
    for _ in range(loops):
        _, event = create(flags)
        destroy(event)
    return time.perf_counter() - start


def bench_event_record(loops: int) -> float:
    """Time ``loops`` calls to cuEventRecord on the shared stream."""
    record = cuda.cuEventRecord
    event = EVENT
    stream = STREAM

    start = time.perf_counter()
    for _ in range(loops):
        record(event, stream)
    return time.perf_counter() - start


def bench_event_query(loops: int) -> float:
    """Time ``loops`` calls to cuEventQuery on the pre-completed event."""
    query = cuda.cuEventQuery
    event = EVENT

    start = time.perf_counter()
    for _ in range(loops):
        query(event)
    return time.perf_counter() - start


def bench_event_synchronize(loops: int) -> float:
    """Time ``loops`` calls to cuEventSynchronize on the pre-completed event."""
    synchronize = cuda.cuEventSynchronize
    event = EVENT

    start = time.perf_counter()
    for _ in range(loops):
        synchronize(event)
    return time.perf_counter() - start
133 changes: 133 additions & 0 deletions cuda_bindings/benchmarks/benchmarks/bench_launch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import ctypes
import time

from runner.runtime import alloc_persistent, assert_drv, compile_and_load

from cuda.bindings import driver as cuda

# Compile kernels lazily so benchmark discovery does not need NVRTC.
# Three kernels: no arguments, one pointer argument, and sixteen pointer
# arguments (generated via the REPn preprocessor macros).
KERNEL_SOURCE = """\
extern "C" __global__ void empty_kernel() { return; }
extern "C" __global__ void small_kernel(float *f) { *f = 0.0f; }

#define ITEM_PARAM(x, T) T x
#define REP1(x, T) , ITEM_PARAM(x, T)
#define REP2(x, T) REP1(x##0, T) REP1(x##1, T)
#define REP4(x, T) REP2(x##0, T) REP2(x##1, T)
#define REP8(x, T) REP4(x##0, T) REP4(x##1, T)
#define REP16(x, T) REP8(x##0, T) REP8(x##1, T)

extern "C" __global__
void small_kernel_16_args(
ITEM_PARAM(F, int*)
REP1(A, int*)
REP2(A, int*)
REP4(A, int*)
REP8(A, int*))
{ *F = 0; }
"""

# Launch fixtures, populated on first call to _ensure_launch_state() and
# reused for the whole process lifetime so the timed loops never allocate.
# _VAL_PS keeps the ctypes argument-value objects alive; PACKED_16 holds
# their addresses (see _ensure_launch_state).
MODULE = None
EMPTY_KERNEL = None
SMALL_KERNEL = None
KERNEL_16_ARGS = None
STREAM = None
FLOAT_PTR = None
INT_PTRS = None
_VAL_PS = None
PACKED_16 = None


def _ensure_launch_state() -> None:
    """Lazily compile the kernels and build the launch fixtures (idempotent).

    Populates the module-level globals above on the first call; later calls
    return immediately.  Doing this lazily keeps module import (benchmark
    discovery) free of NVRTC and GPU requirements.
    """
    global MODULE, EMPTY_KERNEL, SMALL_KERNEL, KERNEL_16_ARGS, STREAM
    global FLOAT_PTR, INT_PTRS, _VAL_PS, PACKED_16

    # EMPTY_KERNEL doubles as the "already initialized" sentinel.
    if EMPTY_KERNEL is not None:
        return

    module = compile_and_load(KERNEL_SOURCE)

    err, empty_kernel = cuda.cuModuleGetFunction(module, b"empty_kernel")
    assert_drv(err)
    err, small_kernel = cuda.cuModuleGetFunction(module, b"small_kernel")
    assert_drv(err)
    err, kernel_16_args = cuda.cuModuleGetFunction(module, b"small_kernel_16_args")
    assert_drv(err)

    err, stream = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
    assert_drv(err)

    # Persistent allocations (presumably device memory — see
    # runner.runtime.alloc_persistent) reused across all iterations.
    float_ptr = alloc_persistent(ctypes.sizeof(ctypes.c_float))
    int_ptrs = tuple(alloc_persistent(ctypes.sizeof(ctypes.c_int)) for _ in range(16))

    # Pre-pack the 16-arg kernel's parameter array: an array of host pointers,
    # each pointing at a c_void_p holding one argument value.  The c_void_p
    # objects in val_ps must outlive packed_16 (they are kept alive via the
    # module-level _VAL_PS), otherwise packed_16 would hold dangling addresses.
    val_ps = [ctypes.c_void_p(int(ptr)) for ptr in int_ptrs]
    packed_16 = (ctypes.c_void_p * 16)()
    for index, value_ptr in enumerate(val_ps):
        packed_16[index] = ctypes.addressof(value_ptr)

    # Publish everything at once, last; EMPTY_KERNEL flips the sentinel.
    MODULE = module
    EMPTY_KERNEL = empty_kernel
    SMALL_KERNEL = small_kernel
    KERNEL_16_ARGS = kernel_16_args
    STREAM = stream
    FLOAT_PTR = float_ptr
    INT_PTRS = int_ptrs
    _VAL_PS = val_ps
    PACKED_16 = packed_16


def bench_launch_empty_kernel(loops: int) -> float:
    """Time launches of the no-argument kernel on the shared stream."""
    _ensure_launch_state()
    # Loop-invariant lookups hoisted into locals.
    launch = cuda.cuLaunchKernel
    kernel = EMPTY_KERNEL
    stream = STREAM

    start = time.perf_counter()
    for _ in range(loops):
        launch(kernel, 1, 1, 1, 1, 1, 1, 0, stream, 0, 0)
    return time.perf_counter() - start


def bench_launch_small_kernel(loops: int) -> float:
    """Time launches of the one-argument kernel via the (args, types) form."""
    _ensure_launch_state()
    launch = cuda.cuLaunchKernel
    kernel = SMALL_KERNEL
    stream = STREAM
    # (values, types) tuple form of kernelParams; only the tuple construction
    # per launch is intentionally left inside the timed loop's call.
    arg_values = (FLOAT_PTR,)
    arg_types = (None,)

    start = time.perf_counter()
    for _ in range(loops):
        launch(kernel, 1, 1, 1, 1, 1, 1, 0, stream, (arg_values, arg_types), 0)
    return time.perf_counter() - start


def bench_launch_16_args(loops: int) -> float:
    """Time launches of the 16-argument kernel via the (args, types) form."""
    _ensure_launch_state()
    launch = cuda.cuLaunchKernel
    kernel = KERNEL_16_ARGS
    stream = STREAM
    arg_values = INT_PTRS
    arg_types = (None,) * 16

    start = time.perf_counter()
    for _ in range(loops):
        launch(kernel, 1, 1, 1, 1, 1, 1, 0, stream, (arg_values, arg_types), 0)
    return time.perf_counter() - start


def bench_launch_16_args_pre_packed(loops: int) -> float:
    """Time launches of the 16-argument kernel with a pre-built ctypes array.

    Counterpart to bench_launch_16_args: the kernelParams array is packed once
    at setup time (PACKED_16), so this measures launch cost without per-call
    argument packing.
    """
    _ensure_launch_state()
    launch = cuda.cuLaunchKernel
    kernel = KERNEL_16_ARGS
    stream = STREAM
    packed_args = PACKED_16

    start = time.perf_counter()
    for _ in range(loops):
        launch(kernel, 1, 1, 1, 1, 1, 1, 0, stream, packed_args, 0)
    return time.perf_counter() - start
45 changes: 45 additions & 0 deletions cuda_bindings/benchmarks/benchmarks/bench_stream.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import time

from runner.runtime import ensure_context

from cuda.bindings import driver as cuda

# Create (or reuse) a CUDA context once at import time so the timed loops
# below measure only the driver API call itself.
ensure_context()

# Shared non-blocking stream for the query/synchronize benchmarks.
# NOTE(review): error code discarded; consider checking it as bench_launch.py
# does with assert_drv.
_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)


def bench_stream_create_destroy(loops: int) -> float:
    """Time one cuStreamCreate/cuStreamDestroy pair per iteration."""
    # Hoist loop-invariant lookups so the loop measures only the API calls.
    create = cuda.cuStreamCreate
    destroy = cuda.cuStreamDestroy
    flags = cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value

    start = time.perf_counter()
    for _ in range(loops):
        _, stream = create(flags)
        destroy(stream)
    return time.perf_counter() - start


def bench_stream_query(loops: int) -> float:
    """Time ``loops`` calls to cuStreamQuery on the shared (idle) stream."""
    query = cuda.cuStreamQuery
    stream = STREAM

    start = time.perf_counter()
    for _ in range(loops):
        query(stream)
    return time.perf_counter() - start


def bench_stream_synchronize(loops: int) -> float:
    """Time ``loops`` calls to cuStreamSynchronize on the shared (idle) stream."""
    synchronize = cuda.cuStreamSynchronize
    stream = STREAM

    start = time.perf_counter()
    for _ in range(loops):
        synchronize(stream)
    return time.perf_counter() - start
Loading