diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp index a21cd8a8aa5..5eb4716b981 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.cpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -353,8 +353,8 @@ StreamHandle get_per_thread_stream() { namespace { struct EventBox { CUevent resource; - bool timing_disabled; - bool busy_waited; + bool timing_enabled; + bool is_blocking_sync; bool ipc_enabled; int device_id; ContextHandle h_context; @@ -368,12 +368,12 @@ static const EventBox* get_box(const EventHandle& h) { ); } -bool get_event_timing_disabled(const EventHandle& h) noexcept { - return h ? get_box(h)->timing_disabled : true; +bool get_event_timing_enabled(const EventHandle& h) noexcept { + return h ? get_box(h)->timing_enabled : false; } -bool get_event_busy_waited(const EventHandle& h) noexcept { - return h ? get_box(h)->busy_waited : false; +bool get_event_is_blocking_sync(const EventHandle& h) noexcept { + return h ? 
get_box(h)->is_blocking_sync : false; } bool get_event_ipc_enabled(const EventHandle& h) noexcept { @@ -392,7 +392,7 @@ ContextHandle get_event_context(const EventHandle& h) noexcept { static HandleRegistry event_registry; EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags, - bool timing_disabled, bool busy_waited, + bool timing_enabled, bool is_blocking_sync, bool ipc_enabled, int device_id) { GILReleaseGuard gil; CUevent event; @@ -401,7 +401,7 @@ EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags, } auto box = std::shared_ptr( - new EventBox{event, timing_disabled, busy_waited, ipc_enabled, device_id, h_ctx}, + new EventBox{event, timing_enabled, is_blocking_sync, ipc_enabled, device_id, h_ctx}, [h_ctx](const EventBox* b) { event_registry.unregister_handle(b->resource); GILReleaseGuard gil; @@ -415,19 +415,19 @@ EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags, } EventHandle create_event_handle_noctx(unsigned int flags) { - return create_event_handle(ContextHandle{}, flags, true, false, false, -1); + return create_event_handle(ContextHandle{}, flags, false, false, false, -1); } EventHandle create_event_handle_ref(CUevent event) { if (auto h = event_registry.lookup(event)) { return h; } - auto box = std::make_shared(EventBox{event, true, false, false, -1, {}}); + auto box = std::make_shared(EventBox{event, false, false, false, -1, {}}); return EventHandle(box, &box->resource); } EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle, - bool busy_waited) { + bool is_blocking_sync) { GILReleaseGuard gil; CUevent event; if (CUDA_SUCCESS != (err = p_cuIpcOpenEventHandle(&event, ipc_handle))) { @@ -435,7 +435,7 @@ EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle, } auto box = std::shared_ptr( - new EventBox{event, true, busy_waited, true, -1, {}}, + new EventBox{event, false, is_blocking_sync, true, -1, {}}, [](const EventBox* b) { 
event_registry.unregister_handle(b->resource); GILReleaseGuard gil; diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index d63fb869973..2e6ebb6271c 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -211,7 +211,7 @@ StreamHandle get_per_thread_stream(); // When the last reference is released, cuEventDestroy is called automatically. // Returns empty handle on error (caller must check). EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags, - bool timing_disabled, bool busy_waited, + bool timing_enabled, bool is_blocking_sync, bool ipc_enabled, int device_id); // Create an owning event handle without context dependency. @@ -225,17 +225,17 @@ EventHandle create_event_handle_noctx(unsigned int flags); // When the last reference is released, cuEventDestroy is called automatically. // Returns empty handle on error (caller must check). EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle, - bool busy_waited); + bool is_blocking_sync); // Create a non-owning event handle (references existing event). // Use for events that are managed by the CUDA graph or another owner. // The event will NOT be destroyed when the handle is released. -// Metadata defaults to unknown (timing_disabled=true, device_id=-1). +// Metadata defaults to unknown (timing_enabled=false, device_id=-1). 
EventHandle create_event_handle_ref(CUevent event); // Event metadata accessors (read from EventBox via pointer arithmetic) -bool get_event_timing_disabled(const EventHandle& h) noexcept; -bool get_event_busy_waited(const EventHandle& h) noexcept; +bool get_event_timing_enabled(const EventHandle& h) noexcept; +bool get_event_is_blocking_sync(const EventHandle& h) noexcept; bool get_event_ipc_enabled(const EventHandle& h) noexcept; int get_event_device_id(const EventHandle& h) noexcept; ContextHandle get_event_context(const EventHandle& h) noexcept; diff --git a/cuda_core/cuda/core/_event.pyx b/cuda_core/cuda/core/_event.pyx index 4a0491d8650..5fde724d21a 100644 --- a/cuda_core/cuda/core/_event.pyx +++ b/cuda_core/cuda/core/_event.pyx @@ -13,8 +13,8 @@ from cuda.core._resource_handles cimport ( EventHandle, create_event_handle, create_event_handle_ipc, - get_event_timing_disabled, - get_event_busy_waited, + get_event_timing_enabled, + get_event_is_blocking_sync, get_event_ipc_enabled, get_event_device_id, get_event_context, @@ -44,22 +44,22 @@ cdef class EventOptions: Attributes ---------- - enable_timing : bool, optional + timing_enabled : bool, optional Event will record timing data. (Default to False) - busy_waited_sync : bool, optional - If True, event will use blocking synchronization. When a CPU - thread calls synchronize, the call will block until the event - has actually been completed. - Otherwise, the CPU thread will busy-wait until the event has - been completed. (Default to False) + blocking_sync : bool, optional + If True, the event uses blocking synchronization: a CPU + thread that calls :meth:`Event.sync` blocks (yields) until + the event has completed. Otherwise (the default), the CPU + thread busy-waits until the event has completed. + (Default to False) ipc_enabled : bool, optional Event will be suitable for interprocess use. - Note that enable_timing must be False. (Default to False) + Note that timing_enabled must be False. 
(Default to False) """ - enable_timing: bool | None = False - busy_waited_sync: bool | None = False + timing_enabled: bool | None = False + blocking_sync: bool | None = False ipc_enabled: bool | None = False @@ -79,8 +79,8 @@ cdef class Event: # To create events and record the timing: s = Device().create_stream() - e1 = Device().create_event({"enable_timing": True}) - e2 = Device().create_event({"enable_timing": True}) + e1 = Device().create_event({"timing_enabled": True}) + e2 = Device().create_event({"timing_enabled": True}) s.record(e1) # ... run some GPU works ... s.record(e2) @@ -100,16 +100,16 @@ cdef class Event: cdef Event self = cls.__new__(cls) cdef EventOptions opts = check_or_create_options(EventOptions, options, "Event options") cdef unsigned int flags = 0x0 - cdef bint timing_disabled = False - cdef bint busy_waited = False + cdef bint timing_enabled = True + cdef bint is_blocking_sync = False cdef bint ipc_enabled = False self._ipc_descriptor = None - if not opts.enable_timing: + if not opts.timing_enabled: flags |= cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING - timing_disabled = True - if opts.busy_waited_sync: + timing_enabled = False + if opts.blocking_sync: flags |= cydriver.CUevent_flags.CU_EVENT_BLOCKING_SYNC - busy_waited = True + is_blocking_sync = True if opts.ipc_enabled: if is_free: raise TypeError( @@ -117,23 +117,24 @@ cdef class Event: ) flags |= cydriver.CUevent_flags.CU_EVENT_INTERPROCESS ipc_enabled = True - if not timing_disabled: + if timing_enabled: raise TypeError("IPC-enabled events cannot use timing.") cdef EventHandle h_event = create_event_handle( - h_context, flags, timing_disabled, busy_waited, ipc_enabled, device_id) + h_context, flags, timing_enabled, is_blocking_sync, ipc_enabled, device_id) if not h_event: raise RuntimeError("Failed to create CUDA event") self._h_event = h_event if ipc_enabled: - self.get_ipc_descriptor() + _ = self.ipc_descriptor # eagerly populate the descriptor cache return self @staticmethod cdef 
Event _from_handle(EventHandle h_event): """Create an Event wrapping an existing EventHandle. - Metadata (timing, busy_waited, ipc, device_id) is read from the - EventBox via pointer arithmetic — no fields are cached on Event. + Metadata (timing, blocking_sync, ipc, device_id) is read from + the EventBox via pointer arithmetic — no fields are cached on + Event. """ cdef Event self = Event.__new__(Event) self._h_event = h_event @@ -163,10 +164,10 @@ cdef class Event: return timing else: if err == cydriver.CUresult.CUDA_ERROR_INVALID_HANDLE: - if self.is_timing_disabled or other.is_timing_disabled: + if not self.is_timing_enabled or not other.is_timing_enabled: explanation = ( "Both Events must be created with timing enabled in order to subtract them; " - "use EventOptions(enable_timing=True) when creating both events." + "use EventOptions(timing_enabled=True) when creating both events." ) else: explanation = ( @@ -196,8 +197,9 @@ cdef class Event: def __repr__(self) -> str: return f"" - def get_ipc_descriptor(self) -> IPCEventDescriptor: - """Export an event allocated for sharing between processes.""" + @property + def ipc_descriptor(self) -> IPCEventDescriptor: + """Descriptor for sharing this event with other processes.""" if self._ipc_descriptor is not None: return self._ipc_descriptor if not self.is_ipc_enabled: @@ -206,7 +208,7 @@ cdef class Event: with nogil: HANDLE_RETURN(cydriver.cuIpcGetEventHandle(&data, as_cu(self._h_event))) cdef bytes data_b = cpython.PyBytes_FromStringAndSize((data.reserved), sizeof(data.reserved)) - self._ipc_descriptor = IPCEventDescriptor._init(data_b, get_event_busy_waited(self._h_event)) + self._ipc_descriptor = IPCEventDescriptor._init(data_b, get_event_is_blocking_sync(self._h_event)) return self._ipc_descriptor @classmethod @@ -215,7 +217,7 @@ cdef class Event: cdef cydriver.CUipcEventHandle data memcpy(data.reserved, (ipc_descriptor._reserved), sizeof(data.reserved)) cdef Event self = Event.__new__(cls) - cdef EventHandle 
h_event = create_event_handle_ipc(data, ipc_descriptor._busy_waited) + cdef EventHandle h_event = create_event_handle_ipc(data, ipc_descriptor._is_blocking_sync) if not h_event: raise RuntimeError("Failed to open IPC event handle") self._h_event = h_event @@ -228,23 +230,24 @@ cdef class Event: return get_event_ipc_enabled(self._h_event) @property - def is_timing_disabled(self) -> bool: - """Return True if the event does not record timing data, otherwise False.""" - return get_event_timing_disabled(self._h_event) + def is_timing_enabled(self) -> bool: + """Return True if the event records timing data, otherwise False.""" + return get_event_timing_enabled(self._h_event) @property - def is_sync_busy_waited(self) -> bool: - """Return True if the event synchronization would keep the CPU busy-waiting, otherwise False.""" - return get_event_busy_waited(self._h_event) + def is_blocking_sync(self) -> bool: + """Return True if the event uses blocking synchronization (the CPU + thread blocks on :meth:`sync` instead of busy-waiting), otherwise False. + """ + return get_event_is_blocking_sync(self._h_event) def sync(self): """Synchronize until the event completes. - If the event was created with busy_waited_sync, then the - calling CPU thread will block until the event has been - completed by the device. - Otherwise the CPU thread will busy-wait until the event - has been completed. + If the event was created with ``blocking_sync=True``, the + calling CPU thread blocks (yields) until the event has been + completed by the device. Otherwise (the default) the CPU + thread busy-waits until the event has completed. """ with nogil: @@ -302,28 +305,28 @@ cdef class IPCEventDescriptor: cdef: bytes _reserved - bint _busy_waited + bint _is_blocking_sync def __init__(self, *arg, **kwargs): raise RuntimeError("IPCEventDescriptor objects cannot be instantiated directly. 
Please use Event APIs.") @staticmethod - def _init(reserved: bytes, busy_waited: cython.bint): + def _init(reserved: bytes, is_blocking_sync: cython.bint): cdef IPCEventDescriptor self = IPCEventDescriptor.__new__(IPCEventDescriptor) self._reserved = reserved - self._busy_waited = busy_waited + self._is_blocking_sync = is_blocking_sync return self def __eq__(self, IPCEventDescriptor rhs): - # No need to check self._busy_waited. + # No need to check self._is_blocking_sync. return self._reserved == rhs._reserved def __reduce__(self): - return IPCEventDescriptor._init, (self._reserved, self._busy_waited) + return IPCEventDescriptor._init, (self._reserved, self._is_blocking_sync) def _reduce_event(event): check_multiprocessing_start_method() - return event.from_ipc_descriptor, (event.get_ipc_descriptor(),) + return event.from_ipc_descriptor, (event.ipc_descriptor,) multiprocessing.reduction.register(Event, _reduce_event) diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx index 35eea2de473..42ad612787c 100644 --- a/cuda_core/cuda/core/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -18,6 +18,7 @@ import numpy from cuda.core._memory import Buffer from cuda.core._tensor_map import TensorMapDescriptor as _TensorMapDescriptor_py from cuda.core._tensor_map cimport TensorMapDescriptor +from cuda.core.graph._graph_definition cimport GraphCondition from cuda.core._utils.cuda_utils import driver from cuda.bindings cimport cydriver @@ -318,6 +319,11 @@ cdef class ParamHolder: if arg_type is driver.CUgraphConditionalHandle: prepare_arg[cydriver.CUgraphConditionalHandle](self.data, self.data_addresses, int(arg), i) continue + elif arg_type is GraphCondition: + prepare_arg[cydriver.CUgraphConditionalHandle]( + self.data, self.data_addresses, + (arg)._c_handle, i) + continue # If no exact types are found, fallback to slower `isinstance` check elif isinstance(arg, Buffer): if isinstance(arg.handle, int): 
@@ -341,6 +347,11 @@ cdef class ParamHolder: elif isinstance(arg, driver.CUgraphConditionalHandle): prepare_arg[cydriver.CUgraphConditionalHandle](self.data, self.data_addresses, arg, i) continue + elif isinstance(arg, GraphCondition): + prepare_arg[cydriver.CUgraphConditionalHandle]( + self.data, self.data_addresses, + (arg)._c_handle, i) + continue # TODO: support ctypes/numpy struct raise TypeError("the argument is of unsupported type: " + str(type(arg))) diff --git a/cuda_core/cuda/core/_launch_config.pxd b/cuda_core/cuda/core/_launch_config.pxd index 909c236309a..112007b9cfd 100644 --- a/cuda_core/cuda/core/_launch_config.pxd +++ b/cuda_core/cuda/core/_launch_config.pxd @@ -14,7 +14,7 @@ cdef class LaunchConfig: public tuple cluster public tuple block public int shmem_size - public bint cooperative_launch + public bint is_cooperative vector[cydriver.CUlaunchAttribute] _attrs object __weakref__ diff --git a/cuda_core/cuda/core/_launch_config.pyx b/cuda_core/cuda/core/_launch_config.pyx index 0970ea36c79..b1a9a96cb29 100644 --- a/cuda_core/cuda/core/_launch_config.pyx +++ b/cuda_core/cuda/core/_launch_config.pyx @@ -11,7 +11,7 @@ from cuda.core._utils.cuda_utils import ( driver, ) -_LAUNCH_CONFIG_ATTRS = ('grid', 'cluster', 'block', 'shmem_size', 'cooperative_launch') +_LAUNCH_CONFIG_ATTRS = ('grid', 'cluster', 'block', 'shmem_size', 'is_cooperative') cdef class LaunchConfig: @@ -42,7 +42,7 @@ cdef class LaunchConfig: shmem_size : int, optional Dynamic shared-memory size per thread block in bytes. (Default to size 0) - cooperative_launch : bool, optional + is_cooperative : bool, optional Whether this config can be used to launch a cooperative kernel. """ @@ -50,7 +50,7 @@ cdef class LaunchConfig: # Note: attributes are declared in _launch_config.pxd def __init__(self, grid=None, cluster=None, block=None, - shmem_size=None, cooperative_launch=False): + shmem_size=None, is_cooperative=False): """Initialize LaunchConfig with validation. 
Parameters @@ -63,7 +63,7 @@ cdef class LaunchConfig: Block dimensions (threads per block) shmem_size : int, optional Dynamic shared memory size in bytes (default: 0) - cooperative_launch : bool, optional + is_cooperative : bool, optional Whether to launch as cooperative kernel (default: False) """ # Convert and validate grid and block dimensions @@ -90,11 +90,9 @@ cdef class LaunchConfig: else: self.shmem_size = shmem_size - # Handle cooperative_launch - self.cooperative_launch = cooperative_launch + self.is_cooperative = is_cooperative - # Validate cooperative launch support - if self.cooperative_launch and not Device().properties.cooperative_launch: + if self.is_cooperative and not Device().properties.cooperative_launch: raise CUDAError("cooperative kernels are not supported on this device") def _identity(self): @@ -136,7 +134,7 @@ cdef class LaunchConfig: drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = self.block drv_cfg.sharedMemBytes = self.shmem_size - if self.cooperative_launch: + if self.is_cooperative: attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE attr.value.cooperative = 1 self._attrs.push_back(attr) @@ -190,7 +188,7 @@ cpdef object _to_native_launch_config(LaunchConfig config): drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block drv_cfg.sharedMemBytes = config.shmem_size - if config.cooperative_launch: + if config.is_cooperative: attr = driver.CUlaunchAttribute() attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE attr.value.cooperative = 1 diff --git a/cuda_core/cuda/core/_launcher.pyx b/cuda_core/cuda/core/_launcher.pyx index 130b2278418..87d18f2b881 100644 --- a/cuda_core/cuda/core/_launcher.pyx +++ b/cuda_core/cuda/core/_launcher.pyx @@ -52,7 +52,7 @@ def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kern drv_cfg = conf._to_native_launch_config() drv_cfg.hStream = as_cu(s._h_stream) - if conf.cooperative_launch: + if conf.is_cooperative: 
_check_cooperative_launch(kernel, conf, s) with nogil: HANDLE_RETURN(cydriver.cuLaunchKernelEx(&drv_cfg, func_handle, args_ptr, NULL)) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index bb6fd97df6f..7de3c475d5d 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -128,7 +128,7 @@ cdef class Buffer: def __reduce__(self): # Must not serialize the parent's stream! - return Buffer._reduce_helper, (self.memory_resource, self.get_ipc_descriptor()) + return Buffer._reduce_helper, (self.memory_resource, self.ipc_descriptor) @staticmethod def from_handle( @@ -168,8 +168,9 @@ cdef class Buffer: """Import a buffer that was exported from another process.""" return _ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_descriptor, stream) - def get_ipc_descriptor(self) -> IPCBufferDescriptor: - """Export a buffer allocated for sharing between processes.""" + @property + def ipc_descriptor(self) -> IPCBufferDescriptor: + """Descriptor for sharing this buffer with other processes.""" if self._ipc_data is None: self._ipc_data = IPCDataForBuffer(_ipc.Buffer_get_ipc_descriptor(self), False) return self._ipc_data.ipc_descriptor diff --git a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx index 9f8e4bcd534..0f1a7f52e21 100644 --- a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx @@ -89,19 +89,19 @@ cdef class DeviceMemoryResource(_MemPool): :class:`DeviceMemoryResource` and can be distinguished via :attr:`DeviceMemoryResource.is_mapped`. - An MR is shared via an allocation handle obtained by calling - :meth:`DeviceMemoryResource.get_allocation_handle`. The allocation handle - has a platform-specific interpretation; however, memory IPC is currently - only supported for Linux, and in that case allocation handles are file - descriptors. 
After sending an allocation handle to another process, it can - be used to create an MMR by invoking + An MR is shared via an allocation handle accessed through the + :attr:`DeviceMemoryResource.allocation_handle` property. The allocation + handle has a platform-specific interpretation; however, memory IPC is + currently only supported for Linux, and in that case allocation handles + are file descriptors. After sending an allocation handle to another + process, it can be used to create an MMR by invoking :meth:`DeviceMemoryResource.from_allocation_handle`. - Buffers can be shared as serializable descriptors obtained by calling - :meth:`Buffer.get_ipc_descriptor`. In a receiving process, a shared buffer is - created by invoking :meth:`Buffer.from_ipc_descriptor` with an MMR and - buffer descriptor, where the MMR corresponds to the MR that created the - described buffer. + Buffers can be shared as serializable descriptors accessed through the + :attr:`Buffer.ipc_descriptor` property. In a receiving process, a shared + buffer is created by invoking :meth:`Buffer.from_ipc_descriptor` with an + MMR and buffer descriptor, where the MMR corresponds to the MR that + created the described buffer. To help manage the association between memory resources and buffers, a registry is provided. Every MR has a unique identifier (UUID). MMRs can be @@ -194,15 +194,12 @@ cdef class DeviceMemoryResource(_MemPool): mr._peer_accessible_by = () return mr - def get_allocation_handle(self) -> IPCAllocationHandle: - """Export the memory pool handle to be shared (requires IPC). + @property + def allocation_handle(self) -> IPCAllocationHandle: + """Shareable handle for this memory pool (requires IPC). The handle can be used to share the memory pool with other processes. The handle is cached in this `MemoryResource` and owned by it. - - Returns - ------- - The shareable handle for the memory pool. 
""" if not self.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") @@ -404,7 +401,7 @@ def _deep_reduce_device_memory_resource(mr): check_multiprocessing_start_method() from .._device import Device device = Device(mr.device_id) - alloc_handle = mr.get_allocation_handle() + alloc_handle = mr.allocation_handle return mr.from_allocation_handle, (device, alloc_handle) diff --git a/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx index 64ebcc7bc5d..0b18a1f7e3d 100644 --- a/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx @@ -148,15 +148,12 @@ cdef class PinnedMemoryResource(_MemPool): _ipc.MP_from_allocation_handle(cls, alloc_handle)) return mr - def get_allocation_handle(self) -> IPCAllocationHandle: - """Export the memory pool handle to be shared (requires IPC). + @property + def allocation_handle(self) -> IPCAllocationHandle: + """Shareable handle for this memory pool (requires IPC). The handle can be used to share the memory pool with other processes. The handle is cached in this `MemoryResource` and owned by it. - - Returns - ------- - The shareable handle for the memory pool. """ if not self.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") @@ -242,7 +239,7 @@ cdef inline _PMR_init(PinnedMemoryResource self, options): def _deep_reduce_pinned_memory_resource(mr): check_multiprocessing_start_method() - alloc_handle = mr.get_allocation_handle() + alloc_handle = mr.allocation_handle return mr.from_allocation_handle, (alloc_handle,) diff --git a/cuda_core/cuda/core/_module.pxd b/cuda_core/cuda/core/_module.pxd index 1d3a0772c30..78f871b5ba2 100644 --- a/cuda_core/cuda/core/_module.pxd +++ b/cuda_core/cuda/core/_module.pxd @@ -48,10 +48,16 @@ cdef class KernelOccupancy: cdef class KernelAttributes: cdef: KernelHandle _h_kernel + # _device_id == -1 means "current device" (resolved per access). 
+ # _device_id >= 0 means this view is bound to that specific device. + int _device_id + # Cache is shared across views for the same Kernel: the per-device + # view returned by __getitem__ inherits the parent's dict. dict _cache @staticmethod cdef KernelAttributes _init(KernelHandle h_kernel) + cdef KernelAttributes _view_for_device(self, int device_id) + cdef inline int _effective_device_id(self) except? -1 cdef int _get_cached_attribute(self, int device_id, cydriver.CUfunction_attribute attribute) except? -1 - cdef int _resolve_device_id(self, device_id) except? -1 diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index 2eaff7fb11b..aa865382345 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -39,7 +39,15 @@ __all__ = ["Kernel", "ObjectCode"] cdef class KernelAttributes: - """Provides access to kernel attributes.""" + """Read-only view of a kernel's per-device attributes. + + The default view returned by :attr:`Kernel.attributes` is bound to + the current device, resolved at attribute-access time. Use + ``kernel.attributes[device]`` to obtain a view bound to a specific + device (an :class:`int` device ordinal or :class:`Device`). Per-device + views share the underlying cache so a value queried through one view + is visible through the others. + """ def __init__(self, *args, **kwargs): raise RuntimeError("KernelAttributes cannot be instantiated directly. 
Please use Kernel APIs.") @@ -48,9 +56,22 @@ cdef class KernelAttributes: cdef KernelAttributes _init(KernelHandle h_kernel): cdef KernelAttributes self = KernelAttributes.__new__(KernelAttributes) self._h_kernel = h_kernel + self._device_id = -1 self._cache = {} return self + cdef KernelAttributes _view_for_device(self, int device_id): + cdef KernelAttributes view = KernelAttributes.__new__(KernelAttributes) + view._h_kernel = self._h_kernel + view._device_id = device_id + view._cache = self._cache + return view + + cdef inline int _effective_device_id(self) except? -1: + if self._device_id >= 0: + return self._device_id + return Device().device_id + cdef int _get_cached_attribute(self, int device_id, cydriver.CUfunction_attribute attribute) except? -1: """Helper function to get a cached attribute or fetch and cache it if not present.""" cdef tuple cache_key = (device_id, attribute) @@ -63,121 +84,150 @@ cdef class KernelAttributes: self._cache[cache_key] = result return result - cdef inline int _resolve_device_id(self, device_id) except? -1: - """Convert Device or int to device_id int.""" - return Device(device_id).device_id + def __getitem__(self, device) -> KernelAttributes: + """Return a view of these attributes bound to a specific device. - def max_threads_per_block(self, device_id: Device | int = None) -> int: + Parameters + ---------- + device : Device or int + The device whose attributes to query. Accepts a :class:`Device` + or a device ordinal (:class:`int`). + + Returns + ------- + KernelAttributes + A view bound to ``device`` that shares the underlying cache + with this view. + """ + return self._view_for_device(Device(device).device_id) + + @property + def max_threads_per_block(self) -> int: """int : The maximum number of threads per block. 
This attribute is read-only.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK + self._effective_device_id(), cydriver.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK ) - def shared_size_bytes(self, device_id: Device | int = None) -> int: + @property + def shared_size_bytes(self) -> int: """int : The size in bytes of statically-allocated shared memory required by this function. This attribute is read-only.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES + self._effective_device_id(), cydriver.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES ) - def const_size_bytes(self, device_id: Device | int = None) -> int: + @property + def const_size_bytes(self) -> int: """int : The size in bytes of user-allocated constant memory required by this function. This attribute is read-only.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES + self._effective_device_id(), cydriver.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES ) - def local_size_bytes(self, device_id: Device | int = None) -> int: + @property + def local_size_bytes(self) -> int: """int : The size in bytes of local memory used by each thread of this function. This attribute is read-only.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES + self._effective_device_id(), cydriver.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES ) - def num_regs(self, device_id: Device | int = None) -> int: + @property + def num_regs(self) -> int: """int : The number of registers used by each thread of this function. 
This attribute is read-only.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_NUM_REGS + self._effective_device_id(), cydriver.CU_FUNC_ATTRIBUTE_NUM_REGS ) - def ptx_version(self, device_id: Device | int = None) -> int: + @property + def ptx_version(self) -> int: """int : The PTX virtual architecture version for which the function was compiled. This attribute is read-only.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_PTX_VERSION + self._effective_device_id(), cydriver.CU_FUNC_ATTRIBUTE_PTX_VERSION ) - def binary_version(self, device_id: Device | int = None) -> int: + @property + def binary_version(self) -> int: """int : The binary architecture version for which the function was compiled. This attribute is read-only.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_BINARY_VERSION + self._effective_device_id(), cydriver.CU_FUNC_ATTRIBUTE_BINARY_VERSION ) - def cache_mode_ca(self, device_id: Device | int = None) -> bool: + @property + def cache_mode_ca(self) -> bool: """bool : Whether the function has been compiled with user specified option "-Xptxas --dlcm=ca" set. 
This attribute is read-only.""" return bool( self._get_cached_attribute( - self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_CACHE_MODE_CA + self._effective_device_id(), cydriver.CU_FUNC_ATTRIBUTE_CACHE_MODE_CA ) ) - def max_dynamic_shared_size_bytes(self, device_id: Device | int = None) -> int: + @property + def max_dynamic_shared_size_bytes(self) -> int: """int : The maximum size in bytes of dynamically-allocated shared memory that can be used by this function.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES + self._effective_device_id(), cydriver.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES ) - def preferred_shared_memory_carveout(self, device_id: Device | int = None) -> int: + @property + def preferred_shared_memory_carveout(self) -> int: """int : The shared memory carveout preference, in percent of the total shared memory.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT + self._effective_device_id(), cydriver.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT ) - def cluster_size_must_be_set(self, device_id: Device | int = None) -> bool: + @property + def cluster_size_must_be_set(self) -> bool: """bool : The kernel must launch with a valid cluster size specified. 
This attribute is read-only.""" return bool( self._get_cached_attribute( - self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET + self._effective_device_id(), cydriver.CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET ) ) - def required_cluster_width(self, device_id: Device | int = None) -> int: + @property + def required_cluster_width(self) -> int: """int : The required cluster width in blocks.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH + self._effective_device_id(), cydriver.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH ) - def required_cluster_height(self, device_id: Device | int = None) -> int: + @property + def required_cluster_height(self) -> int: """int : The required cluster height in blocks.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT + self._effective_device_id(), cydriver.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT ) - def required_cluster_depth(self, device_id: Device | int = None) -> int: + @property + def required_cluster_depth(self) -> int: """int : The required cluster depth in blocks.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), cydriver.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH + self._effective_device_id(), cydriver.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH ) - def non_portable_cluster_size_allowed(self, device_id: Device | int = None) -> bool: + @property + def non_portable_cluster_size_allowed(self) -> bool: """bool : Whether the function can be launched with non-portable cluster size.""" return bool( self._get_cached_attribute( - self._resolve_device_id(device_id), + self._effective_device_id(), cydriver.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, ) ) - def cluster_scheduling_policy_preference(self, device_id: Device | int = None) -> int: + @property + def cluster_scheduling_policy_preference(self) -> int: """int : The 
block scheduling policy of a function.""" return self._get_cached_attribute( - self._resolve_device_id(device_id), + self._effective_device_id(), cydriver.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE, ) diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index 9e7307e821b..0d7d20e574c 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -121,16 +121,16 @@ cdef StreamHandle get_per_thread_stream() except+ nogil # Event handles cdef EventHandle create_event_handle( const ContextHandle& h_ctx, unsigned int flags, - bint timing_disabled, bint busy_waited, + bint timing_enabled, bint is_blocking_sync, bint ipc_enabled, int device_id) except+ nogil cdef EventHandle create_event_handle_noctx(unsigned int flags) except+ nogil cdef EventHandle create_event_handle_ref(cydriver.CUevent event) except+ nogil cdef EventHandle create_event_handle_ipc( - const cydriver.CUipcEventHandle& ipc_handle, bint busy_waited) except+ nogil + const cydriver.CUipcEventHandle& ipc_handle, bint is_blocking_sync) except+ nogil # Event metadata getters -cdef bint get_event_timing_disabled(const EventHandle& h) noexcept nogil -cdef bint get_event_busy_waited(const EventHandle& h) noexcept nogil +cdef bint get_event_timing_enabled(const EventHandle& h) noexcept nogil +cdef bint get_event_is_blocking_sync(const EventHandle& h) noexcept nogil cdef bint get_event_ipc_enabled(const EventHandle& h) noexcept nogil cdef int get_event_device_id(const EventHandle& h) noexcept nogil cdef ContextHandle get_event_context(const EventHandle& h) noexcept nogil diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index 2090f5026d0..d30993cc5e8 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -72,19 +72,19 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Event handles (note: 
_create_event_handle* are internal due to C++ overloading) EventHandle create_event_handle "cuda_core::create_event_handle" ( const ContextHandle& h_ctx, unsigned int flags, - bint timing_disabled, bint busy_waited, + bint timing_enabled, bint is_blocking_sync, bint ipc_enabled, int device_id) except+ nogil EventHandle create_event_handle_noctx "cuda_core::create_event_handle_noctx" ( unsigned int flags) except+ nogil EventHandle create_event_handle_ref "cuda_core::create_event_handle_ref" ( cydriver.CUevent event) except+ nogil EventHandle create_event_handle_ipc "cuda_core::create_event_handle_ipc" ( - const cydriver.CUipcEventHandle& ipc_handle, bint busy_waited) except+ nogil + const cydriver.CUipcEventHandle& ipc_handle, bint is_blocking_sync) except+ nogil # Event metadata getters - bint get_event_timing_disabled "cuda_core::get_event_timing_disabled" ( + bint get_event_timing_enabled "cuda_core::get_event_timing_enabled" ( const EventHandle& h) noexcept nogil - bint get_event_busy_waited "cuda_core::get_event_busy_waited" ( + bint get_event_is_blocking_sync "cuda_core::get_event_is_blocking_sync" ( const EventHandle& h) noexcept nogil bint get_event_ipc_enabled "cuda_core::get_event_ipc_enabled" ( const EventHandle& h) noexcept nogil diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyx b/cuda_core/cuda/core/graph/_graph_builder.pyx index 5b304a24d38..526c95e04ad 100644 --- a/cuda_core/cuda/core/graph/_graph_builder.pyx +++ b/cuda_core/cuda/core/graph/_graph_builder.pyx @@ -9,6 +9,7 @@ from libc.stdint cimport intptr_t from cuda.bindings cimport cydriver +from cuda.core.graph._graph_definition cimport GraphCondition from cuda.core.graph._utils cimport _attach_host_callback_to_graph from cuda.core._resource_handles cimport as_cu from cuda.core._stream cimport Stream @@ -441,19 +442,24 @@ class GraphBuilder: def _get_conditional_context(self) -> driver.CUcontext: return self._mnff.stream.context.handle - def create_conditional_handle(self, 
default_value=None) -> driver.CUgraphConditionalHandle: - """Creates a conditional handle for the graph builder. + def create_condition(self, default_value=None) -> GraphCondition: + """Create a condition variable for use with conditional nodes. + + The returned :class:`GraphCondition` object is passed to conditional-node + builder methods (:meth:`if_then`, :meth:`if_else`, :meth:`while_loop`, + :meth:`switch`). Its value is controlled at runtime by device code via + ``cudaGraphSetConditional``. Parameters ---------- default_value : int, optional - The default value to assign to the conditional handle. + The default value to assign to the condition. If None, no + default is assigned. Returns ------- - handle : driver.CUgraphConditionalHandle - The newly created conditional handle. - + GraphCondition + A condition variable for controlling conditional execution. """ if cy_driver_version() < (12, 3, 0): raise RuntimeError(f"Driver version {'.'.join(map(str, cy_driver_version()))} does not support conditional handles") @@ -467,11 +473,12 @@ class GraphBuilder: status, _, graph, *_, _ = handle_return(driver.cuStreamGetCaptureInfo(self._mnff.stream.handle)) if status != driver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_ACTIVE: - raise RuntimeError("Cannot create a conditional handle when graph is not being built") + raise RuntimeError("Cannot create a condition when graph is not being built") - return handle_return( + raw_handle = handle_return( driver.cuGraphConditionalHandleCreate(graph, self._get_conditional_context(), default_value, flags) ) + return GraphCondition._from_handle(int(raw_handle)) def _cond_with_params(self, node_params) -> tuple: # Get current capture info to ensure we're in a valid state @@ -509,18 +516,19 @@ class GraphBuilder: ] ) - def if_cond(self, handle: driver.CUgraphConditionalHandle) -> GraphBuilder: + def if_then(self, condition: GraphCondition) -> GraphBuilder: """Adds an if condition branch and returns a new graph builder for it. 
- The resulting if graph will only execute the branch if the conditional - handle evaluates to true at runtime. + The resulting if graph will only execute the branch if the + condition evaluates to true at runtime. The new builder inherits work dependencies from the original builder. Parameters ---------- - handle : driver.CUgraphConditionalHandle - The handle to use for the if conditional. + condition : :class:`~graph.GraphCondition` + The condition variable from :meth:`create_condition` controlling + whether the branch executes. Returns ------- @@ -532,26 +540,31 @@ class GraphBuilder: raise RuntimeError(f"Driver version {'.'.join(map(str, cy_driver_version()))} does not support conditional if") if cy_binding_version() < (12, 3, 0): raise RuntimeError(f"Binding version {'.'.join(map(str, cy_binding_version()))} does not support conditional if") + if not isinstance(condition, GraphCondition): + raise TypeError( + f"condition must be a GraphCondition object (from " + f"GraphBuilder.create_condition()), got {type(condition).__name__}") node_params = driver.CUgraphNodeParams() node_params.type = driver.CUgraphNodeType.CU_GRAPH_NODE_TYPE_CONDITIONAL - node_params.conditional.handle = handle + node_params.conditional.handle = condition.handle node_params.conditional.type = driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_IF node_params.conditional.size = 1 node_params.conditional.ctx = self._get_conditional_context() return self._cond_with_params(node_params)[0] - def if_else(self, handle: driver.CUgraphConditionalHandle) -> tuple[GraphBuilder, GraphBuilder]: + def if_else(self, condition: GraphCondition) -> tuple[GraphBuilder, GraphBuilder]: """Adds an if-else condition branch and returns new graph builders for both branches. - The resulting if graph will execute the branch if the conditional handle + The resulting if graph will execute the branch if the condition evaluates to true at runtime, otherwise the else branch will execute. 
The new builders inherit work dependencies from the original builder. Parameters ---------- - handle : driver.CUgraphConditionalHandle - The handle to use for the if-else conditional. + condition : :class:`~graph.GraphCondition` + The condition variable from :meth:`create_condition` controlling + which branch executes. Returns ------- @@ -563,27 +576,32 @@ class GraphBuilder: raise RuntimeError(f"Driver version {'.'.join(map(str, cy_driver_version()))} does not support conditional if-else") if cy_binding_version() < (12, 8, 0): raise RuntimeError(f"Binding version {'.'.join(map(str, cy_binding_version()))} does not support conditional if-else") + if not isinstance(condition, GraphCondition): + raise TypeError( + f"condition must be a GraphCondition object (from " + f"GraphBuilder.create_condition()), got {type(condition).__name__}") node_params = driver.CUgraphNodeParams() node_params.type = driver.CUgraphNodeType.CU_GRAPH_NODE_TYPE_CONDITIONAL - node_params.conditional.handle = handle + node_params.conditional.handle = condition.handle node_params.conditional.type = driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_IF node_params.conditional.size = 2 node_params.conditional.ctx = self._get_conditional_context() return self._cond_with_params(node_params) - def switch(self, handle: driver.CUgraphConditionalHandle, count: int) -> tuple[GraphBuilder, ...]: + def switch(self, condition: GraphCondition, count: int) -> tuple[GraphBuilder, ...]: """Adds a switch condition branch and returns new graph builders for all cases. - The resulting switch graph will execute the branch that matches the - case index of the conditional handle at runtime. If no match is found, no branch - will be executed. + The resulting switch graph will execute the branch whose case index + matches the value of the condition at runtime. If no match is found, no + branch will be executed. The new builders inherit work dependencies from the original builder. 
Parameters ---------- - handle : driver.CUgraphConditionalHandle - The handle to use for the switch conditional. + condition : :class:`~graph.GraphCondition` + The condition variable from :meth:`create_condition` selecting + which case executes. count : int The number of cases to add to the switch conditional. @@ -597,26 +615,31 @@ class GraphBuilder: raise RuntimeError(f"Driver version {'.'.join(map(str, cy_driver_version()))} does not support conditional switch") if cy_binding_version() < (12, 8, 0): raise RuntimeError(f"Binding version {'.'.join(map(str, cy_binding_version()))} does not support conditional switch") + if not isinstance(condition, GraphCondition): + raise TypeError( + f"condition must be a GraphCondition object (from " + f"GraphBuilder.create_condition()), got {type(condition).__name__}") node_params = driver.CUgraphNodeParams() node_params.type = driver.CUgraphNodeType.CU_GRAPH_NODE_TYPE_CONDITIONAL - node_params.conditional.handle = handle + node_params.conditional.handle = condition.handle node_params.conditional.type = driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_SWITCH node_params.conditional.size = count node_params.conditional.ctx = self._get_conditional_context() return self._cond_with_params(node_params) - def while_loop(self, handle: driver.CUgraphConditionalHandle) -> GraphBuilder: + def while_loop(self, condition: GraphCondition) -> GraphBuilder: """Adds a while loop and returns a new graph builder for it. The resulting while loop graph will execute the branch repeatedly at runtime - until the conditional handle evaluates to false. + until the condition evaluates to false. The new builder inherits work dependencies from the original builder. Parameters ---------- - handle : driver.CUgraphConditionalHandle - The handle to use for the while loop. + condition : :class:`~graph.GraphCondition` + The condition variable from :meth:`create_condition` controlling + loop continuation. 
Returns ------- @@ -628,9 +651,13 @@ class GraphBuilder: raise RuntimeError(f"Driver version {'.'.join(map(str, cy_driver_version()))} does not support conditional while loop") if cy_binding_version() < (12, 3, 0): raise RuntimeError(f"Binding version {'.'.join(map(str, cy_binding_version()))} does not support conditional while loop") + if not isinstance(condition, GraphCondition): + raise TypeError( + f"condition must be a GraphCondition object (from " + f"GraphBuilder.create_condition()), got {type(condition).__name__}") node_params = driver.CUgraphNodeParams() node_params.type = driver.CUgraphNodeType.CU_GRAPH_NODE_TYPE_CONDITIONAL - node_params.conditional.handle = handle + node_params.conditional.handle = condition.handle node_params.conditional.type = driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_WHILE node_params.conditional.size = 1 node_params.conditional.ctx = self._get_conditional_context() @@ -645,17 +672,15 @@ class GraphBuilder: """ self._mnff.close() - def add_child(self, child_graph: GraphBuilder): - """Adds the child :obj:`~graph.GraphBuilder` builder into self. - - The child graph builder will be added as a child node to the parent graph builder. + def embed(self, child: GraphBuilder): + """Embed a previously-built :obj:`~graph.GraphBuilder` as a child node. Parameters ---------- - child_graph : :obj:`~graph.GraphBuilder` + child : :obj:`~graph.GraphBuilder` The child graph builder. Must have finished building. 
""" - if not child_graph._building_ended: + if not child._building_ended: raise ValueError("Child graph has not finished building.") if not self.is_building: @@ -673,7 +698,7 @@ class GraphBuilder: [ handle_return( driver.cuGraphAddChildGraphNode( - graph_out, *deps_info_trimmed, num_dependencies_out, child_graph._mnff.graph + graph_out, *deps_info_trimmed, num_dependencies_out, child._mnff.graph ) ) ] diff --git a/cuda_core/cuda/core/graph/_graph_definition.pxd b/cuda_core/cuda/core/graph/_graph_definition.pxd index b414568e986..6c15643c2fe 100644 --- a/cuda_core/cuda/core/graph/_graph_definition.pxd +++ b/cuda_core/cuda/core/graph/_graph_definition.pxd @@ -11,6 +11,9 @@ cdef class GraphCondition: cydriver.CUgraphConditionalHandle _c_handle object __weakref__ + @staticmethod + cdef GraphCondition _from_handle(cydriver.CUgraphConditionalHandle c_handle) + cdef class GraphDefinition: cdef: diff --git a/cuda_core/cuda/core/graph/_graph_definition.pyx b/cuda_core/cuda/core/graph/_graph_definition.pyx index 56b0af5d9ec..9a08232c556 100644 --- a/cuda_core/cuda/core/graph/_graph_definition.pyx +++ b/cuda_core/cuda/core/graph/_graph_definition.pyx @@ -33,12 +33,25 @@ __all__ = ['GraphCondition', 'GraphAllocOptions', 'GraphDefinition'] cdef class GraphCondition: """A condition variable for conditional graph nodes. - Created by :meth:`GraphDefinition.create_condition` and passed to - conditional-node builder methods (``if_cond``, ``if_else``, - ``while_loop``, ``switch``). The underlying value is set at + Created by :meth:`GraphDefinition.create_condition` (or + :meth:`GraphBuilder.create_condition`) and passed to + conditional-node builder methods (:meth:`~GraphDefinition.if_then`, + :meth:`~GraphDefinition.if_else`, :meth:`~GraphDefinition.while_loop`, + :meth:`~GraphDefinition.switch`). The underlying value is set at runtime by device code via ``cudaGraphSetConditional``. 
+ + A :class:`GraphCondition` may be passed directly as a kernel + argument to ``launch()``: the launcher unwraps it to the underlying + ``CUgraphConditionalHandle`` value so device code can update the + condition. """ + @staticmethod + cdef GraphCondition _from_handle(cydriver.CUgraphConditionalHandle c_handle): + cdef GraphCondition self = GraphCondition.__new__(GraphCondition) + self._c_handle = c_handle + return self + def __repr__(self) -> str: return f"<GraphCondition 0x{self._c_handle:x}>" @@ -132,19 +145,19 @@ cdef class GraphDefinition: n._h_node = create_graph_node_handle(NULL, self._h_graph) return n - def alloc(self, size_t size, options: GraphAllocOptions | None = None) -> "AllocNode": + def allocate(self, size_t size, options: GraphAllocOptions | None = None) -> "AllocNode": """Add an entry-point memory allocation node (no dependencies). - See :meth:`GraphNode.alloc` for full documentation. + See :meth:`GraphNode.allocate` for full documentation. """ - return self._entry.alloc(size, options) + return self._entry.allocate(size, options) - def free(self, dptr) -> "FreeNode": + def deallocate(self, dptr) -> "FreeNode": """Add an entry-point memory free node (no dependencies). - See :meth:`GraphNode.free` for full documentation. + See :meth:`GraphNode.deallocate` for full documentation. """ - return self._entry.free(dptr) + return self._entry.deallocate(dptr) def memset(self, dst, value, size_t width, size_t height=1, size_t pitch=0) -> "MemsetNode": """Add an entry-point memset node (no dependencies). @@ -199,19 +212,19 @@ cdef class GraphDefinition: """ return self._entry.embed(child) - def record_event(self, event) -> "EventRecordNode": + def record(self, event) -> "EventRecordNode": """Add an entry-point event record node (no dependencies). - See :meth:`GraphNode.record_event` for full documentation. + See :meth:`GraphNode.record` for full documentation.
""" - return self._entry.record_event(event) + return self._entry.record(event) - def wait_event(self, event) -> "EventWaitNode": + def wait(self, event) -> "EventWaitNode": """Add an entry-point event wait node (no dependencies). - See :meth:`GraphNode.wait_event` for full documentation. + See :meth:`GraphNode.wait` for full documentation. """ - return self._entry.wait_event(event) + return self._entry.wait(event) def callback(self, fn, *, user_data=None) -> "HostCallbackNode": """Add an entry-point host callback node (no dependencies). @@ -252,16 +265,14 @@ cdef class GraphDefinition: HANDLE_RETURN(cydriver.cuGraphConditionalHandleCreate( &c_handle, as_cu(self._h_graph), ctx, default_val, flags)) - cdef GraphCondition cond = GraphCondition.__new__(GraphCondition) - cond._c_handle = c_handle - return cond + return GraphCondition._from_handle(c_handle) - def if_cond(self, condition: GraphCondition) -> "IfNode": + def if_then(self, condition: GraphCondition) -> "IfNode": """Add an entry-point if-conditional node (no dependencies). - See :meth:`GraphNode.if_cond` for full documentation. + See :meth:`GraphNode.if_then` for full documentation. """ - return self._entry.if_cond(condition) + return self._entry.if_then(condition) def if_else(self, condition: GraphCondition) -> "IfElseNode": """Add an entry-point if-else conditional node (no dependencies). diff --git a/cuda_core/cuda/core/graph/_graph_node.pyx b/cuda_core/cuda/core/graph/_graph_node.pyx index bd10bfa007f..36401776600 100644 --- a/cuda_core/cuda/core/graph/_graph_node.pyx +++ b/cuda_core/cuda/core/graph/_graph_node.pyx @@ -215,7 +215,7 @@ cdef class GraphNode: """ return GN_join(self, nodes) - def alloc(self, size_t size, options=None) -> AllocNode: + def allocate(self, size_t size, options=None) -> AllocNode: """Add a memory allocation node depending on this node. 
Parameters @@ -233,7 +233,7 @@ cdef class GraphNode: """ return GN_alloc(self, size, options) - def free(self, dptr: int) -> FreeNode: + def deallocate(self, dptr: int) -> FreeNode: """Add a memory free node depending on this node. Parameters @@ -317,7 +317,7 @@ cdef class GraphNode: """ return GN_embed(self, child) - def record_event(self, event: Event) -> EventRecordNode: + def record(self, event: Event) -> EventRecordNode: """Add an event record node depending on this node. Parameters @@ -332,7 +332,7 @@ cdef class GraphNode: """ return GN_record_event(self, event) - def wait_event(self, event: Event) -> EventWaitNode: + def wait(self, event: Event) -> EventWaitNode: """Add an event wait node depending on this node. Parameters @@ -382,7 +382,7 @@ cdef class GraphNode: """ return GN_callback(self, fn, user_data) - def if_cond(self, condition: GraphCondition) -> IfNode: + def if_then(self, condition: GraphCondition) -> IfNode: """Add an if-conditional node depending on this node. The body graph executes only when the condition evaluates to diff --git a/cuda_core/cuda/core/graph/_subclasses.pyx b/cuda_core/cuda/core/graph/_subclasses.pyx index 86cf9eea53e..25b648bacef 100644 --- a/cuda_core/cuda/core/graph/_subclasses.pyx +++ b/cuda_core/cuda/core/graph/_subclasses.pyx @@ -151,7 +151,7 @@ cdef class KernelNode(GraphNode): def config(self) -> LaunchConfig: """A LaunchConfig reconstructed from this node's grid, block, and shmem_size. - Note: cluster dimensions and cooperative_launch are not preserved + Note: cluster dimensions and is_cooperative are not preserved by the CUDA driver's kernel node params, so they are not included. 
""" return LaunchConfig(grid=self._grid, block=self._block, @@ -181,7 +181,7 @@ cdef class AllocNode(GraphNode): cdef AllocNode _create_with_params(GraphNodeHandle h_node, cydriver.CUdeviceptr dptr, size_t bytesize, int device_id, str memory_type, tuple peer_access): - """Create from known params (called by alloc() builder).""" + """Create from known params (called by allocate() builder).""" cdef AllocNode n = AllocNode.__new__(AllocNode) n._h_node = h_node n._dptr = dptr @@ -275,7 +275,7 @@ cdef class FreeNode(GraphNode): @staticmethod cdef FreeNode _create_with_params(GraphNodeHandle h_node, cydriver.CUdeviceptr dptr): - """Create from known params (called by free() builder).""" + """Create from known params (called by deallocate() builder).""" cdef FreeNode n = FreeNode.__new__(FreeNode) n._h_node = h_node n._dptr = dptr @@ -504,7 +504,7 @@ cdef class EventRecordNode(GraphNode): @staticmethod cdef EventRecordNode _create_with_params(GraphNodeHandle h_node, EventHandle h_event): - """Create from known params (called by record_event() builder).""" + """Create from known params (called by record() builder).""" cdef EventRecordNode n = EventRecordNode.__new__(EventRecordNode) n._h_node = h_node n._h_event = h_event @@ -542,7 +542,7 @@ cdef class EventWaitNode(GraphNode): @staticmethod cdef EventWaitNode _create_with_params(GraphNodeHandle h_node, EventHandle h_event): - """Create from known params (called by wait_event() builder).""" + """Create from known params (called by wait() builder).""" cdef EventWaitNode n = EventWaitNode.__new__(EventWaitNode) n._h_node = h_node n._h_event = h_event @@ -621,7 +621,7 @@ cdef class HostCallbackNode(GraphNode): cdef class ConditionalNode(GraphNode): """Base class for conditional nodes. - When created via builder methods (if_cond, if_else, while_loop, switch), + When created via builder methods (if_then, if_else, while_loop, switch), a specific subclass (IfNode, IfElseNode, WhileNode, SwitchNode) is returned. 
When reconstructed from the driver on CUDA 13.2+, the correct subclass is determined via cuGraphNodeGetParams. On older diff --git a/cuda_core/docs/source/release/0.3.0-notes.rst b/cuda_core/docs/source/release/0.3.0-notes.rst index 379559e6c53..d59f912743d 100644 --- a/cuda_core/docs/source/release/0.3.0-notes.rst +++ b/cuda_core/docs/source/release/0.3.0-notes.rst @@ -32,7 +32,7 @@ New features - :class:`~_module.Kernel` adds :attr:`~_module.Kernel.num_arguments` and :attr:`~_module.Kernel.arguments_info` for introspection of kernel arguments. (#612) - Add pythonic access to kernel occupancy calculation functions via :attr:`Kernel.occupancy`. (#648) -- Support launching cooperative kernels by setting :attr:`LaunchConfig.cooperative_launch` to ``True``. +- Support launching cooperative kernels by setting ``LaunchConfig.cooperative_launch`` to ``True``. - A name can be assigned to :class:`ObjectCode` instances generated by both :class:`Program` and :class:`Linker` through their respective options. - Expose :class:`Buffer`, :class:`DeviceMemoryResource`, :class:`LegacyPinnedMemoryResource`, and :class:`MemoryResource` to the top namespace. - Before this release, the internal :class:`Buffer` class had an ``__init__()`` constructor. To align with the design of cuda.core objects, diff --git a/cuda_core/docs/source/release/0.4.0-notes.rst b/cuda_core/docs/source/release/0.4.0-notes.rst index 929ce138f99..cbe3d432400 100644 --- a/cuda_core/docs/source/release/0.4.0-notes.rst +++ b/cuda_core/docs/source/release/0.4.0-notes.rst @@ -49,7 +49,7 @@ Fixes and enhancements - Improved :class:`DeviceMemoryResource` allocation performance when there are no active allocations by setting a higher release threshold (addresses issue #771). - Improved :class:`StridedMemoryView` creation time performance by optimizing shape and strides tuple creation using Python/C API (addresses issue #449). - Fix :class:`LaunchConfig` grid unit conversion when cluster is set (addresses issue #867). 
-- Fixed a bug in :class:`GraphBuilder.add_child` where dependencies extracted from capturing stream were passed inconsistently with num_dependencies parameter (addresses issue #843). +- Fixed a bug in ``GraphBuilder.add_child`` where dependencies extracted from capturing stream were passed inconsistently with num_dependencies parameter (addresses issue #843). - Make :class:`Buffer` creation more performant. - Enabled :class:`MemoryResource` subclasses to accept :class:`Device` objects, in addition to previously supported device ordinals. - Fixed a bug in :class:`Stream` and other classes where object cleanup would error during interpreter shutdown. diff --git a/cuda_core/docs/source/release/1.0.0-notes.rst b/cuda_core/docs/source/release/1.0.0-notes.rst index 7a93eff4696..3f61a30ec1e 100644 --- a/cuda_core/docs/source/release/1.0.0-notes.rst +++ b/cuda_core/docs/source/release/1.0.0-notes.rst @@ -30,6 +30,88 @@ Breaking changes to follow the ``Graph*`` prefix convention used by ``GraphBuilder``, ``GraphDefinition``, ``GraphNode``. (`#1945 `__) +- Converted no-argument deterministic getters to properties for consistency + with the rest of the API + (`#1945 `__): + + - :meth:`Buffer.get_ipc_descriptor` -> :attr:`Buffer.ipc_descriptor` + - :meth:`Event.get_ipc_descriptor` -> :attr:`Event.ipc_descriptor` + - :meth:`DeviceMemoryResource.get_allocation_handle` -> + :attr:`DeviceMemoryResource.allocation_handle` + - :meth:`PinnedMemoryResource.get_allocation_handle` -> + :attr:`PinnedMemoryResource.allocation_handle` + +- Renamed boolean / non-noun properties for clearer naming + (`#1945 `__): + + - ``LaunchConfig.cooperative_launch`` -> :attr:`LaunchConfig.is_cooperative` + (also renames the constructor keyword argument). + - ``Event.is_timing_disabled`` -> :attr:`Event.is_timing_enabled`. + - ``Event.is_sync_busy_waited`` -> :attr:`Event.is_blocking_sync`. 
+ - ``EventOptions.enable_timing`` -> ``EventOptions.timing_enabled`` + and ``EventOptions.busy_waited_sync`` -> ``EventOptions.blocking_sync``. + +- Renamed graph allocation methods to match :meth:`MemoryResource.allocate` / + :meth:`MemoryResource.deallocate` + (`#1945 `__): + + - ``GraphDefinition.alloc`` -> :meth:`graph.GraphDefinition.allocate` + - ``GraphDefinition.free`` -> :meth:`graph.GraphDefinition.deallocate` + - ``GraphNode.alloc`` -> :meth:`graph.GraphNode.allocate` + - ``GraphNode.free`` -> :meth:`graph.GraphNode.deallocate` + +- Cross-API consistency for graph builders + (`#1945 `__): + + - ``GraphBuilder.add_child`` -> :meth:`graph.GraphBuilder.embed` + (matches :meth:`graph.GraphDefinition.embed` and + :meth:`graph.GraphNode.embed`). + - ``GraphDefinition.record_event`` / ``wait_event`` -> + :meth:`graph.GraphDefinition.record` / :meth:`graph.GraphDefinition.wait` + and the same on :class:`~graph.GraphNode`, matching + :meth:`Stream.record` / :meth:`Stream.wait`. + +- :class:`KernelAttributes` methods are now properties; per-device queries + use indexing + (`#1945 `__): + + - The 17 attribute methods (``max_threads_per_block``, ``num_regs``, + ``shared_size_bytes``, ``cluster_scheduling_policy_preference``, etc.) + that previously took a ``device_id`` argument are now properties on + the view returned by :attr:`Kernel.attributes`. The view is bound to + the current device by default; ``kernel.attributes[device]`` returns + a view bound to a specific :class:`Device` or device ordinal. The + cache is shared across views of the same kernel. + - Old: ``kernel.attributes.num_regs()`` and + ``kernel.attributes.num_regs(some_dev)`` + - New: ``kernel.attributes.num_regs`` and + ``kernel.attributes[some_dev].num_regs`` + +- Unified the conditional graph API on :class:`~graph.GraphCondition` + and consistent verbs + (`#1945 `__): + + - ``GraphBuilder.create_conditional_handle`` -> + :meth:`graph.GraphBuilder.create_condition`. 
The new factory returns a + :class:`~graph.GraphCondition` (matching + :meth:`graph.GraphDefinition.create_condition`) instead of a raw + ``CUgraphConditionalHandle``. The four conditional builder methods + (:meth:`~graph.GraphBuilder.if_then`, + :meth:`~graph.GraphBuilder.if_else`, + :meth:`~graph.GraphBuilder.while_loop`, + :meth:`~graph.GraphBuilder.switch`) now accept a + :class:`~graph.GraphCondition` instead of a raw handle. + - ``GraphBuilder.if_cond`` / ``GraphDefinition.if_cond`` / + ``GraphNode.if_cond`` -> :meth:`graph.GraphBuilder.if_then` / + :meth:`graph.GraphDefinition.if_then` / + :meth:`graph.GraphNode.if_then`. The new name parallels the existing + ``if_else``, ``while_loop``, and ``switch`` methods (verb describing the + control-flow construct, not an abbreviation of "condition") and matches + Python's own ``if/then/else`` vocabulary. + - A :class:`~graph.GraphCondition` may be passed directly as a kernel + argument to ``launch()``; the launcher unwraps it to the underlying + ``CUgraphConditionalHandle`` value. Previously, ``.handle`` had to be + extracted explicitly. 
Fixes and enhancements diff --git a/cuda_core/tests/graph/test_graph_builder.py b/cuda_core/tests/graph/test_graph_builder.py index aca5a83ecfc..c0299df5661 100644 --- a/cuda_core/tests/graph/test_graph_builder.py +++ b/cuda_core/tests/graph/test_graph_builder.py @@ -234,7 +234,7 @@ def test_graph_child_graph(init_cuda): ## Add child try: - gb_parent.add_child(gb_child) + gb_parent.embed(gb_child) except NotImplementedError as e: with pytest.raises( NotImplementedError, diff --git a/cuda_core/tests/graph/test_graph_builder_conditional.py b/cuda_core/tests/graph/test_graph_builder_conditional.py index 1446b8b3c4f..de65848c1a0 100644 --- a/cuda_core/tests/graph/test_graph_builder_conditional.py +++ b/cuda_core/tests/graph/test_graph_builder_conditional.py @@ -35,17 +35,17 @@ def test_graph_conditional_if(init_cuda, condition_value): # Add Node A (sets condition) try: - handle = gb.create_conditional_handle() + condition = gb.create_condition() except RuntimeError as e: with pytest.raises(RuntimeError, match="^Driver version"): raise e gb.end_building() b.close() pytest.skip("Driver does not support conditional handle") - launch(gb, LaunchConfig(grid=1, block=1), set_handle, handle, condition_value) + launch(gb, LaunchConfig(grid=1, block=1), set_handle, condition, condition_value) # Add Node B (if condition) - gb_if = gb.if_cond(handle).begin_building() + gb_if = gb.if_then(condition).begin_building() launch(gb_if, LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data) gb_if_0, gb_if_1 = gb_if.split(2) launch(gb_if_0, LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data) @@ -98,12 +98,12 @@ def test_graph_conditional_if_else(init_cuda, condition_value): gb = Device().create_graph_builder().begin_building() # Add Node A (sets condition) - handle = gb.create_conditional_handle() - launch(gb, LaunchConfig(grid=1, block=1), set_handle, handle, condition_value) + condition = gb.create_condition() + launch(gb, LaunchConfig(grid=1, block=1), set_handle, condition, 
condition_value) # Add Node B (if condition) try: - gb_if, gb_else = gb.if_else(handle) + gb_if, gb_else = gb.if_else(condition) except RuntimeError as e: with pytest.raises(RuntimeError, match="^(Driver|Binding) version"): raise e @@ -171,12 +171,12 @@ def test_graph_conditional_switch(init_cuda, condition_value): gb = Device().create_graph_builder().begin_building() # Add Node A (sets condition) - handle = gb.create_conditional_handle() - launch(gb, LaunchConfig(grid=1, block=1), set_handle, handle, condition_value) + condition = gb.create_condition() + launch(gb, LaunchConfig(grid=1, block=1), set_handle, condition, condition_value) # Add Node B (while condition) try: - gb_case = list(gb.switch(handle, 3)) + gb_case = list(gb.switch(condition, 3)) except RuntimeError as e: with pytest.raises(RuntimeError, match="^(Driver|Binding) version"): raise e @@ -261,14 +261,14 @@ def test_graph_conditional_while(init_cuda, condition_value): gb = Device().create_graph_builder().begin_building() # Node A is skipped because we can instead use a non-zero default value - handle = gb.create_conditional_handle(default_value=condition_value) + condition = gb.create_condition(default_value=condition_value) # Add Node B (while condition) - gb_while = gb.while_loop(handle) + gb_while = gb.while_loop(condition) gb_while.begin_building() launch(gb_while, LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data) launch(gb_while, LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data) - launch(gb_while, LaunchConfig(grid=1, block=1), loop_kernel, handle) + launch(gb_while, LaunchConfig(grid=1, block=1), loop_kernel, condition) gb_while.end_building() # Add Node C (...) 
diff --git a/cuda_core/tests/graph/test_graph_definition.py b/cuda_core/tests/graph/test_graph_definition.py index 4e0f28e22dc..e85645e0305 100644 --- a/cuda_core/tests/graph/test_graph_definition.py +++ b/cuda_core/tests/graph/test_graph_definition.py @@ -91,7 +91,7 @@ def _build_empty(): def _build_single(): """One alloc node, no edges.""" g = GraphDefinition() - a = g.alloc(ALLOC_SIZE) + a = g.allocate(ALLOC_SIZE) return GraphSpec( "single", g, @@ -105,9 +105,9 @@ def _build_single(): def _build_chain(): """Linear chain: a -> b -> c.""" g = GraphDefinition() - a = g.alloc(ALLOC_SIZE) - b = a.alloc(ALLOC_SIZE) - c = b.alloc(ALLOC_SIZE) + a = g.allocate(ALLOC_SIZE) + b = a.allocate(ALLOC_SIZE) + c = b.allocate(ALLOC_SIZE) return GraphSpec( "chain", g, @@ -121,10 +121,10 @@ def _build_chain(): def _build_fan_out(): """One node feeds three: a -> {b, c, d}.""" g = GraphDefinition() - a = g.alloc(ALLOC_SIZE) - b = a.alloc(ALLOC_SIZE) - c = a.alloc(ALLOC_SIZE) - d = a.alloc(ALLOC_SIZE) + a = g.allocate(ALLOC_SIZE) + b = a.allocate(ALLOC_SIZE) + c = a.allocate(ALLOC_SIZE) + d = a.allocate(ALLOC_SIZE) return GraphSpec( "fan_out", g, @@ -138,9 +138,9 @@ def _build_fan_out(): def _build_fan_in(): """Three entry nodes merge: {a, b, c} -> d (join).""" g = GraphDefinition() - a = g.alloc(ALLOC_SIZE) - b = g.alloc(ALLOC_SIZE) - c = g.alloc(ALLOC_SIZE) + a = g.allocate(ALLOC_SIZE) + b = g.allocate(ALLOC_SIZE) + c = g.allocate(ALLOC_SIZE) d = g.join(a, b, c) return GraphSpec( "fan_in", @@ -155,9 +155,9 @@ def _build_fan_in(): def _build_diamond(): """Diamond: a -> {b, c} -> d (join).""" g = GraphDefinition() - a = g.alloc(ALLOC_SIZE) - b = a.alloc(ALLOC_SIZE) - c = a.alloc(ALLOC_SIZE) + a = g.allocate(ALLOC_SIZE) + b = a.allocate(ALLOC_SIZE) + c = a.allocate(ALLOC_SIZE) d = b.join(c) return GraphSpec( "diamond", @@ -172,8 +172,8 @@ def _build_diamond(): def _build_disconnected(): """Two independent entry nodes: a, b.""" g = GraphDefinition() - a = g.alloc(ALLOC_SIZE) - b = 
g.alloc(ALLOC_SIZE) + a = g.allocate(ALLOC_SIZE) + b = g.allocate(ALLOC_SIZE) return GraphSpec( "disconnected", g, @@ -238,8 +238,8 @@ def roundtrip_class(self): def _build_empty_node(g): - a = g.alloc(ALLOC_SIZE) - b = g.alloc(ALLOC_SIZE) + a = g.allocate(ALLOC_SIZE) + b = g.allocate(ALLOC_SIZE) return g.join(a, b), {} @@ -247,7 +247,7 @@ def _build_kernel_node(g): mod = compile_common_kernels() kernel = mod.get_kernel("empty_kernel") config = LaunchConfig(grid=(2, 3, 1), block=(32, 4, 1), shmem_size=128) - entry = g.alloc(ALLOC_SIZE) + entry = g.allocate(ALLOC_SIZE) node = entry.launch(config, kernel) return node, { "grid": (2, 3, 1), @@ -260,8 +260,8 @@ def _build_kernel_node(g): def _build_alloc_node(g): device_id = Device().device_id - entry = g.alloc(ALLOC_SIZE) - node = entry.alloc(ALLOC_SIZE) + entry = g.allocate(ALLOC_SIZE) + node = entry.allocate(ALLOC_SIZE) return node, { "dptr": lambda v: v != 0, "bytesize": ALLOC_SIZE, @@ -276,8 +276,8 @@ def _build_alloc_managed_node(g): _skip_if_no_managed_mempool() device_id = Device().device_id options = GraphAllocOptions(memory_type="managed") - entry = g.alloc(ALLOC_SIZE) - node = entry.alloc(ALLOC_SIZE, options) + entry = g.allocate(ALLOC_SIZE) + node = entry.allocate(ALLOC_SIZE, options) return node, { "dptr": lambda v: v != 0, "bytesize": ALLOC_SIZE, @@ -289,15 +289,15 @@ def _build_alloc_managed_node(g): def _build_free_node(g): - alloc = g.alloc(ALLOC_SIZE) - node = alloc.free(alloc.dptr) + alloc = g.allocate(ALLOC_SIZE) + node = alloc.deallocate(alloc.dptr) return node, { "dptr": alloc.dptr, } def _build_memset_node(g): - alloc = g.alloc(ALLOC_SIZE) + alloc = g.allocate(ALLOC_SIZE) node = alloc.memset(alloc.dptr, 42, ALLOC_SIZE) return node, { "dptr": alloc.dptr, @@ -310,7 +310,7 @@ def _build_memset_node(g): def _build_memset_node_u16(g): - alloc = g.alloc(ALLOC_SIZE) + alloc = g.allocate(ALLOC_SIZE) node = alloc.memset(alloc.dptr, b"\xab\xcd", ALLOC_SIZE // 2) return node, { "dptr": alloc.dptr, @@ -323,7 
+323,7 @@ def _build_memset_node_u16(g): def _build_memset_node_u32(g): - alloc = g.alloc(ALLOC_SIZE) + alloc = g.allocate(ALLOC_SIZE) node = alloc.memset(alloc.dptr, b"\x01\x02\x03\x04", ALLOC_SIZE // 4) return node, { "dptr": alloc.dptr, @@ -338,7 +338,7 @@ def _build_memset_node_u32(g): def _build_memset_node_2d(g): rows = 4 cols = ALLOC_SIZE // rows - alloc = g.alloc(ALLOC_SIZE) + alloc = g.allocate(ALLOC_SIZE) node = alloc.memset(alloc.dptr, 0xFF, cols, height=rows, pitch=cols) return node, { "dptr": alloc.dptr, @@ -352,8 +352,8 @@ def _build_memset_node_2d(g): def _build_event_record_node(g): event = Device().create_event() - entry = g.alloc(ALLOC_SIZE) - node = entry.record_event(event) + entry = g.allocate(ALLOC_SIZE) + node = entry.record(event) return node, { "event": event, } @@ -361,16 +361,16 @@ def _build_event_record_node(g): def _build_event_wait_node(g): event = Device().create_event() - entry = g.alloc(ALLOC_SIZE) - node = entry.wait_event(event) + entry = g.allocate(ALLOC_SIZE) + node = entry.wait(event) return node, { "event": event, } def _build_memcpy_node(g): - src_alloc = g.alloc(ALLOC_SIZE) - dst_alloc = g.alloc(ALLOC_SIZE) + src_alloc = g.allocate(ALLOC_SIZE) + dst_alloc = g.allocate(ALLOC_SIZE) dep = g.join(src_alloc, dst_alloc) node = dep.memcpy(dst_alloc.dptr, src_alloc.dptr, ALLOC_SIZE) return node, { @@ -416,9 +416,9 @@ def _build_child_graph_node(g): } -def _build_if_cond_node(g): +def _build_if_then_node(g): condition = try_create_condition(g) - node = g.if_cond(condition) + node = g.if_then(condition) return node, { "condition": condition, "cond_type": "if", @@ -514,14 +514,14 @@ def _build_switch_node(g): ), pytest.param( NodeSpec( - "if_cond", + "if_then", IfNode, "CU_GRAPH_NODE_TYPE_CONDITIONAL", - _build_if_cond_node, + _build_if_then_node, reconstructed_class=IfNode if _HAS_NODE_GET_PARAMS else ConditionalNode, needs_mempool=False, ), - id="if_cond", + id="if_then", ), pytest.param( NodeSpec( @@ -800,24 +800,24 @@ def 
test_alloc_zero_size_fails(sample_graphdef): from cuda.core._utils.cuda_utils import CUDAError with pytest.raises(CUDAError): - sample_graphdef.alloc(0) + sample_graphdef.allocate(0) def test_free_creates_dependency(sample_graphdef): """Free node depends on its predecessor.""" _skip_if_no_mempool() - alloc = sample_graphdef.alloc(ALLOC_SIZE) - free = alloc.free(alloc.dptr) + alloc = sample_graphdef.allocate(ALLOC_SIZE) + free = alloc.deallocate(alloc.dptr) assert alloc in free.pred def test_alloc_free_chain(sample_graphdef): """Alloc and free can be chained.""" _skip_if_no_mempool() - a1 = sample_graphdef.alloc(ALLOC_SIZE) - a2 = a1.alloc(ALLOC_SIZE) - f2 = a2.free(a2.dptr) - f1 = f2.free(a1.dptr) + a1 = sample_graphdef.allocate(ALLOC_SIZE) + a2 = a1.allocate(ALLOC_SIZE) + f2 = a2.deallocate(a2.dptr) + f1 = f2.deallocate(a1.dptr) assert a1 in a2.pred assert a2 in f2.pred assert f2 in f1.pred @@ -832,7 +832,7 @@ def test_alloc_memory_type_invalid(sample_graphdef): """Invalid memory type raises ValueError.""" options = GraphAllocOptions(memory_type="invalid") with pytest.raises(ValueError, match="Invalid memory_type"): - sample_graphdef.alloc(ALLOC_SIZE, options) + sample_graphdef.allocate(ALLOC_SIZE, options) @pytest.mark.parametrize( @@ -847,7 +847,7 @@ def test_alloc_device_option(sample_graphdef, device_spec): _skip_if_no_mempool() device = Device() options = GraphAllocOptions(device=device_spec(device)) - node = sample_graphdef.alloc(ALLOC_SIZE, options) + node = sample_graphdef.allocate(ALLOC_SIZE, options) assert node.dptr != 0 @@ -856,7 +856,7 @@ def test_alloc_peer_access(mempool_device_x2): d0, d1 = mempool_device_x2 g = GraphDefinition() options = GraphAllocOptions(device=d0.device_id, peer_access=[d1.device_id]) - node = g.alloc(ALLOC_SIZE, options) + node = g.allocate(ALLOC_SIZE, options) assert d1.device_id in node.peer_access @@ -869,7 +869,7 @@ def test_alloc_peer_access(mempool_device_x2): def test_join_merges_branches(sample_graphdef, num_branches): 
"""join() with multiple branches creates correct dependencies.""" _skip_if_no_mempool() - branches = [sample_graphdef.alloc(ALLOC_SIZE) for _ in range(num_branches)] + branches = [sample_graphdef.allocate(ALLOC_SIZE) for _ in range(num_branches)] joined = sample_graphdef.join(*branches) assert isinstance(joined, EmptyNode) assert set(joined.pred) == set(branches) @@ -962,8 +962,8 @@ def test_instantiate_empty_graph(sample_graphdef, inst_kwargs): def test_instantiate_with_nodes(sample_graphdef, inst_kwargs): """Graph with nodes can be instantiated.""" _skip_if_no_mempool() - sample_graphdef.alloc(ALLOC_SIZE) - sample_graphdef.alloc(ALLOC_SIZE) + sample_graphdef.allocate(ALLOC_SIZE) + sample_graphdef.allocate(ALLOC_SIZE) graph = _instantiate(sample_graphdef, inst_kwargs) assert graph is not None @@ -1003,8 +1003,8 @@ def test_instantiate_and_execute_kernel(sample_graphdef, inst_kwargs): def test_instantiate_and_execute_alloc_free(sample_graphdef, inst_kwargs): """Graph with alloc/free can be executed.""" _skip_if_no_mempool() - alloc = sample_graphdef.alloc(ALLOC_SIZE) - alloc.free(alloc.dptr) + alloc = sample_graphdef.allocate(ALLOC_SIZE) + alloc.deallocate(alloc.dptr) stream = Device().create_stream() graph = _instantiate_and_upload(sample_graphdef, inst_kwargs, stream) @@ -1016,9 +1016,9 @@ def test_instantiate_and_execute_alloc_free(sample_graphdef, inst_kwargs): def test_instantiate_and_execute_memset(sample_graphdef, inst_kwargs): """Graph with alloc/memset/free can be executed.""" _skip_if_no_mempool() - alloc = sample_graphdef.alloc(ALLOC_SIZE) + alloc = sample_graphdef.allocate(ALLOC_SIZE) ms = alloc.memset(alloc.dptr, 0xAB, ALLOC_SIZE) - ms.free(alloc.dptr) + ms.deallocate(alloc.dptr) stream = Device().create_stream() graph = _instantiate_and_upload(sample_graphdef, inst_kwargs, stream) @@ -1032,12 +1032,12 @@ def test_instantiate_and_execute_memcpy(sample_graphdef, inst_kwargs): _skip_if_no_mempool() import ctypes - src_alloc = 
sample_graphdef.alloc(ALLOC_SIZE) - dst_alloc = sample_graphdef.alloc(ALLOC_SIZE) + src_alloc = sample_graphdef.allocate(ALLOC_SIZE) + dst_alloc = sample_graphdef.allocate(ALLOC_SIZE) dep = sample_graphdef.join(src_alloc, dst_alloc) ms = dep.memset(src_alloc.dptr, 0xAB, ALLOC_SIZE) cp = ms.memcpy(dst_alloc.dptr, src_alloc.dptr, ALLOC_SIZE) - cp.free(src_alloc.dptr) + cp.deallocate(src_alloc.dptr) stream = Device().create_stream() graph = _instantiate_and_upload(sample_graphdef, inst_kwargs, stream) @@ -1139,8 +1139,8 @@ def test_host_callback_user_data_rejected_for_python_callable(sample_graphdef): def test_instantiate_and_execute_event_record_wait(sample_graphdef): """Graph with event record and wait nodes can be executed.""" event = Device().create_event() - rec = sample_graphdef.record_event(event) - rec.wait_event(event) + rec = sample_graphdef.record(event) + rec.wait(event) graph = sample_graphdef.instantiate() stream = Device().create_stream() @@ -1159,7 +1159,7 @@ def _skip_unless_cc_90(): pytest.skip("Conditional node execution requires CC >= 9.0 (Hopper)") -def test_instantiate_and_execute_if_cond(sample_graphdef): +def test_instantiate_and_execute_if_then(sample_graphdef): """If-conditional node: body executes only when condition is non-zero.""" _skip_unless_cc_90() _skip_if_no_mempool() @@ -1172,10 +1172,10 @@ def test_instantiate_and_execute_if_cond(sample_graphdef): set_handle = mod.get_kernel("set_handle") add_one = mod.get_kernel("add_one") - alloc = sample_graphdef.alloc(ctypes.sizeof(ctypes.c_int)) + alloc = sample_graphdef.allocate(ctypes.sizeof(ctypes.c_int)) ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int)) - setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition.handle, 1) - if_node = setter.if_cond(condition) + setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition, 1) + if_node = setter.if_then(condition) if_node.then.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr) graph = 
sample_graphdef.instantiate() @@ -1204,9 +1204,9 @@ def test_instantiate_and_execute_if_else(sample_graphdef): set_handle = mod.get_kernel("set_handle") add_one = mod.get_kernel("add_one") - alloc = sample_graphdef.alloc(ctypes.sizeof(ctypes.c_int)) + alloc = sample_graphdef.allocate(ctypes.sizeof(ctypes.c_int)) ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int)) - setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition.handle, 0) + setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition, 0) ie_node = setter.if_else(condition) ie_node.then.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr) n1 = ie_node.else_.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr) @@ -1238,9 +1238,9 @@ def test_instantiate_and_execute_switch(sample_graphdef): set_handle = mod.get_kernel("set_handle") add_one = mod.get_kernel("add_one") - alloc = sample_graphdef.alloc(ctypes.sizeof(ctypes.c_int)) + alloc = sample_graphdef.allocate(ctypes.sizeof(ctypes.c_int)) ms = alloc.memset(alloc.dptr, 0, ctypes.sizeof(ctypes.c_int)) - setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition.handle, 2) + setter = ms.launch(LaunchConfig(grid=1, block=1), set_handle, condition, 2) sw_node = setter.switch(condition, 4) for branch in sw_node.branches: branch.launch(LaunchConfig(grid=1, block=1), add_one, alloc.dptr) @@ -1261,7 +1261,7 @@ def test_instantiate_and_execute_switch(sample_graphdef): def test_conditional_node_type_preserved_by_nodes(sample_graphdef): """Conditional nodes appear as ConditionalNode base when read back from graph.""" condition = try_create_condition(sample_graphdef) - if_node = sample_graphdef.if_cond(condition) + if_node = sample_graphdef.if_then(condition) assert isinstance(if_node, IfNode) all_nodes = sample_graphdef.nodes() @@ -1278,7 +1278,7 @@ def test_conditional_node_type_preserved_by_nodes(sample_graphdef): def test_debug_dot_print_creates_file(sample_graphdef, dot_file): """debug_dot_print writes a 
DOT file.""" _skip_if_no_mempool() - sample_graphdef.alloc(ALLOC_SIZE) + sample_graphdef.allocate(ALLOC_SIZE) sample_graphdef.debug_dot_print(str(dot_file)) assert dot_file.exists() content = dot_file.read_text() @@ -1288,7 +1288,7 @@ def test_debug_dot_print_creates_file(sample_graphdef, dot_file): def test_debug_dot_print_with_options(sample_graphdef, dot_file): """debug_dot_print accepts GraphDebugPrintOptions.""" _skip_if_no_mempool() - sample_graphdef.alloc(ALLOC_SIZE) + sample_graphdef.allocate(ALLOC_SIZE) options = GraphDebugPrintOptions(verbose=True, handles=True) sample_graphdef.debug_dot_print(str(dot_file), options) assert dot_file.exists() @@ -1297,6 +1297,6 @@ def test_debug_dot_print_with_options(sample_graphdef, dot_file): def test_debug_dot_print_invalid_options(sample_graphdef, dot_file): """debug_dot_print rejects invalid options type.""" _skip_if_no_mempool() - sample_graphdef.alloc(ALLOC_SIZE) + sample_graphdef.allocate(ALLOC_SIZE) with pytest.raises(TypeError, match="options must be a GraphDebugPrintOptions"): sample_graphdef.debug_dot_print(str(dot_file), "invalid") diff --git a/cuda_core/tests/graph/test_graph_definition_errors.py b/cuda_core/tests/graph/test_graph_definition_errors.py index 2f6935d6eb0..40f181e5db1 100644 --- a/cuda_core/tests/graph/test_graph_definition_errors.py +++ b/cuda_core/tests/graph/test_graph_definition_errors.py @@ -33,7 +33,7 @@ def _skip_if_no_mempool(): @pytest.mark.parametrize( "method, args", [ - pytest.param("if_cond", (42,), id="if_cond_int"), + pytest.param("if_then", (42,), id="if_then_int"), pytest.param("if_else", ("not a condition",), id="if_else_str"), pytest.param("while_loop", (None,), id="while_loop_none"), pytest.param("switch", ([1, 2, 3], 4), id="switch_list"), @@ -62,14 +62,14 @@ def test_free_null_pointer(init_cuda): """free(0) raises a CUDA error.""" g = GraphDefinition() with pytest.raises(CUDAError): - g.free(0) + g.deallocate(0) def test_memset_invalid_value_size(init_cuda): """memset with 
3-byte value (not 1, 2, or 4) raises ValueError.""" _skip_if_no_mempool() g = GraphDefinition() - alloc = g.alloc(1024) + alloc = g.allocate(1024) with pytest.raises(ValueError): alloc.memset(alloc.dptr, b"\x01\x02\x03", 100) @@ -93,7 +93,7 @@ def test_condition_from_different_graph(init_cuda): g2 = GraphDefinition() condition = try_create_condition(g1) with pytest.raises(CUDAError): - g2.if_cond(condition) + g2.if_then(condition) # ============================================================================= @@ -113,7 +113,7 @@ def test_join_single_predecessor(init_cuda): """node.join() with no extra args creates a single-dep empty node.""" _skip_if_no_mempool() g = GraphDefinition() - a = g.alloc(1024) + a = g.allocate(1024) joined = a.join() assert isinstance(joined, EmptyNode) assert set(joined.pred) == {a} @@ -136,7 +136,7 @@ def test_unmatched_alloc_succeeds(init_cuda): """Alloc without corresponding free is valid (graph-scoped lifetime).""" _skip_if_no_mempool() g = GraphDefinition() - g.alloc(1024) + g.allocate(1024) graph = g.instantiate() stream = Device().create_stream() graph.launch(stream) @@ -174,7 +174,7 @@ def test_while_loop_zero_iterations(init_cuda): g = GraphDefinition() condition = g.create_condition(default_value=0) - alloc = g.alloc(SIZEOF_INT) + alloc = g.allocate(SIZEOF_INT) ms = alloc.memset(alloc.dptr, 0, SIZEOF_INT) loop = ms.while_loop(condition) loop.body.launch(cfg, add_one, alloc.dptr) @@ -191,7 +191,7 @@ def test_while_loop_zero_iterations(init_cuda): assert result[0] == 0, "Body should not have executed" -def test_if_cond_false_skips_body(init_cuda): +def test_if_then_false_skips_body(init_cuda): """If conditional with default_value=0 does not execute its body.""" _skip_unless_cc_90() _skip_if_no_mempool() @@ -202,9 +202,9 @@ def test_if_cond_false_skips_body(init_cuda): g = GraphDefinition() condition = g.create_condition(default_value=0) - alloc = g.alloc(SIZEOF_INT) + alloc = g.allocate(SIZEOF_INT) ms = alloc.memset(alloc.dptr, 
0, SIZEOF_INT) - if_node = ms.if_cond(condition) + if_node = ms.if_then(condition) if_node.then.launch(cfg, add_one, alloc.dptr) graph = g.instantiate() @@ -230,7 +230,7 @@ def test_switch_oob_skips_all_branches(init_cuda): g = GraphDefinition() condition = g.create_condition(default_value=99) - alloc = g.alloc(SIZEOF_INT) + alloc = g.allocate(SIZEOF_INT) ms = alloc.memset(alloc.dptr, 0, SIZEOF_INT) sw = ms.switch(condition, 3) for branch in sw.branches: diff --git a/cuda_core/tests/graph/test_graph_definition_integration.py b/cuda_core/tests/graph/test_graph_definition_integration.py index 1c5eecf40b6..b33b23d8860 100644 --- a/cuda_core/tests/graph/test_graph_definition_integration.py +++ b/cuda_core/tests/graph/test_graph_definition_integration.py @@ -215,8 +215,8 @@ def _run_heat_graph(dev, k_heat, k_countdown, host_ptr): # Definitions g = GraphDefinition() condition = g.create_condition(default_value=1) - event_start = dev.create_event(EventOptions(enable_timing=True)) - event_end = dev.create_event(EventOptions(enable_timing=True)) + event_start = dev.create_event(EventOptions(timing_enabled=True)) + event_end = dev.create_event(EventOptions(timing_enabled=True)) results = {} def capture_result(): @@ -230,9 +230,9 @@ def capture_result(): # fmt: off # Phase 1 — Allocate device memory - a_curr = g.alloc(_HEAT_N * SIZEOF_FLOAT) - a_next = g.alloc(_HEAT_N * SIZEOF_FLOAT) - a_ctr = g.alloc(SIZEOF_INT) + a_curr = g.allocate(_HEAT_N * SIZEOF_FLOAT) + a_next = g.allocate(_HEAT_N * SIZEOF_FLOAT) + a_ctr = g.allocate(SIZEOF_INT) # Phase 2 — Initialise buffers m_curr = a_curr.memset(a_curr.dptr, 0, _HEAT_N * SIZEOF_FLOAT) @@ -247,23 +247,23 @@ def capture_result(): .graph p = g.join(m_curr, m_next, m_ctr) \ .embed(bc) \ - .record_event(event_start) + .record(event_start) # Phase 4 — Iterate loop = p.while_loop(condition) loop.body.launch(heat_cfg, k_heat, a_next.dptr, a_curr.dptr, np.int32(_HEAT_N), _HEAT_ALPHA) \ .memcpy(a_curr.dptr, a_next.dptr, _HEAT_N * 
SIZEOF_FLOAT) \ - .launch(tick_cfg, k_countdown, condition.handle, a_ctr.dptr) + .launch(tick_cfg, k_countdown, condition, a_ctr.dptr) # Phase 5 — After loop: timing end, readback, verify, free memory - loop.wait_event(event_start) \ - .record_event(event_end) \ + loop.wait(event_start) \ + .record(event_end) \ .memcpy(host_ptr, a_curr.dptr, _HEAT_N * SIZEOF_FLOAT) \ .callback(capture_result) \ - .free(a_curr.dptr) \ - .free(a_next.dptr) \ - .free(a_ctr.dptr) + .deallocate(a_curr.dptr) \ + .deallocate(a_next.dptr) \ + .deallocate(a_ctr.dptr) # fmt: on # Phase 6 — Instantiate, launch, verify @@ -332,9 +332,9 @@ def capture_result(): # fmt: off # Allocate and initialise: a = 0.0, b = 2.0, counter = ITERS - a = g.alloc(SIZEOF_FLOAT) - b = g.alloc(SIZEOF_FLOAT) - ctr = g.alloc(SIZEOF_INT) + a = g.allocate(SIZEOF_FLOAT) + b = g.allocate(SIZEOF_FLOAT) + ctr = g.allocate(SIZEOF_INT) p = g.join(a.memset(a.dptr, np.float32(0.0), 1), b.memset(b.dptr, np.float32(2.0), 1), @@ -345,23 +345,23 @@ def capture_result(): ie_cond = g.create_condition(default_value=0) loop = p.while_loop(while_cond) - ie = loop.body.launch(cfg, k_eval, a.dptr, b.dptr, ie_cond.handle) \ + ie = loop.body.launch(cfg, k_eval, a.dptr, b.dptr, ie_cond) \ .if_else(ie_cond) ie.then.launch(cfg, k_hi, a.dptr, b.dptr) ie.else_.launch(cfg, k_lo, a.dptr, b.dptr) - ie.launch(cfg, k_cd, while_cond.handle, ctr.dptr) + ie.launch(cfg, k_cd, while_cond, ctr.dptr) # Post-loop: Newton refinement (IfNode), readback, free if_cond = g.create_condition(default_value=0) - if_node = loop.launch(cfg, k_check, a.dptr, b.dptr, if_cond.handle) \ - .if_cond(if_cond) + if_node = loop.launch(cfg, k_check, a.dptr, b.dptr, if_cond) \ + .if_then(if_cond) if_node.then.launch(cfg, k_newton, a.dptr, b.dptr) if_node.memcpy(host_ptr, a.dptr, SIZEOF_FLOAT) \ .callback(capture_result) \ - .free(a.dptr) \ - .free(b.dptr) \ - .free(ctr.dptr) + .deallocate(a.dptr) \ + .deallocate(b.dptr) \ + .deallocate(ctr.dptr) # fmt: on # Instantiate, launch, 
verify @@ -430,7 +430,7 @@ def _run_switch_graph(dev, mode, k_negate, k_double, k_square, host_ptr): cfg = LaunchConfig(grid=1, block=1) # fmt: off - x = g.alloc(SIZEOF_INT) + x = g.allocate(SIZEOF_INT) sw_cond = g.create_condition(default_value=mode) sw = x.memset(x.dptr, np.int32(_SWITCH_VALUE), 1) \ .switch(sw_cond, 4) @@ -441,7 +441,7 @@ def _run_switch_graph(dev, mode, k_negate, k_double, k_square, host_ptr): # branch 3: identity (no kernel — value unchanged) sw.memcpy(host_ptr, x.dptr, SIZEOF_INT) \ - .free(x.dptr) + .deallocate(x.dptr) # fmt: on graph = g.instantiate() diff --git a/cuda_core/tests/graph/test_graph_definition_lifetime.py b/cuda_core/tests/graph/test_graph_definition_lifetime.py index cfa538539f6..e231016c8ac 100644 --- a/cuda_core/tests/graph/test_graph_definition_lifetime.py +++ b/cuda_core/tests/graph/test_graph_definition_lifetime.py @@ -29,7 +29,7 @@ def _skip_if_no_mempool(): def _make_if(g, cond): - node = g.if_cond(cond) + node = g.if_then(cond) return [node.then] @@ -172,10 +172,10 @@ def test_event_record_node_keeps_event_alive(init_cuda): _skip_if_no_mempool() dev = Device() g = GraphDefinition() - alloc = g.alloc(1024) + alloc = g.allocate(1024) - event = dev.create_event(EventOptions(enable_timing=False)) - node = alloc.record_event(event) + event = dev.create_event(EventOptions(timing_enabled=False)) + node = alloc.record(event) del event gc.collect() @@ -189,10 +189,10 @@ def test_event_wait_node_keeps_event_alive(init_cuda): _skip_if_no_mempool() dev = Device() g = GraphDefinition() - alloc = g.alloc(1024) + alloc = g.allocate(1024) - event = dev.create_event(EventOptions(enable_timing=False)) - node = alloc.wait_event(event) + event = dev.create_event(EventOptions(timing_enabled=False)) + node = alloc.wait(event) del event gc.collect() @@ -206,12 +206,12 @@ def test_event_record_node_preserves_metadata(init_cuda): dev = Device() g = GraphDefinition() - event = dev.create_event(EventOptions(enable_timing=True, 
busy_waited_sync=True)) - node = g.record_event(event) + event = dev.create_event(EventOptions(timing_enabled=True, blocking_sync=True)) + node = g.record(event) reconstructed = node.event - assert reconstructed.is_timing_disabled is False - assert reconstructed.is_sync_busy_waited is True + assert reconstructed.is_timing_enabled is True + assert reconstructed.is_blocking_sync is True assert reconstructed.is_ipc_enabled is False assert reconstructed.device is not None @@ -221,12 +221,12 @@ def test_event_wait_node_preserves_metadata(init_cuda): dev = Device() g = GraphDefinition() - event = dev.create_event(EventOptions(enable_timing=False)) - node = g.wait_event(event) + event = dev.create_event(EventOptions(timing_enabled=False)) + node = g.wait(event) reconstructed = node.event - assert reconstructed.is_timing_disabled is True - assert reconstructed.is_sync_busy_waited is False + assert reconstructed.is_timing_enabled is False + assert reconstructed.is_blocking_sync is False assert reconstructed.device is not None @@ -235,15 +235,15 @@ def test_event_metadata_survives_gc(init_cuda): dev = Device() g = GraphDefinition() - event = dev.create_event(EventOptions(enable_timing=True, busy_waited_sync=True)) - node = g.record_event(event) + event = dev.create_event(EventOptions(timing_enabled=True, blocking_sync=True)) + node = g.record(event) del event gc.collect() retrieved = node.event - assert retrieved.is_timing_disabled is False - assert retrieved.is_sync_busy_waited is True + assert retrieved.is_timing_enabled is True + assert retrieved.is_blocking_sync is True assert retrieved.is_done is True @@ -252,9 +252,9 @@ def test_event_survives_graph_instantiation_and_execution(init_cuda): dev = Device() g = GraphDefinition() - event = dev.create_event(EventOptions(enable_timing=False)) - rec = g.record_event(event) - rec.wait_event(event) + event = dev.create_event(EventOptions(timing_enabled=False)) + rec = g.record(event) + rec.wait(event) del event gc.collect() @@ 
-277,9 +277,9 @@ def test_event_survives_graph_clone_and_execution(init_cuda): dev = Device() g = GraphDefinition() - event = dev.create_event(EventOptions(enable_timing=False)) - rec = g.record_event(event) - rec.wait_event(event) + event = dev.create_event(EventOptions(timing_enabled=False)) + rec = g.record(event) + rec.wait(event) cloned_cu_graph = handle_return(driver.cuGraphClone(driver.CUgraph(g.handle))) @@ -390,7 +390,7 @@ def test_kernel_node_keeps_kernel_alive(init_cuda): gc.collect() retrieved = node.kernel - assert retrieved.attributes.max_threads_per_block() > 0 + assert retrieved.attributes.max_threads_per_block > 0 def test_kernel_survives_graph_instantiation_and_execution(init_cuda): @@ -455,7 +455,7 @@ def test_kernel_from_handle_recovers_library(init_cuda): del kernel, mod gc.collect() - assert reconstructed.attributes.max_threads_per_block() > 0 + assert reconstructed.attributes.max_threads_per_block > 0 def test_kernel_node_reconstruction_preserves_validity(init_cuda): @@ -469,7 +469,7 @@ def test_kernel_node_reconstruction_preserves_validity(init_cuda): kernel_node = g.launch(config, kernel) # Chain a second node so we can reconstruct the kernel node via pred event = Device().create_event() - successor = kernel_node.record_event(event) + successor = kernel_node.record(event) del kernel, mod gc.collect() @@ -479,7 +479,7 @@ def test_kernel_node_reconstruction_preserves_validity(init_cuda): # -> create_kernel_handle_ref -> handle recovery reconstructed = next(iter(successor.pred)) assert isinstance(reconstructed, KernelNode) - assert reconstructed.kernel.attributes.max_threads_per_block() > 0 + assert reconstructed.kernel.attributes.max_threads_per_block > 0 graph = g.instantiate() stream = Device().create_stream() diff --git a/cuda_core/tests/graph/test_graph_update.py b/cuda_core/tests/graph/test_graph_update.py index 01d0183de5c..206fabcd587 100644 --- a/cuda_core/tests/graph/test_graph_update.py +++ 
b/cuda_core/tests/graph/test_graph_update.py @@ -79,11 +79,11 @@ def build_graph(condition_value): gb = Device().create_graph_builder().begin_building() # Add Node A (sets condition) - handle = gb.create_conditional_handle(default_value=condition_value) + condition = gb.create_condition(default_value=condition_value) # Add Node B (while condition) try: - gb_case = list(gb.switch(handle, 3)) + gb_case = list(gb.switch(condition, 3)) except Exception as e: with pytest.raises(RuntimeError, match="^(Driver|Binding) version"): raise e diff --git a/cuda_core/tests/graph/test_options.py b/cuda_core/tests/graph/test_options.py index 0d10db459d6..2002c1b7006 100644 --- a/cuda_core/tests/graph/test_options.py +++ b/cuda_core/tests/graph/test_options.py @@ -18,11 +18,11 @@ def test_graph_dot_print_options(init_cuda, tmp_path): gb = Device().create_graph_builder().begin_building() # Add Node A (sets condition) - handle = gb.create_conditional_handle() - launch(gb, LaunchConfig(grid=1, block=1), set_handle, handle, False) + condition = gb.create_condition() + launch(gb, LaunchConfig(grid=1, block=1), set_handle, condition, False) # Add Node B (if condition) - gb_if = gb.if_cond(handle).begin_building() + gb_if = gb.if_then(condition).begin_building() launch(gb_if, LaunchConfig(grid=1, block=1), empty_kernel) gb_if_0, gb_if_1 = gb_if.split(2) launch(gb_if_0, LaunchConfig(grid=1, block=1), empty_kernel) diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index a607f897b90..f82164ca37c 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -85,7 +85,7 @@ def PARENT_ACTION(self, queue): mr2 = DeviceMemoryResource(self.device, options=options) self._extra_mrs.append(mr2) buffer = mr2.allocate(NBYTES) - queue.put([self.mr, buffer.get_ipc_descriptor()]) # Note: mr does not own this buffer + queue.put([self.mr, buffer.ipc_descriptor]) # Note: mr does not own this buffer def 
CHILD_ACTION(self, queue): mr, buffer_desc = queue.get(timeout=CHILD_TIMEOUT_SEC) diff --git a/cuda_core/tests/memory_ipc/test_event_ipc.py b/cuda_core/tests/memory_ipc/test_event_ipc.py index 61fa7ca8536..e1bb45efcfb 100644 --- a/cuda_core/tests/memory_ipc/test_event_ipc.py +++ b/cuda_core/tests/memory_ipc/test_event_ipc.py @@ -114,7 +114,7 @@ def test_event_is_monadic(ipc_device): @pytest.mark.flaky(reruns=2) @pytest.mark.parametrize( - "options", [{"ipc_enabled": True, "enable_timing": True}, EventOptions(ipc_enabled=True, enable_timing=True)] + "options", [{"ipc_enabled": True, "timing_enabled": True}, EventOptions(ipc_enabled=True, timing_enabled=True)] ) def test_event_timing_disabled(ipc_device, options): """Check that IPC-enabled events cannot be created with timing enabled.""" @@ -131,10 +131,10 @@ class TestIpcEventProperties: """ @pytest.mark.flaky(reruns=2) - @pytest.mark.parametrize("busy_waited_sync", [True, False]) + @pytest.mark.parametrize("blocking_sync", [True, False]) @pytest.mark.parametrize("use_options_cls", [True, False]) @pytest.mark.parametrize("use_option_kw", [True, False]) - def test_main(self, ipc_device, busy_waited_sync, use_options_cls, use_option_kw): + def test_main(self, ipc_device, blocking_sync, use_options_cls, use_option_kw): device = ipc_device stream = device.create_stream() @@ -145,19 +145,19 @@ def test_main(self, ipc_device, busy_waited_sync, use_options_cls, use_option_kw # Create an event and send it. options = ( - EventOptions(ipc_enabled=True, busy_waited_sync=busy_waited_sync) + EventOptions(ipc_enabled=True, blocking_sync=blocking_sync) if use_options_cls - else {"ipc_enabled": True, "busy_waited_sync": busy_waited_sync} + else {"ipc_enabled": True, "blocking_sync": blocking_sync} ) e = stream.record(options=options) if use_option_kw else stream.record(None, options) q_out.put(e) # Check its properties. 
props = q_in.get(timeout=CHILD_TIMEOUT_SEC) - assert props[0] == e.get_ipc_descriptor() + assert props[0] == e.ipc_descriptor assert props[1] == e.is_ipc_enabled - assert props[2] == e.is_timing_disabled - assert props[3] == e.is_sync_busy_waited + assert props[2] == e.is_timing_enabled + assert props[3] == e.is_blocking_sync assert props[4] is None assert props[5] is None @@ -173,10 +173,10 @@ def child_main(self, q_in, q_out): # Send its properties. props = ( - e.get_ipc_descriptor(), + e.ipc_descriptor, e.is_ipc_enabled, - e.is_timing_disabled, - e.is_sync_busy_waited, + e.is_timing_enabled, + e.is_blocking_sync, e.device, e.context, ) diff --git a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py index 0808a191dd7..cab6b44aa3f 100644 --- a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py +++ b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py @@ -79,8 +79,8 @@ def test_main(self, ipc_device, ipc_memory_resource): # Send the memory resource and buffer descriptor twice. 
log("sending mr and buffer descriptors") queue.put(mr) - queue.put(buffer.get_ipc_descriptor()) - queue.put(buffer.get_ipc_descriptor()) + queue.put(buffer.ipc_descriptor) + queue.put(buffer.ipc_descriptor) log("waiting for child") process.join(timeout=CHILD_TIMEOUT_SEC) diff --git a/cuda_core/tests/memory_ipc/test_leaks.py b/cuda_core/tests/memory_ipc/test_leaks.py index b89f198699a..8debd71c3f9 100644 --- a/cuda_core/tests/memory_ipc/test_leaks.py +++ b/cuda_core/tests/memory_ipc/test_leaks.py @@ -26,10 +26,10 @@ @pytest.mark.flaky(reruns=2) @skip_if_unrunnable def test_alloc_handle(ipc_memory_resource): - """Check for fd leaks in get_allocation_handle.""" + """Check for fd leaks in allocation_handle.""" mr = ipc_memory_resource with CheckFDLeaks(): - [mr.get_allocation_handle() for _ in range(10)] + [mr.allocation_handle for _ in range(10)] def exec_success(obj, number=1): @@ -84,10 +84,10 @@ def __reduce__(self): @pytest.mark.parametrize( "getobject", [ - lambda mr: mr.get_allocation_handle(), + lambda mr: mr.allocation_handle, lambda mr: mr, lambda mr: mr.allocate(NBYTES), - lambda mr: mr.allocate(NBYTES).get_ipc_descriptor(), + lambda mr: mr.allocate(NBYTES).ipc_descriptor, ], ids=["alloc_handle", "mr", "buffer", "buffer_desc"], ) diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index 1b320fa6f2c..0996c71d2cc 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -117,7 +117,7 @@ def test_main(self, ipc_device, ipc_memory_resource): # Set up the IPC-enabled memory pool and share it using one handle. device = ipc_device mr = ipc_memory_resource - alloc_handle = mr.get_allocation_handle() + alloc_handle = mr.allocation_handle # Start children. q1, q2 = (mp.Queue() for _ in range(2)) @@ -129,8 +129,8 @@ def test_main(self, ipc_device, ipc_memory_resource): # Allocate and share memory. 
buffer1 = mr.allocate(NBYTES) buffer2 = mr.allocate(NBYTES) - q1.put(buffer1.get_ipc_descriptor()) - q2.put(buffer2.get_ipc_descriptor()) + q1.put(buffer1.ipc_descriptor) + q2.put(buffer2.ipc_descriptor) # Wait for children. p1.join(timeout=CHILD_TIMEOUT_SEC) @@ -167,7 +167,7 @@ def test_main(self, ipc_device, ipc_memory_resource): """ device = ipc_device mr = ipc_memory_resource - alloc_handle = mr.get_allocation_handle() + alloc_handle = mr.allocation_handle # Start children. q1, q2 = (mp.Queue() for _ in range(2)) diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index bd6a880fdc0..63e6ccf1dfd 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -34,7 +34,7 @@ def test_main(self, ipc_device, ipc_memory_resource): process.start() # Send a memory resource by allocation handle. - alloc_handle = mr.get_allocation_handle() + alloc_handle = mr.allocation_handle mp.reduction.send_handle(parent_conn, alloc_handle.handle, process.pid) # Send a buffer. @@ -42,7 +42,7 @@ def test_main(self, ipc_device, ipc_memory_resource): parent_conn.send(buffer1) # directly buffer2 = mr.allocate(NBYTES) - parent_conn.send(buffer2.get_ipc_descriptor()) # by descriptor + parent_conn.send(buffer2.ipc_descriptor) # by descriptor # Wait for the child process. process.join(timeout=CHILD_TIMEOUT_SEC) @@ -140,9 +140,9 @@ def test_main(self, ipc_device, ipc_memory_resource): # Define the objects. 
device = ipc_device mr = ipc_memory_resource - alloc_handle = mr.get_allocation_handle() + alloc_handle = mr.allocation_handle buffer = mr.allocate(NBYTES) - buffer_desc = buffer.get_ipc_descriptor() + buffer_desc = buffer.ipc_descriptor pgen = PatternGen(device, NBYTES) pgen.fill_buffer(buffer, seed=False) diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index 1fa235a4c97..cfaa776ac9e 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -82,7 +82,7 @@ def test_main(self, ipc_device, nmrs): with mp.Pool(NWORKERS, initializer=self.init_worker, initargs=(mrs,)) as pool: pool.starmap( self.process_buffer, - [(mrs.index(buffer.memory_resource), buffer.get_ipc_descriptor()) for buffer in buffers], + [(mrs.index(buffer.memory_resource), buffer.ipc_descriptor) for buffer in buffers], ) pgen = PatternGen(device, NBYTES) diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index 9c1b0d1e28f..4870c5081e7 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -23,7 +23,7 @@ def test_event_init_disabled(): @pytest.mark.skipif(Device().compute_capability.major < 7, reason="__nanosleep is only available starting Volta (sm70)") def test_timing_success(init_cuda): - options = EventOptions(enable_timing=True) + options = EventOptions(timing_enabled=True) device = Device() stream = device.create_stream() @@ -48,20 +48,20 @@ def test_timing_success(init_cuda): assert elapsed_time_ms > 10 -def test_is_sync_busy_waited(init_cuda): - options = EventOptions(enable_timing=False, busy_waited_sync=True) +def test_is_blocking_sync(init_cuda): + options = EventOptions(timing_enabled=False, blocking_sync=True) stream = Device().create_stream() event = stream.record(options=options) - assert event.is_sync_busy_waited is True + assert event.is_blocking_sync is True - options = EventOptions(enable_timing=False) + options = 
EventOptions(timing_enabled=False) stream = Device().create_stream() event = stream.record(options=options) - assert event.is_sync_busy_waited is False + assert event.is_blocking_sync is False def test_sync(init_cuda): - options = EventOptions(enable_timing=False) + options = EventOptions(timing_enabled=False) stream = Device().create_stream() event = stream.record(options=options) event.sync() @@ -69,7 +69,7 @@ def test_sync(init_cuda): def test_is_done(init_cuda): - options = EventOptions(enable_timing=False) + options = EventOptions(timing_enabled=False) stream = Device().create_stream() event = stream.record(options=options) # Without a sync, the captured work might not have yet completed @@ -80,14 +80,14 @@ def test_is_done(init_cuda): def test_error_timing_disabled(): device = Device() device.set_current() - enabled = EventOptions(enable_timing=True) - disabled = EventOptions(enable_timing=False) + enabled = EventOptions(timing_enabled=True) + disabled = EventOptions(timing_enabled=False) stream = device.create_stream() event1 = stream.record(options=enabled) event2 = stream.record(options=disabled) - assert not event1.is_timing_disabled - assert event2.is_timing_disabled + assert event1.is_timing_enabled + assert not event2.is_timing_enabled stream.sync() with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"): event2 - event1 @@ -102,7 +102,7 @@ def test_error_timing_disabled(): def test_error_timing_recorded(): device = Device() device.set_current() - enabled = EventOptions(enable_timing=True) + enabled = EventOptions(timing_enabled=True) stream = device.create_stream() event1 = stream.record(options=enabled) @@ -123,7 +123,7 @@ def test_error_timing_incomplete(): device = Device() device.set_current() latch = LatchKernel(device) - enabled = EventOptions(enable_timing=True) + enabled = EventOptions(timing_enabled=True) stream = device.create_stream() event1 = stream.record(options=enabled) @@ -213,13 +213,13 @@ def 
test_event_rsub_not_implemented(init_cuda): assert result is NotImplemented -def test_event_get_ipc_descriptor_non_ipc(init_cuda): - """get_ipc_descriptor raises RuntimeError on a non-IPC event.""" +def test_event_ipc_descriptor_non_ipc(init_cuda): + """ipc_descriptor raises RuntimeError on a non-IPC event.""" device = Device() stream = device.create_stream() event = stream.record() with pytest.raises(RuntimeError, match="not IPC-enabled"): - event.get_ipc_descriptor() + _ = event.ipc_descriptor def test_event_is_done_false(init_cuda): diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index 7bd480edf23..b3461f5a371 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -262,7 +262,7 @@ def test_cooperative_launch(): prog = Program(code, code_type="c++", options=pro_opts) ker = prog.compile("cubin").get_kernel("test_grid_sync") - # # Launch without setting cooperative_launch + # # Launch without setting is_cooperative # # Commented out as this seems to be a sticky error... 
# config = LaunchConfig(grid=1, block=1) # launch(s, config, ker) @@ -273,12 +273,12 @@ def test_cooperative_launch(): # Crazy grid sizes would not work block = 128 - config = LaunchConfig(grid=dev.properties.max_grid_dim_x // block + 1, block=block, cooperative_launch=True) + config = LaunchConfig(grid=dev.properties.max_grid_dim_x // block + 1, block=block, is_cooperative=True) with pytest.raises(ValueError): launch(s, config, ker) # This works just fine - config = LaunchConfig(grid=1, block=1, cooperative_launch=True) + config = LaunchConfig(grid=1, block=1, is_cooperative=True) launch(s, config, ker) s.sync() diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 57de22bb9a0..85dd4a7ea2b 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1223,10 +1223,10 @@ def test_mempool_ipc_errors(mempool_device): ipc_error_msg = "Memory resource is not IPC-enabled" with pytest.raises(RuntimeError, match=ipc_error_msg): - mr.get_allocation_handle() + _ = mr.allocation_handle with pytest.raises(RuntimeError, match=ipc_error_msg): - buffer.get_ipc_descriptor() + _ = buffer.ipc_descriptor with pytest.raises(RuntimeError, match=ipc_error_msg): handle = IPCBufferDescriptor._init(b"", 0) @@ -1258,7 +1258,7 @@ def test_pinned_mempool_ipc_basic(): assert mr.numa_id >= 0 # IPC requires a concrete NUMA node # Test allocation handle export - alloc_handle = mr.get_allocation_handle() + alloc_handle = mr.allocation_handle assert alloc_handle is not None # Test buffer allocation @@ -1268,7 +1268,7 @@ def test_pinned_mempool_ipc_basic(): assert buffer.is_host_accessible # Test IPC descriptor - ipc_desc = buffer.get_ipc_descriptor() + ipc_desc = buffer.ipc_descriptor assert ipc_desc is not None assert ipc_desc.size == 1024 @@ -1294,10 +1294,10 @@ def test_pinned_mempool_ipc_errors(): ipc_error_msg = "Memory resource is not IPC-enabled" with pytest.raises(RuntimeError, match=ipc_error_msg): - mr.get_allocation_handle() + _ = 
mr.allocation_handle with pytest.raises(RuntimeError, match=ipc_error_msg): - buffer.get_ipc_descriptor() + _ = buffer.ipc_descriptor with pytest.raises(RuntimeError, match=ipc_error_msg): handle = IPCBufferDescriptor._init(b"", 0) diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index d3ffa0ca2b6..58f09564971 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -57,6 +57,31 @@ def test_kernel_attributes_init_disabled(): cuda.core._module.KernelAttributes() # Ensure back door is locked. +def test_kernel_attributes_per_device_view(get_saxpy_kernel_cubin): + """kernel.attributes[device] returns a per-device view; values match.""" + kernel, _ = get_saxpy_kernel_cubin + dev = Device() + + default_view = kernel.attributes + int_view = kernel.attributes[dev.device_id] + dev_view = kernel.attributes[dev] + + # Same value via every access path (default view = current device). + assert default_view.num_regs == int_view.num_regs == dev_view.num_regs + assert default_view.max_threads_per_block == int_view.max_threads_per_block + + # The bound views are distinct objects from the default view. + assert int_view is not default_view + assert int_view is not dev_view + + +def test_kernel_attributes_indexing_rejects_invalid_device(get_saxpy_kernel_cubin): + """kernel.attributes[bad] raises through the Device(...) constructor.""" + kernel, _ = get_saxpy_kernel_cubin + with pytest.raises((TypeError, ValueError, OverflowError)): + kernel.attributes["not a device"] + + def test_kernel_occupancy_init_disabled(): with pytest.raises(RuntimeError, match=r"^KernelOccupancy cannot be instantiated directly\."): cuda.core._module.KernelOccupancy() # Ensure back door is locked. 
@@ -182,16 +207,18 @@ def test_get_kernel(init_cuda): ) def test_read_only_kernel_attributes(get_saxpy_kernel_cubin, attr, expected_type): kernel, _ = get_saxpy_kernel_cubin - method = getattr(kernel.attributes, attr) - # get the value without providing a device ordinal - value = method() + + # Default view: property access on the current-device view. + value = getattr(kernel.attributes, attr) assert value is not None + assert isinstance(value, expected_type), f"Expected {attr} to be of type {expected_type}, but got {type(value)}" - # get the value for each device on the system, using either the device object or ordinal + # Per-device views via __getitem__: each device, both Device and ordinal forms. for device in Device.get_all_devices(): - value = method(device) - value = method(device.device_id) - assert isinstance(value, expected_type), f"Expected {attr} to be of type {expected_type}, but got {type(value)}" + value = getattr(kernel.attributes[device], attr) + assert isinstance(value, expected_type), f"Expected {attr} to be of type {expected_type}, but got {type(value)}" + value = getattr(kernel.attributes[device.device_id], attr) + assert isinstance(value, expected_type), f"Expected {attr} to be of type {expected_type}, but got {type(value)}" def test_object_code_load_ptx(get_saxpy_kernel_ptx): @@ -384,7 +411,7 @@ def test_occupancy_max_active_block_per_multiprocessor(get_saxpy_kernel_cubin, b kernel_smem_size_per_sm = num_blocks_per_sm * smem_size_per_block assert kernel_threads_per_sm <= dev_props.max_threads_per_multiprocessor assert kernel_smem_size_per_sm <= dev_props.max_shared_memory_per_multiprocessor - assert kernel.attributes.num_regs() * num_blocks_per_sm <= dev_props.max_registers_per_multiprocessor + assert kernel.attributes.num_regs * num_blocks_per_sm <= dev_props.max_registers_per_multiprocessor @pytest.mark.parametrize("block_size_limit", [32, 64, 96, 120, 128, 256, 0]) @@ -506,7 +533,7 @@ def test_kernel_from_handle(get_saxpy_kernel_cubin): 
assert isinstance(kernel_from_handle, Kernel) # Verify we can access kernel attributes - max_threads = kernel_from_handle.attributes.max_threads_per_block() + max_threads = kernel_from_handle.attributes.max_threads_per_block assert isinstance(max_threads, int) assert max_threads > 0 @@ -524,7 +551,7 @@ def test_kernel_from_handle_no_module(get_saxpy_kernel_cubin): assert isinstance(kernel_from_handle, Kernel) # Verify we can still access kernel attributes - max_threads = kernel_from_handle.attributes.max_threads_per_block() + max_threads = kernel_from_handle.attributes.max_threads_per_block assert isinstance(max_threads, int) assert max_threads > 0 @@ -599,7 +626,7 @@ def test_kernel_from_handle_library_mismatch_warning(init_cuda): assert len(w) == 1 assert "does not match" in str(w[0].message) - assert k.attributes.max_threads_per_block() > 0 + assert k.attributes.max_threads_per_block > 0 def test_kernel_from_handle_foreign_kernel(init_cuda): @@ -615,7 +642,7 @@ def test_kernel_from_handle_foreign_kernel(init_cuda): handle = int(cu_kernel) k = Kernel.from_handle(handle) - assert k.attributes.max_threads_per_block() > 0 + assert k.attributes.max_threads_per_block > 0 def test_kernel_keeps_library_alive(init_cuda): diff --git a/cuda_core/tests/test_multiprocessing_warning.py b/cuda_core/tests/test_multiprocessing_warning.py index 0eb47daaee4..0f96e0abfbc 100644 --- a/cuda_core/tests/test_multiprocessing_warning.py +++ b/cuda_core/tests/test_multiprocessing_warning.py @@ -52,7 +52,7 @@ def test_warn_on_fork_method_allocation_handle(ipc_device): device.set_current() options = DeviceMemoryResourceOptions(max_size=2097152, ipc_enabled=True) mr = DeviceMemoryResource(device, options=options) - alloc_handle = mr.get_allocation_handle() + alloc_handle = mr.allocation_handle with patch("multiprocessing.get_start_method", return_value="fork"), warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") diff --git a/cuda_core/tests/test_object_protocols.py 
b/cuda_core/tests/test_object_protocols.py index b9f396310d7..457debc0903 100644 --- a/cuda_core/tests/test_object_protocols.py +++ b/cuda_core/tests/test_object_protocols.py @@ -232,7 +232,7 @@ def sample_ipc_buffer_descriptor(ipc_device): options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True) mr = DeviceMemoryResource(ipc_device, options=options) buf = mr.allocate(64) - return buf.get_ipc_descriptor() + return buf.ipc_descriptor @pytest.fixture @@ -240,7 +240,7 @@ def sample_ipc_event_descriptor(ipc_device): """An IPCEventDescriptor.""" stream = ipc_device.create_stream() e = stream.record(options={"ipc_enabled": True}) - return e.get_ipc_descriptor() + return e.ipc_descriptor # ============================================================================= @@ -278,8 +278,8 @@ def sample_root_node_alt(sample_graphdef_alt): def sample_empty_node(sample_graphdef): """An EmptyNode created by merging two branches.""" _skip_if_no_mempool() - a = sample_graphdef.alloc(ALLOC_SIZE) - b = sample_graphdef.alloc(ALLOC_SIZE) + a = sample_graphdef.allocate(ALLOC_SIZE) + b = sample_graphdef.allocate(ALLOC_SIZE) return sample_graphdef.join(a, b) @@ -287,8 +287,8 @@ def sample_empty_node(sample_graphdef): def sample_empty_node_alt(sample_graphdef): """An alternate EmptyNode from same graph.""" _skip_if_no_mempool() - c = sample_graphdef.alloc(ALLOC_SIZE) - d = sample_graphdef.alloc(ALLOC_SIZE) + c = sample_graphdef.allocate(ALLOC_SIZE) + d = sample_graphdef.allocate(ALLOC_SIZE) return sample_graphdef.join(c, d) @@ -296,14 +296,14 @@ def sample_empty_node_alt(sample_graphdef): def sample_alloc_node(sample_graphdef): """An AllocNode.""" _skip_if_no_mempool() - return sample_graphdef.alloc(ALLOC_SIZE) + return sample_graphdef.allocate(ALLOC_SIZE) @pytest.fixture def sample_alloc_node_alt(sample_graphdef): """An alternate AllocNode from same graph.""" _skip_if_no_mempool() - return sample_graphdef.alloc(ALLOC_SIZE) + return sample_graphdef.allocate(ALLOC_SIZE) 
@pytest.fixture @@ -328,23 +328,23 @@ def sample_kernel_node_alt(sample_graphdef, init_cuda): def sample_free_node(sample_graphdef): """A FreeNode.""" _skip_if_no_mempool() - alloc = sample_graphdef.alloc(ALLOC_SIZE) - return alloc.free(alloc.dptr) + alloc = sample_graphdef.allocate(ALLOC_SIZE) + return alloc.deallocate(alloc.dptr) @pytest.fixture def sample_free_node_alt(sample_graphdef): """An alternate FreeNode from same graph.""" _skip_if_no_mempool() - alloc = sample_graphdef.alloc(ALLOC_SIZE) - return alloc.free(alloc.dptr) + alloc = sample_graphdef.allocate(ALLOC_SIZE) + return alloc.deallocate(alloc.dptr) @pytest.fixture def sample_memset_node(sample_graphdef): """A MemsetNode.""" _skip_if_no_mempool() - alloc = sample_graphdef.alloc(ALLOC_SIZE) + alloc = sample_graphdef.allocate(ALLOC_SIZE) return alloc.memset(alloc.dptr, 0, ALLOC_SIZE) @@ -352,7 +352,7 @@ def sample_memset_node(sample_graphdef): def sample_memset_node_alt(sample_graphdef): """An alternate MemsetNode from same graph.""" _skip_if_no_mempool() - alloc = sample_graphdef.alloc(ALLOC_SIZE) + alloc = sample_graphdef.allocate(ALLOC_SIZE) return alloc.memset(alloc.dptr, 0, ALLOC_SIZE) @@ -360,8 +360,8 @@ def sample_memset_node_alt(sample_graphdef): def sample_memcpy_node(sample_graphdef): """A MemcpyNode.""" _skip_if_no_mempool() - src = sample_graphdef.alloc(ALLOC_SIZE) - dst = sample_graphdef.alloc(ALLOC_SIZE) + src = sample_graphdef.allocate(ALLOC_SIZE) + dst = sample_graphdef.allocate(ALLOC_SIZE) dep = sample_graphdef.join(src, dst) return dep.memcpy(dst.dptr, src.dptr, ALLOC_SIZE) @@ -370,8 +370,8 @@ def sample_memcpy_node(sample_graphdef): def sample_memcpy_node_alt(sample_graphdef): """An alternate MemcpyNode from same graph.""" _skip_if_no_mempool() - src = sample_graphdef.alloc(ALLOC_SIZE) - dst = sample_graphdef.alloc(ALLOC_SIZE) + src = sample_graphdef.allocate(ALLOC_SIZE) + dst = sample_graphdef.allocate(ALLOC_SIZE) dep = sample_graphdef.join(src, dst) return dep.memcpy(dst.dptr, 
src.dptr, ALLOC_SIZE) @@ -400,28 +400,28 @@ def sample_child_graph_node_alt(sample_graphdef): def sample_event_record_node(sample_graphdef, sample_device): """An EventRecordNode.""" event = sample_device.create_event() - return sample_graphdef.record_event(event) + return sample_graphdef.record(event) @pytest.fixture def sample_event_record_node_alt(sample_graphdef, sample_device): """An alternate EventRecordNode from same graph.""" event = sample_device.create_event() - return sample_graphdef.record_event(event) + return sample_graphdef.record(event) @pytest.fixture def sample_event_wait_node(sample_graphdef, sample_device): """An EventWaitNode.""" event = sample_device.create_event() - return sample_graphdef.wait_event(event) + return sample_graphdef.wait(event) @pytest.fixture def sample_event_wait_node_alt(sample_graphdef, sample_device): """An alternate EventWaitNode from same graph.""" event = sample_device.create_event() - return sample_graphdef.wait_event(event) + return sample_graphdef.wait(event) @pytest.fixture @@ -460,14 +460,14 @@ def sample_condition_alt(sample_graphdef): def sample_if_node(sample_graphdef): """An IfNode.""" condition = try_create_condition(sample_graphdef) - return sample_graphdef.if_cond(condition) + return sample_graphdef.if_then(condition) @pytest.fixture def sample_if_node_alt(sample_graphdef): """An alternate IfNode from same graph.""" condition = try_create_condition(sample_graphdef) - return sample_graphdef.if_cond(condition) + return sample_graphdef.if_then(condition) @pytest.fixture @@ -670,7 +670,7 @@ def sample_switch_node_alt(sample_graphdef): ( "sample_launch_config", r"LaunchConfig\(grid=\(\d+, \d+, \d+\), cluster=.+, block=\(\d+, \d+, \d+\), " - r"shmem_size=\d+, cooperative_launch=(?:True|False)\)", + r"shmem_size=\d+, is_cooperative=(?:True|False)\)", ), ("sample_kernel", r""), # ObjectCode variations (by code_type)