23 changes: 23 additions & 0 deletions cuda_core/cuda/core/_device.pyx
@@ -1034,6 +1034,29 @@ class Device:
        total = system.get_num_devices()
        return tuple(cls(device_id) for device_id in range(total))

    def to_system_device(self) -> 'cuda.core.system.Device':
        """
        Get the corresponding :class:`cuda.core.system.Device` (which is used
        for NVIDIA Management Library (NVML) access) for this
        :class:`cuda.core.Device` (which is used for CUDA access).

        The devices are mapped to one another by their UUID.

        Returns
        -------
        cuda.core.system.Device
            The corresponding system-level device instance used for NVML access.
        """
        from cuda.core.system._system import CUDA_BINDINGS_NVML_IS_COMPATIBLE

        if not CUDA_BINDINGS_NVML_IS_COMPATIBLE:
            raise RuntimeError(
                "cuda.core.system.Device requires cuda_bindings 13.1.2+ or 12.9.6+"
            )

        from cuda.core.system import Device as SystemDevice
        return SystemDevice(uuid=self.uuid)

    @property
    def device_id(self) -> int:
        """Return device ordinal."""
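As a usage illustration only (not part of the diff): a minimal sketch of round-tripping between the two device types, assuming an NVML-capable cuda.bindings is installed. The names used here (Device(), uuid, device_id, to_system_device, to_cuda_device) all appear elsewhere in this PR.

from cuda.core import Device

dev = Device()  # CUDA-side device for the current/default ordinal
try:
    sys_dev = dev.to_system_device()  # NVML-side device, matched by UUID
except RuntimeError:
    sys_dev = None  # cuda_bindings too old for NVML support (see the version check above)

if sys_dev is not None:
    assert sys_dev.uuid == dev.uuid
    assert sys_dev.to_cuda_device().device_id == dev.device_id
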
34 changes: 33 additions & 1 deletion cuda_core/cuda/core/system/_device.pyx
@@ -722,6 +722,30 @@ cdef class Device:
pci_bus_id = pci_bus_id.decode("ascii")
self._handle = nvml.device_get_handle_by_pci_bus_id_v2(pci_bus_id)

    def to_cuda_device(self) -> "cuda.core.Device":
        """
        Get the corresponding :class:`cuda.core.Device` (which is used for CUDA
        access) for this :class:`cuda.core.system.Device` (which is used for
        NVIDIA Management Library (NVML) access).

        The devices are mapped to one another by their UUID.

        Returns
        -------
        cuda.core.Device
            The corresponding CUDA device.
        """
        from cuda.core import Device as CudaDevice

        # CUDA does not have an API to get a device by its UUID, so we just
        # search all the devices for one with a matching UUID.

        for cuda_device in CudaDevice.get_all_devices():
Collaborator:

Consider: Can we memoize this call so it only does the linear search once and caches the result?

Contributor Author:

Probably in most cases that would be fine, but hot-plugging devices is possible, so the device with the same UUID can change device handles after a replug. Hot-plugging a device during a running process that cares about it probably has all kinds of other problems, but since we can't predict how the application works, it's probably better to put caching in the hands of the user of this API.

            if cuda_device.uuid == self.uuid:
                return cuda_device

raise RuntimeError("No corresponding CUDA device found for this NVML device.")
Contributor:

More of a rant, I guess, than anything: I know this is pre-existing, but I really dislike that we raise RuntimeError everywhere.

Contributor Author:

What would be more appropriate here? A custom exception along the lines of DeviceNotFoundError?
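On the exception question above, a hypothetical sketch of what such a custom exception could look like (the name DeviceNotFoundError comes from the comment; its definition and placement are not part of this PR). Deriving it from RuntimeError would keep any existing except RuntimeError handlers working.

class DeviceNotFoundError(RuntimeError):
    """Raised when no device matching the requested UUID or handle can be found."""
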
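And on the memoization question in the earlier thread: if caching is left to the user as the author suggests, a minimal user-side sketch (hypothetical helper, not part of the PR) could cache the UUID lookup and be explicitly invalidated after a hot-plug event.

from functools import lru_cache

from cuda.core import Device as CudaDevice


@lru_cache(maxsize=None)
def cuda_device_by_uuid(uuid):
    # One linear scan per UUID, cached; call cuda_device_by_uuid.cache_clear()
    # after a hot-plug event to force a fresh lookup.
    for dev in CudaDevice.get_all_devices():
        if dev.uuid == uuid:
            return dev
    raise RuntimeError(f"No CUDA device with UUID {uuid}")
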


    @classmethod
    def get_device_count(cls) -> int:
        """
@@ -1036,8 +1060,16 @@ cdef class Device:
        Retrieves the globally unique immutable UUID associated with this
        device, as a 5-part hexadecimal string, that augments the immutable
        board serial identifier.

        In the upstream NVML C API, the UUID includes a ``gpu-`` or ``mig-``
        prefix. That prefix is not included in ``cuda.core.system``.
        """
        return nvml.device_get_uuid(self._handle)
        # NVML UUIDs have a `GPU-` or `MIG-` prefix. We remove that here.

        # TODO: If the user cares about the prefix, we will expose that in the
        # future using the MIG-related APIs in NVML.

        return nvml.device_get_uuid(self._handle)[4:]
Comment on lines +1069 to +1072

Member:

I am thinking the system device should return the full UUID and we document the different expectations between cuda.core and cuda.core.system (or CUDA vs NVML). @mdboom thoughts?

Contributor Author:

I totally can see it both ways. My original implementation did what you suggested (following upstream NVML behavior). But @cpcloud convinced me this is weird -- UUID has a well-defined meaning in our field that NVML deviates from. I don't feel super strongly either way -- we just need to break the tie ;)

Member:

IIUC NVML is the only way for us to tell, from inside a running process, if we are using MIG instances or otherwise (bare-metal GPU, MPS, etc.). CUDA purposely hides MIG from end users. So my thinking is that if we don't follow NVML, there is no other way for Python users to query. Could you check if my impression is correct?

Contributor Author:

There is another API, nvmlDeviceIsMigDeviceHandle, that could be used to query whether it's MIG, and IMHO, that's better than the user needing to parse a string to get that info.

/**
 * Test if the given handle refers to a MIG device.
 *
 * A MIG device handle is an NVML abstraction which maps to a MIG compute instance.
 * These overloaded references can be used (with some restrictions) interchangeably
 * with a GPU device handle to execute queries at a per-compute instance granularity.
 *
 * For Ampere™ or newer fully supported devices.
 * Supported on Linux only.
 *
 * @param device                               NVML handle to test
 * @param isMigDevice                          True when handle refers to a MIG device
 */
nvmlReturn_t DECLDIR nvmlDeviceIsMigDeviceHandle(nvmlDevice_t device, unsigned int *isMigDevice);
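For reference, a hedged sketch of how that check might be surfaced on the system device; the snake_case wrapper name device_is_mig_device_handle is an assumption inferred from the naming of the other wrappers used in this diff (e.g. device_get_uuid), and the method itself is not part of this PR.

    def is_mig_device(self) -> bool:
        """Return True if this NVML handle refers to a MIG compute instance."""
        # Assumed wrapper name, mirroring nvmlDeviceIsMigDeviceHandle in the NVML C API.
        return bool(nvml.device_is_mig_device_handle(self._handle))
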


    def register_events(self, events: EventType | int | list[EventType | int]) -> DeviceEvents:
        """
17 changes: 17 additions & 0 deletions cuda_core/tests/system/test_system_device.py
@@ -33,6 +33,23 @@ def test_device_count():
    assert system.Device.get_device_count() == system.get_num_devices()


def test_to_cuda_device():
    from cuda.core import Device as CudaDevice

    for device in system.Device.get_all_devices():
        cuda_device = device.to_cuda_device()

        assert isinstance(cuda_device, CudaDevice)
        assert cuda_device.uuid == device.uuid

        # Technically, this test will only work with PCI devices, but are there
        # non-PCI devices we need to support?

        # CUDA only returns a 2-byte PCI bus ID domain, whereas NVML returns a
        # 4-byte domain
        assert cuda_device.pci_bus_id == device.pci_info.bus_id[4:]
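To make that domain-width difference concrete (the example bus IDs below are assumed values, for illustration only): NVML reports a 4-byte (8 hex digit) PCI domain while CUDA reports a 2-byte (4 hex digit) domain, so slicing off the first four characters of the NVML form yields the CUDA form.

nvml_bus_id = "00000000:3B:00.0"  # NVML-style: 8 hex digit domain (assumed example)
cuda_bus_id = "0000:3B:00.0"      # CUDA-style: 4 hex digit domain (assumed example)
assert nvml_bus_id[4:] == cuda_bus_id
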


def test_device_architecture():
    for device in system.Device.get_all_devices():
        device_arch = device.architecture
24 changes: 24 additions & 0 deletions cuda_core/tests/test_device.py
@@ -25,6 +25,30 @@ def cuda_version():
    return _py_major_ver, _driver_ver


def test_to_system_device(deinit_cuda):
    from cuda.core.system import _system

    device = Device()

    if not _system.CUDA_BINDINGS_NVML_IS_COMPATIBLE:
        with pytest.raises(RuntimeError):
            device.to_system_device()
        pytest.skip("NVML support requires cuda.bindings version 12.9.6+ or 13.1.2+")

    from cuda.core.system import Device as SystemDevice

    system_device = device.to_system_device()
    assert isinstance(system_device, SystemDevice)
    assert system_device.uuid == device.uuid

    # Technically, this test will only work with PCI devices, but are there
    # non-PCI devices we need to support?

    # CUDA only returns a 2-byte PCI bus ID domain, whereas NVML returns a
    # 4-byte domain
    assert device.pci_bus_id == system_device.pci_info.bus_id[4:]


def test_device_set_current(deinit_cuda):
    device = Device()
    device.set_current()