CUDA kernels in Rust (Draft)
TL;DR: you can start from this example repository.
Compiling Rust to PTX
The GPU crate has to be compiled as a cdylib so the PTX linker can emit a single module, so in gpu_code/Cargo.toml we set:
[lib]
crate-type = ["cdylib"]
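For reference, a minimal gpu_code/Cargo.toml could look like this (the package name matches the paths used later; the version and edition are assumptions):
[package]
name = "gpu_code"
version = "0.1.0"
edition = "2018"

[lib]
crate-type = ["cdylib"]
Then, in src/lib.rs, we enable the required nightly features and write the kernel: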
#![no_std]
#![feature(abi_ptx, core_intrinsics)]
#![feature(asm_experimental_arch)]
use core::arch::asm;
/// The kernel entry point: the function the CPU code launches on the GPU.
#[no_mangle]
pub unsafe extern "ptx-kernel" fn my_kernel(
    input: *mut u32,
    input_len: usize,
    output: *mut u32,
    output_len: usize,
) {
    safe_kernel(
        core::slice::from_raw_parts_mut(input, input_len),
        core::slice::from_raw_parts_mut(output, output_len),
    )
}
fn safe_kernel(_input: &mut [u32], output: &mut [u32]) {
    // Global thread index: the kernel is launched with 1024 threads per block
    let idx = (block_idx_x() * 1024 + thread_idx_x()) as usize;
    output[idx] = clock();
}
#[panic_handler]
fn breakpoint_panic_handler(_: &core::panic::PanicInfo) -> ! {
    unsafe {
        core::intrinsics::breakpoint();
        core::hint::unreachable_unchecked();
    }
}
The nvptx target does not expose the special registers (thread index, block index, clock, ...) as library functions, so we can write the intrinsics ourselves with inline PTX assembly:
#[inline(always)]
pub fn clock() -> u32 {
    let mut result: u32;
    unsafe {
        // Read %clock, a per-SM cycle counter
        asm!(
            "mov.u32 {output}, %clock;",
            output = out(reg32) result,
        );
    }
    result
}
#[inline(always)]
pub fn thread_idx_x() -> u32 {
    let mut result: u32;
    unsafe {
        // Read %tid.x, the thread index within the block
        asm!(
            "mov.u32 {r}, %tid.x;",
            r = out(reg32) result,
        );
    }
    result
}
#[inline(always)]
pub fn block_idx_x() -> u32 {
    let mut result: u32;
    unsafe {
        // Read %ctaid.x, the block index within the grid
        asm!(
            "mov.u32 {r}, %ctaid.x;",
            r = out(reg32) result,
        );
    }
    result
}
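The same pattern extends to any other PTX special register. For example, a hypothetical block_dim_x intrinsic (my addition, not in the original code) reading %ntid.x, the number of threads per block, would let safe_kernel avoid hard-coding the 1024 stride:
#[inline(always)]
pub fn block_dim_x() -> u32 {
    let mut result: u32;
    unsafe {
        // Read %ntid.x, the number of threads per block
        asm!(
            "mov.u32 {r}, %ntid.x;",
            r = out(reg32) result,
        );
    }
    result
}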
Loading and executing the PTX
On the CPU side we talk to the CUDA driver API through raw bindings, so cpu_code/Cargo.toml needs:
[dependencies]
cuda-driver-sys="0.3.0"
cuda-runtime-sys="0.3.0-alpha.1"
use cuda_driver_sys::*;
use cuda_runtime_sys::*;
use std::ffi::{CString, c_void};
/// The path to the PTX generated by the GPU code
const PTX_PATH: &str = "../gpu_code/target/nvptx64-nvidia-cuda/release/gpu_code.ptx";
/// Allocate a buffer of `size` elements of type `T` on the device
unsafe fn allocate<T>(size: usize) -> CUdeviceptr {
    let mut dptr: CUdeviceptr = 0;
    let error = cuMemAlloc_v2(
        &mut dptr as *mut CUdeviceptr,
        size * core::mem::size_of::<T>(),
    );
    assert_eq!(error, cudaError_enum::CUDA_SUCCESS);
    dptr
}
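Every driver call below returns a status code that the example checks with assert_eq!. A small helper along these lines (a sketch of my own, not part of the original code) would cut the repetition:
/// Panic with a readable message if a driver call did not succeed.
fn check(error: cudaError_enum) {
    assert_eq!(error, cudaError_enum::CUDA_SUCCESS, "CUDA driver call failed");
}
With it, each call collapses to e.g. check(cuInit(0)) inside the unsafe block; the code below keeps the explicit asserts to stay close to the original.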
fn main() { unsafe {
    // Init the CUDA driver library
    let error = cuInit(0);
    assert_eq!(error, cudaError_enum::CUDA_SUCCESS);
    // Get the first available device
    let mut device: CUdevice = 0;
    let error = cuDeviceGet(&mut device as *mut CUdevice, 0);
    assert_eq!(error, cudaError_enum::CUDA_SUCCESS);
    // Create a context (cudaDeviceScheduleAuto is 0, the same value as CU_CTX_SCHED_AUTO)
    let mut context: CUcontext = core::ptr::null_mut();
    let error = cuCtxCreate_v2(
        &mut context as *mut CUcontext,
        cudaDeviceScheduleAuto,
        device,
    );
    assert_eq!(error, cudaError_enum::CUDA_SUCCESS);
    // Load the PTX file
    let mut module: CUmodule = core::ptr::null_mut();
    let file_name = CString::new(PTX_PATH).unwrap();
    let error = cuModuleLoad(
        &mut module as *mut CUmodule,
        file_name.as_ptr(),
    );
    assert_eq!(error, cudaError_enum::CUDA_SUCCESS);
    // Create a stream. CUstream is a raw pointer, so starting from null
    // avoids the undefined behaviour of MaybeUninit::uninit().assume_init()
    let mut stream: CUstream = core::ptr::null_mut();
    let error = cuStreamCreate(
        &mut stream as *mut CUstream,
        0,
    );
    assert_eq!(error, cudaError_enum::CUDA_SUCCESS);
    // Allocate the input and output buffers on the device;
    // the output holds one u32 per launched thread (1024 blocks * 1024 threads)
    let mut input_len = 4096 / core::mem::size_of::<u32>();
    let mut inputs = allocate::<u32>(input_len);
    let mut output_len = 1024 * 1024;
    let mut outputs = allocate::<u32>(output_len);
    // Get the kernel function to call
    let func_name = CString::new("my_kernel").unwrap();
    let mut func: CUfunction = core::ptr::null_mut();
    let error = cuModuleGetFunction(
        &mut func as *mut CUfunction,
        module,
        func_name.as_ptr(),
    );
    assert_eq!(error, cudaError_enum::CUDA_SUCCESS);
    // Each kernel argument is passed as a pointer to its value
    let mut args = vec![
        &mut inputs as *mut _ as *mut c_void,
        &mut input_len as *mut _ as *mut c_void,
        &mut outputs as *mut _ as *mut c_void,
        &mut output_len as *mut _ as *mut c_void,
    ];
    // Run the kernel
    let error = cuLaunchKernel(
        func,
        1024, 1, 1,            // grid dimensions, in blocks
        1024, 1, 1,            // block dimensions, in threads
        0,                     // dynamic shared memory, in bytes
        stream,
        args.as_mut_ptr(),
        core::ptr::null_mut(), // no extra launch options
    );
    assert_eq!(error, cudaError_enum::CUDA_SUCCESS);
    // Wait for the GPU to finish
    let error = cuStreamSynchronize(stream);
    assert_eq!(error, cudaError_enum::CUDA_SUCCESS);
    // Copy back the results from the device to the host
    let mut result_buffer = vec![0_u32; output_len];
    let error = cuMemcpyDtoH_v2(
        result_buffer.as_mut_ptr() as _,
        outputs,
        output_len * core::mem::size_of::<u32>(),
    );
    assert_eq!(error, cudaError_enum::CUDA_SUCCESS);
    // Print the results
    println!("{:?}", result_buffer);
}}
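The example never releases what it creates. Before main returns, a real program would free the device buffers and tear down the stream, module, and context; a minimal cleanup sketch (these calls would go just before the closing braces above):
// Release everything in roughly the reverse order of creation
let error = cuMemFree_v2(inputs);
assert_eq!(error, cudaError_enum::CUDA_SUCCESS);
let error = cuMemFree_v2(outputs);
assert_eq!(error, cudaError_enum::CUDA_SUCCESS);
let error = cuStreamDestroy_v2(stream);
assert_eq!(error, cudaError_enum::CUDA_SUCCESS);
let error = cuModuleUnload(module);
assert_eq!(error, cudaError_enum::CUDA_SUCCESS);
let error = cuCtxDestroy_v2(context);
assert_eq!(error, cudaError_enum::CUDA_SUCCESS);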
Putting it together
# Add the nightly toolchain (the features used above are nightly-only)
rustup toolchain install nightly
# Add the target to compile for PTX
rustup target add nvptx64-nvidia-cuda --toolchain nightly
# Install the linker
cargo install ptx-linker -f --version ">= 0.9"
# Compile the PTX file
cargo +nightly build --release --manifest-path gpu_code/Cargo.toml --target nvptx64-nvidia-cuda
# Compile and run the CPU code, which loads the compiled PTX
cargo run --release --manifest-path cpu_code/Cargo.toml
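As a quick sanity check of the build (my suggestion, not from the original draft), you can look for the kernel's entry point in the generated module, at the path PTX_PATH points to:
grep ".entry" gpu_code/target/nvptx64-nvidia-cuda/release/gpu_code.ptx
If my_kernel compiled correctly, the output should include a .visible .entry line mentioning it.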