Posted on

Table of Contents

TL;DR: you can use this example repository.

Compiling Rust to PTX

[lib]
crate-type = ["cdylib"]
#![no_std]
#![feature(abi_ptx, core_intrinsics)]
#![feature(asm_experimental_arch)]

use core::arch::asm;

#[no_mangle]
/// Kernel entry point: this is the symbol the host code launches on the GPU.
///
/// It only turns the raw `(pointer, length)` pairs into slices and hands them
/// over to `safe_kernel`, which contains the actual work.
///
/// # Safety
///
/// `input` must point to `input_len` valid `u32`s and `output` to `output_len`
/// valid `u32`s, as required by `core::slice::from_raw_parts_mut`.
///
/// NOTE(review): every thread that runs this kernel builds its own `&mut`
/// slice over the same two buffers; confirm this aliasing is acceptable.
pub unsafe extern "ptx-kernel" fn my_kernel(
    input: *mut u32,
    input_len: usize,
    output: *mut u32,
    output_len: usize,
) {
    let input_slice = core::slice::from_raw_parts_mut(input, input_len);
    let output_slice = core::slice::from_raw_parts_mut(output, output_len);
    safe_kernel(input_slice, output_slice)
}

/// Body of the kernel, written against slices instead of raw pointers.
///
/// Each thread computes its global index from its block and thread index and
/// stores the current value of `clock()` in `output` at that index. `input`
/// is accepted but not read.
///
/// # Panics
///
/// Panics if the computed index is not smaller than `output.len()`.
fn safe_kernel(input: &mut [u32], output: &mut [u32]) {
    /// Threads per block; must match the block size the host launches with.
    const THREADS_PER_BLOCK: usize = 1024;

    // Nothing is read from `input` yet; this only marks it as used.
    let _ = input;

    // Widen to `usize` before multiplying so the index cannot wrap in `u32`,
    // and because slices are indexed with `usize`.
    let idx = block_idx_x() as usize * THREADS_PER_BLOCK + thread_idx_x() as usize;

    output[idx] = clock();
}
#[panic_handler]
/// Panic handler for the `no_std` GPU crate.
///
/// Executes the breakpoint intrinsic and then marks the rest of the function
/// as unreachable. The panic information is ignored.
///
/// # Safety
///
/// If execution continues past the breakpoint, reaching
/// `unreachable_unchecked` is undefined behaviour.
pub unsafe fn breakpoint_panic_handler(_info: &core::panic::PanicInfo) -> ! {
    core::intrinsics::breakpoint();
    core::hint::unreachable_unchecked()
}

Then we can manually create our intrinsics:

/// Reads the PTX `%clock` special register and returns its value.
#[inline(always)]
pub fn clock() -> u32 {
    let ticks: u32;
    // SAFETY: the instruction only moves a special register into the output
    // register operand; it takes no inputs and touches no memory.
    unsafe {
        asm!(
            "mov.u32 {output}, %clock;",
            output = out(reg32) ticks,
        );
    }
    ticks
}

/// Reads the PTX `%tid.x` special register (x component of the thread index)
/// and returns its value.
#[inline(always)]
pub fn thread_idx_x() -> u32 {
    let tid: u32;
    // SAFETY: the instruction only moves a special register into the output
    // register operand; it takes no inputs and touches no memory.
    unsafe {
        asm!(
            "mov.u32 {r}, %tid.x;",
            r = out(reg32) tid,
        );
    }
    tid
}

/// Reads the PTX `%ctaid.x` special register (x component of the block index)
/// and returns its value.
#[inline(always)]
pub fn block_idx_x() -> u32 {
    let ctaid: u32;
    // SAFETY: the instruction only moves a special register into the output
    // register operand; it takes no inputs and touches no memory.
    unsafe {
        asm!(
            "mov.u32 {r}, %ctaid.x;",
            r = out(reg32) ctaid,
        );
    }
    ctaid
}

Loading and executing a PTX

[dependencies]
cuda-driver-sys="0.3.0"
cuda-runtime-sys="0.3.0-alpha.1"
use cuda_driver_sys::*;
use cuda_runtime_sys::*;
use std::ffi::{CString, c_void};

/// The path to the PTX generated by the GPU code.
///
/// This is a relative path that is handed to `cuModuleLoad` as-is; presumably
/// it is resolved against the directory the host binary is run from, which
/// would then have to be a sibling of `gpu_code` — TODO confirm.
const PTX_PATH: &str = "../gpu_code/target/nvptx64-nvidia-cuda/release/gpu_code.ptx";

/// allocate a buffer in the device
unsafe fn allocate<T>(size: usize) -> CUdeviceptr {
    let mut dptr: CUdeviceptr = 0;
    let error = cuMemAlloc_v2(
        &mut dptr as *mut CUdeviceptr, 
        size * core::mem::size_of::<T>()
    );
    assert_eq!(error, cudaError_enum::CUDA_SUCCESS);
    dptr
}
fn main() {unsafe{
    // Init the cuda library
    cuInit(0);

    // Get the first available device
    let mut device: CUdevice = 0;
    let error = cuDeviceGet(&mut device as *mut CUdevice, 0);    
    assert_eq!(error, cudaError_enum::CUDA_SUCCESS);

    // create a context
    let mut context: CUcontext = core::ptr::null_mut();
    let error = cuCtxCreate_v2(
        &mut context as *mut CUcontext, 
        cudaDeviceScheduleAuto, 
        device
    );
    assert_eq!(error, cudaError_enum::CUDA_SUCCESS);

    // Load the PTX file
    let mut module: CUmodule = core::ptr::null_mut();
    let file_name = CString::new(PTX_PATH).unwrap();
    let error = cuModuleLoad(
        &mut module as *mut CUmodule,
        file_name.as_ptr(),
    );
    assert_eq!(error, cudaError_enum::CUDA_SUCCESS);

    // Create a stream
    let mut stream = core::mem::MaybeUninit::uninit().assume_init();
    let error = cuStreamCreate(
        &mut stream as *mut CUstream, 
        0,
    );
    assert_eq!(error, cudaError_enum::CUDA_SUCCESS);

    // allocate the results buffer in the device
    let mut input_len = 4096 / core::mem::size_of::<u32>();
    let mut inputs = allocate::<u32>(input_len); 
    let mut output_len = 1024 * 1024;
    let mut outputs = allocate::<u32>(output_len); 

    // get the kernel function to call
    let func_name = CString::new("my_kernel").unwrap();
    let mut func: CUfunction = core::ptr::null_mut();
    let error = cuModuleGetFunction(
        &mut func as *mut CUfunction, 
        module, 
        func_name.as_ptr(),
    );
    assert_eq!(error, cudaError_enum::CUDA_SUCCESS);

    // Run the kernel
    let mut args = vec![
        &mut inputs     as *mut _ as *mut c_void, 
        &mut input_len  as *mut _ as *mut c_void,
        &mut outputs    as *mut _ as *mut c_void, 
        &mut output_len as *mut _ as *mut c_void,
    ];
    let error = cuLaunchKernel(
        func, 
        1024,
        1,
        1,
        1024,
        1,
        1,
        0,
        stream,
        args.as_mut_ptr(),
        core::ptr::null_mut(),
    );
    assert_eq!(error, cudaError_enum::CUDA_SUCCESS);

    // wait for the gpu to finish
    let error = cuStreamSynchronize(stream);
    assert_eq!(error, cudaError_enum::CUDA_SUCCESS);

    // Copy back the results from the device to the host
    let mut result_buffer = vec![0_u32; output_len];
    let error = cuMemcpyDtoH_v2(
        result_buffer.as_mut_ptr() as _,
        outputs,
        output_len * core::mem::size_of::<u32>(),
    );
    assert_eq!(error, cudaError_enum::CUDA_SUCCESS);

    // Print the results
    println!("{:?}", result_buffer);
}}

Putting it together

# add the toolchain with:

# Add the target to compile for PTX
rustup target add nvptx64-nvidia-cuda

# install the linker
cargo install ptx-linker -f --version ">= 0.9"

# Compile the PTX file (--manifest-path must name the Cargo.toml file itself)
cargo build --release --manifest-path="gpu_code/Cargo.toml" --target="nvptx64-nvidia-cuda"
# Compile the cpu code, loading the compiled ptx
cargo run --release --manifest-path="cpu_code/Cargo.toml"