summaryrefslogtreecommitdiffstats
path: root/library/stdarch/crates/core_arch/src/nvptx/mod.rs
diff options
context:
space:
mode:
Diffstat (limited to 'library/stdarch/crates/core_arch/src/nvptx/mod.rs')
-rw-r--r--library/stdarch/crates/core_arch/src/nvptx/mod.rs213
1 files changed, 213 insertions, 0 deletions
diff --git a/library/stdarch/crates/core_arch/src/nvptx/mod.rs b/library/stdarch/crates/core_arch/src/nvptx/mod.rs
new file mode 100644
index 000000000..bf6673f47
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/nvptx/mod.rs
@@ -0,0 +1,213 @@
+//! NVPTX intrinsics (experimental)
+//!
+//! These intrinsics form the foundation of the CUDA
+//! programming model.
+//!
+//! The reference is the [CUDA C Programming Guide][cuda_c]. Relevant is also
+//! the [LLVM NVPTX Backend documentation][llvm_docs].
+//!
+//! [cuda_c]:
+//! http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
+//! [llvm_docs]:
+//! https://llvm.org/docs/NVPTXUsage.html
+
+use crate::ffi::c_void;
+
+#[allow(improper_ctypes)]
+extern "C" {
+ #[link_name = "llvm.nvvm.barrier0"]
+ fn syncthreads() -> ();
+ #[link_name = "llvm.nvvm.read.ptx.sreg.ntid.x"]
+ fn block_dim_x() -> i32;
+ #[link_name = "llvm.nvvm.read.ptx.sreg.ntid.y"]
+ fn block_dim_y() -> i32;
+ #[link_name = "llvm.nvvm.read.ptx.sreg.ntid.z"]
+ fn block_dim_z() -> i32;
+ #[link_name = "llvm.nvvm.read.ptx.sreg.ctaid.x"]
+ fn block_idx_x() -> i32;
+ #[link_name = "llvm.nvvm.read.ptx.sreg.ctaid.y"]
+ fn block_idx_y() -> i32;
+ #[link_name = "llvm.nvvm.read.ptx.sreg.ctaid.z"]
+ fn block_idx_z() -> i32;
+ #[link_name = "llvm.nvvm.read.ptx.sreg.nctaid.x"]
+ fn grid_dim_x() -> i32;
+ #[link_name = "llvm.nvvm.read.ptx.sreg.nctaid.y"]
+ fn grid_dim_y() -> i32;
+ #[link_name = "llvm.nvvm.read.ptx.sreg.nctaid.z"]
+ fn grid_dim_z() -> i32;
+ #[link_name = "llvm.nvvm.read.ptx.sreg.tid.x"]
+ fn thread_idx_x() -> i32;
+ #[link_name = "llvm.nvvm.read.ptx.sreg.tid.y"]
+ fn thread_idx_y() -> i32;
+ #[link_name = "llvm.nvvm.read.ptx.sreg.tid.z"]
+ fn thread_idx_z() -> i32;
+}
+
+/// Synchronizes all threads in the block.
+#[inline]
+pub unsafe fn _syncthreads() -> () {
+ syncthreads()
+}
+
+/// x-th thread-block dimension.
+#[inline]
+pub unsafe fn _block_dim_x() -> i32 {
+ block_dim_x()
+}
+
+/// y-th thread-block dimension.
+#[inline]
+pub unsafe fn _block_dim_y() -> i32 {
+ block_dim_y()
+}
+
+/// z-th thread-block dimension.
+#[inline]
+pub unsafe fn _block_dim_z() -> i32 {
+ block_dim_z()
+}
+
+/// x-th thread-block index.
+#[inline]
+pub unsafe fn _block_idx_x() -> i32 {
+ block_idx_x()
+}
+
+/// y-th thread-block index.
+#[inline]
+pub unsafe fn _block_idx_y() -> i32 {
+ block_idx_y()
+}
+
+/// z-th thread-block index.
+#[inline]
+pub unsafe fn _block_idx_z() -> i32 {
+ block_idx_z()
+}
+
+/// x-th block-grid dimension.
+#[inline]
+pub unsafe fn _grid_dim_x() -> i32 {
+ grid_dim_x()
+}
+
+/// y-th block-grid dimension.
+#[inline]
+pub unsafe fn _grid_dim_y() -> i32 {
+ grid_dim_y()
+}
+
+/// z-th block-grid dimension.
+#[inline]
+pub unsafe fn _grid_dim_z() -> i32 {
+ grid_dim_z()
+}
+
+/// x-th thread index.
+#[inline]
+pub unsafe fn _thread_idx_x() -> i32 {
+ thread_idx_x()
+}
+
+/// y-th thread index.
+#[inline]
+pub unsafe fn _thread_idx_y() -> i32 {
+ thread_idx_y()
+}
+
+/// z-th thread index.
+#[inline]
+pub unsafe fn _thread_idx_z() -> i32 {
+ thread_idx_z()
+}
+
+/// Generates the trap instruction `TRAP`
+#[inline]
+pub unsafe fn trap() -> ! {
+ crate::intrinsics::abort()
+}
+
+// Basic CUDA syscall declarations.
+extern "C" {
+ /// Print formatted output from a kernel to a host-side output stream.
+ ///
+ /// Syscall arguments:
+ /// * `status`: The status value that is returned by `vprintf`.
+ /// * `format`: A pointer to the format specifier input (uses common `printf` format).
+ /// * `valist`: A pointer to the valist input.
+ ///
+ /// ```
+ /// #[repr(C)]
+ /// struct PrintArgs(f32, f32, f32, i32);
+ ///
+ /// vprintf(
+ /// "int(%f + %f) = int(%f) = %d\n".as_ptr(),
+ /// transmute(&PrintArgs(a, b, a + b, (a + b) as i32)),
+ /// );
+ /// ```
+ ///
+ /// Sources:
+ /// [Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#formatted-output),
+ /// [PTX Interoperability](https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls).
+ pub fn vprintf(format: *const u8, valist: *const c_void) -> i32;
+
+ /// Allocate memory dynamically from a fixed-size heap in global memory.
+ ///
+ /// The CUDA in-kernel `malloc()` function allocates at least `size` bytes
+ /// from the device heap and returns a pointer to the allocated memory
+ /// or `NULL` if insufficient memory exists to fulfill the request.
+ ///
+ /// The returned pointer is guaranteed to be aligned to a 16-byte boundary.
+ ///
+ /// The memory allocated by a given CUDA thread via `malloc()` remains allocated
+ /// for the lifetime of the CUDA context, or until it is explicitly released
+ /// by a call to `free()`. It can be used by any other CUDA threads
+ /// even from subsequent kernel launches.
+ ///
+ /// Sources:
+ /// [Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#dynamic-global-memory-allocation-and-operations),
+ /// [PTX Interoperability](https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls).
+ // FIXME(denzp): assign `malloc` and `nothrow` attributes.
+ pub fn malloc(size: usize) -> *mut c_void;
+
+ /// Free previously dynamically allocated memory.
+ ///
+ /// The CUDA in-kernel `free()` function deallocates the memory pointed to by `ptr`,
+ /// which must have been returned by a previous call to `malloc()`. If `ptr` is NULL,
+ /// the call to `free()` is ignored.
+ ///
+ /// Any CUDA thread may free memory allocated by another thread, but care should be taken
+ /// to ensure that the same pointer is not freed more than once. Repeated calls to `free()`
+ /// with the same `ptr` has undefined behavior.
+ ///
+ /// Sources:
+ /// [Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#dynamic-global-memory-allocation-and-operations),
+ /// [PTX Interoperability](https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls).
+ // FIXME(denzp): assign `nothrow` attribute.
+ pub fn free(ptr: *mut c_void);
+
+ // Internal declaration of the syscall. Exported variant has
+ // the `char_size` parameter set to `1` (single char size in bytes).
+ fn __assertfail(
+ message: *const u8,
+ file: *const u8,
+ line: u32,
+ function: *const u8,
+ char_size: usize,
+ );
+}
+
+/// Syscall to be used whenever the *assert expression produces a `false` value*.
+///
+/// Syscall arguments:
+/// * `message`: The pointer to the string that should be output.
+/// * `file`: The pointer to the file name string associated with the assert.
+/// * `line`: The line number associated with the assert.
+/// * `function`: The pointer to the function name string associated with the assert.
+///
+/// Source:
+/// [PTX Interoperability](https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls).
+#[inline]
+pub unsafe fn __assert_fail(message: *const u8, file: *const u8, line: u32, function: *const u8) {
+ __assertfail(message, file, line, function, 1)
+}