aboutsummaryrefslogtreecommitdiffhomepage
path: root/zluda/src/impl
diff options
context:
space:
mode:
Diffstat (limited to 'zluda/src/impl')
-rw-r--r--zluda/src/impl/context.rs11
-rw-r--r--zluda/src/impl/device.rs106
-rw-r--r--zluda/src/impl/memory.rs11
-rw-r--r--zluda/src/impl/mod.rs30
-rw-r--r--zluda/src/impl/stream.rs56
5 files changed, 198 insertions, 16 deletions
diff --git a/zluda/src/impl/context.rs b/zluda/src/impl/context.rs
index 5ef427e..9ea0874 100644
--- a/zluda/src/impl/context.rs
+++ b/zluda/src/impl/context.rs
@@ -257,9 +257,14 @@ pub fn detach(pctx: *mut Context) -> Result<(), CUresult> {
})?
}
-pub(crate) fn synchronize() -> CUresult {
- // TODO: change the implementation once we do async stream operations
- CUresult::CUDA_SUCCESS
+pub(crate) fn synchronize() -> Result<(), CUresult> {
+ GlobalState::lock_current_context(|ctx| {
+ ctx.default_stream.synchronize()?;
+ for stream in ctx.streams.iter().copied() {
+ unsafe { &mut *stream }.synchronize()?;
+ }
+ Ok(())
+ })?
}
#[cfg(test)]
diff --git a/zluda/src/impl/device.rs b/zluda/src/impl/device.rs
index 63bf39f..0594252 100644
--- a/zluda/src/impl/device.rs
+++ b/zluda/src/impl/device.rs
@@ -1,4 +1,4 @@
-use super::{context, CUresult, GlobalState};
+use super::{context, transmute_lifetime, transmute_lifetime_mut, CUresult, GlobalState};
use crate::cuda;
use cuda::{CUdevice_attribute, CUuuid_st};
use std::{
@@ -21,6 +21,7 @@ pub struct Device {
pub default_queue: l0::CommandQueue<'static>,
pub l0_context: l0::Context,
pub primary_context: context::Context,
+ pub event_pool: DynamicEventPool,
properties: Option<Box<l0::sys::ze_device_properties_t>>,
image_properties: Option<Box<l0::sys::ze_device_image_properties_t>>,
memory_properties: Option<Vec<l0::sys::ze_device_memory_properties_t>>,
@@ -42,12 +43,14 @@ impl Device {
true,
ptr::null_mut(),
)?);
+ let event_pool = DynamicEventPool::new(l0_dev, transmute_lifetime(&ctx))?;
Ok(Self {
index: Index(idx as c_int),
base: l0_dev,
default_queue: queue,
l0_context: ctx,
primary_context: primary_context,
+ event_pool,
properties: None,
image_properties: None,
memory_properties: None,
@@ -395,8 +398,103 @@ pub(crate) fn primary_ctx_release_v2(_dev_idx: Index) -> CUresult {
CUresult::CUDA_SUCCESS
}
+pub struct DynamicEventPool {
+ count: usize,
+ events: Vec<DynamicEventPoolEntry>,
+}
+
+impl DynamicEventPool {
+ fn new(dev: l0::Device, ctx: &'static l0::Context) -> l0::Result<Self> {
+ Ok(DynamicEventPool {
+ count: 0,
+ events: vec![DynamicEventPoolEntry::new(dev, ctx)?],
+ })
+ }
+
+ pub fn get(
+ &'static mut self,
+ dev: l0::Device,
+ ctx: &'static l0::Context,
+ ) -> l0::Result<(l0::Event<'static>, u64)> {
+ self.count += 1;
+ let events = unsafe { transmute_lifetime_mut(&mut self.events) };
+ let (global_idx, (ev, local_idx)) = {
+ for (idx, entry) in self.events.iter_mut().enumerate() {
+ if let Some((ev, local_idx)) = entry.get()? {
+ let marker = (idx << 32) as u64 | local_idx as u64;
+ return Ok((ev, marker));
+ }
+ }
+ events.push(DynamicEventPoolEntry::new(dev, ctx)?);
+ let global_idx = (events.len() - 1) as u64;
+ (global_idx, events.last_mut().unwrap().get()?.unwrap())
+ };
+ let marker = (global_idx << 32) | local_idx as u64;
+ Ok((ev, marker))
+ }
+
+ pub fn mark_as_free(&mut self, marker: u64) {
+ let global_idx = (marker >> 32) as u32;
+ self.events[global_idx as usize].mark_as_free(marker as u32);
+ self.count -= 1;
+ // TODO: clean up empty entries
+ }
+}
+
+const DYNAMIC_EVENT_POOL_ENTRY_SIZE: usize = 448;
+const DYNAMIC_EVENT_POOL_ENTRY_BITMAP_SIZE: usize =
+ DYNAMIC_EVENT_POOL_ENTRY_SIZE / (mem::size_of::<u64>() * 8);
+#[repr(C)]
+#[repr(align(64))]
+struct DynamicEventPoolEntry {
+ event_pool: l0::EventPool<'static>,
+ bit_map: [u64; DYNAMIC_EVENT_POOL_ENTRY_BITMAP_SIZE],
+}
+
+impl DynamicEventPoolEntry {
+ fn new(dev: l0::Device, ctx: &'static l0::Context) -> l0::Result<Self> {
+ Ok(DynamicEventPoolEntry {
+ event_pool: l0::EventPool::new(
+ ctx,
+ DYNAMIC_EVENT_POOL_ENTRY_SIZE as u32,
+ Some(&[dev]),
+ )?,
+ bit_map: [0; DYNAMIC_EVENT_POOL_ENTRY_BITMAP_SIZE],
+ })
+ }
+
+ fn get(&'static mut self) -> l0::Result<Option<(l0::Event<'static>, u32)>> {
+ for (idx, value) in self.bit_map.iter_mut().enumerate() {
+ let shift = first_index_of_zero_u64(*value);
+ if shift == 64 {
+ continue;
+ }
+ *value = *value | (1u64 << shift);
+ let entry_index = (idx as u32 * 64u32) + shift;
+ let event = l0::Event::new(&self.event_pool, entry_index)?;
+ return Ok(Some((event, entry_index)));
+ }
+ Ok(None)
+ }
+
+ fn mark_as_free(&mut self, idx: u32) {
+ let value = &mut self.bit_map[idx as usize / 64];
+ let shift = idx % 64;
+ *value = *value & !(1 << shift);
+ }
+}
+
+fn first_index_of_zero_u64(x: u64) -> u32 {
+ let x = !x;
+ (x & x.wrapping_neg()).trailing_zeros()
+}
+
#[cfg(test)]
mod test {
+ use std::mem;
+
+ use super::DynamicEventPoolEntry;
+
use super::super::test::CudaDriverFns;
use super::super::CUresult;
@@ -413,4 +511,10 @@ mod test {
assert_eq!(flags, 0);
assert_eq!(active, 0);
}
+
+ #[test]
+ pub fn dynamic_event_pool_page_is_64b() {
+ assert_eq!(mem::size_of::<DynamicEventPoolEntry>(), 64);
+ assert_eq!(mem::align_of::<DynamicEventPoolEntry>(), 64);
+ }
}
diff --git a/zluda/src/impl/memory.rs b/zluda/src/impl/memory.rs
index 238d68e..81b4f31 100644
--- a/zluda/src/impl/memory.rs
+++ b/zluda/src/impl/memory.rs
@@ -11,13 +11,10 @@ pub fn alloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> Result<(), CUresult>
}
pub fn copy_v2(dst: *mut c_void, src: *const c_void, bytesize: usize) -> Result<(), CUresult> {
- GlobalState::lock_stream(stream::CU_STREAM_LEGACY, |stream| {
- let cmd_list = stream.command_list()?;
- unsafe { cmd_list.append_memory_copy_raw(dst, src, bytesize, None, &mut [])? };
- cmd_list.close()?;
- stream.queue.execute_and_synchronize(cmd_list)?;
- Ok::<_, CUresult>(())
- })?
+ GlobalState::lock_enqueue(stream::CU_STREAM_LEGACY, |cmd_list, signal, wait| {
+ unsafe { cmd_list.append_memory_copy_raw(dst, src, bytesize, Some(signal), wait)? };
+ Ok::<_, l0::sys::ze_result_t>(())
+ })
}
pub fn free_v2(ptr: *mut c_void) -> Result<(), CUresult> {
diff --git a/zluda/src/impl/mod.rs b/zluda/src/impl/mod.rs
index 55c047f..48b5fb5 100644
--- a/zluda/src/impl/mod.rs
+++ b/zluda/src/impl/mod.rs
@@ -273,6 +273,32 @@ impl GlobalState {
}
}
+ fn lock_enqueue(
+ stream: *mut stream::Stream,
+ f: impl FnOnce(
+ &mut l0::CommandList,
+ &l0::Event<'static>,
+ &[&l0::Event<'static>],
+ ) -> l0::Result<()>,
+ ) -> Result<(), CUresult> {
+ Self::lock_stream(stream, |stream_data| {
+ let l0_dev = unsafe { (*(*stream_data.context).device).base };
+ let l0_ctx = unsafe { &mut (*(*stream_data.context).device).l0_context };
+ let event_pool = unsafe { &mut (*(*stream_data.context).device).event_pool };
+ let mut cmd_list = unsafe { mem::transmute(stream_data.command_list()?) };
+ stream_data
+ .process_finished_events(&mut |(_, marker)| event_pool.mark_as_free(marker))?;
+ let prev_event = stream_data.get_last_event();
+ let prev_event_array = prev_event.map(|e| [e]);
+ let empty = [];
+ let prev_event_slice = prev_event_array.as_ref().map_or(&empty[..], |arr| &arr[..]);
+ let (new_event, new_marker) = event_pool.get(l0_dev, l0_ctx)?;
+ f(&mut cmd_list, &new_event, prev_event_slice)?;
+ stream_data.push_event((new_event, new_marker));
+ Ok(())
+ })?
+ }
+
fn lock_function<T>(
func: *mut function::Function,
f: impl FnOnce(&mut function::FunctionData) -> T,
@@ -421,6 +447,10 @@ pub(crate) fn get_error_string(error: CUresult, str: *mut *const i8) -> CUresult
}
}
+unsafe fn transmute_lifetime<'a, 'b, T: ?Sized>(t: &'a T) -> &'b T {
+ mem::transmute(t)
+}
+
unsafe fn transmute_lifetime_mut<'a, 'b, T: ?Sized>(t: &'a mut T) -> &'b mut T {
mem::transmute(t)
}
diff --git a/zluda/src/impl/stream.rs b/zluda/src/impl/stream.rs
index 0fafe92..11f1869 100644
--- a/zluda/src/impl/stream.rs
+++ b/zluda/src/impl/stream.rs
@@ -2,7 +2,7 @@ use super::{
context::{Context, ContextData},
CUresult, GlobalState,
};
-use std::{mem, ptr};
+use std::{collections::VecDeque, mem, ptr};
use super::{HasLivenessCookie, LiveCheck};
@@ -34,21 +34,27 @@ impl HasLivenessCookie for StreamData {
pub struct StreamData {
pub context: *mut ContextData,
pub queue: l0::CommandQueue<'static>,
+ pub prev_events: VecDeque<(l0::Event<'static>, u64)>,
}
impl StreamData {
- pub fn new_unitialized(ctx: &'static l0::Context, dev: l0::Device) -> Result<Self, CUresult> {
+ pub fn new_unitialized(
+ ctx: &'static l0::Context,
+ device: l0::Device,
+ ) -> Result<Self, CUresult> {
Ok(StreamData {
context: ptr::null_mut(),
- queue: l0::CommandQueue::new(ctx, dev)?,
+ queue: l0::CommandQueue::new(ctx, device)?,
+ prev_events: VecDeque::new(),
})
}
pub fn new(ctx: &mut ContextData) -> Result<Self, CUresult> {
let l0_ctx = &mut unsafe { &mut *ctx.device }.l0_context;
- let l0_dev = unsafe { &*ctx.device }.base;
+ let device = unsafe { &*ctx.device }.base;
Ok(StreamData {
context: ctx as *mut _,
- queue: l0::CommandQueue::new(l0_ctx, l0_dev)?,
+ queue: l0::CommandQueue::new(l0_ctx, device)?,
+ prev_events: VecDeque::new(),
})
}
@@ -57,6 +63,39 @@ impl StreamData {
let dev = unsafe { &mut *ctx.device };
l0::CommandList::new(&mut dev.l0_context, dev.base)
}
+
+ pub fn process_finished_events(
+ &mut self,
+ f: &mut impl FnMut((l0::Event<'static>, u64)),
+ ) -> l0::Result<()> {
+ loop {
+ match self.prev_events.get(0) {
+ None => return Ok(()),
+ Some((ev, _)) => {
+ if ev.is_ready()? {
+ f(self.prev_events.pop_front().unwrap());
+ } else {
+ return Ok(());
+ }
+ }
+ }
+ }
+ }
+
+ pub fn get_last_event(&self) -> Option<&l0::Event<'static>> {
+ self.prev_events.iter().next_back().map(|(ev, _)| ev)
+ }
+
+ pub fn push_event(&mut self, ev: (l0::Event<'static>, u64)) {
+ self.prev_events.push_back(ev);
+ }
+
+ pub fn synchronize(&mut self) -> l0::Result<()> {
+ self.queue.synchronize(u64::MAX)?;
+ let event_pool = unsafe { &mut (*(*self.context).device).event_pool };
+ self.process_finished_events(&mut |(_, marker)| event_pool.mark_as_free(marker))?;
+ Ok(())
+ }
}
impl Drop for StreamData {
@@ -102,6 +141,13 @@ pub(crate) fn destroy_v2(pstream: *mut Stream) -> Result<(), CUresult> {
GlobalState::lock(|_| Stream::destroy_impl(pstream))?
}
+pub(crate) fn synchronize(pstream: *mut Stream) -> Result<(), CUresult> {
+ if pstream == ptr::null_mut() {
+ return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+ }
+ GlobalState::lock_stream(pstream, |stream_data| Ok(stream_data.synchronize()?))?
+}
+
#[cfg(test)]
mod test {
use crate::cuda::CUstream;