From fe80efc55c38528e032ab9f671fa8bc420b31e4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Wed, 8 Oct 2025 14:58:59 +0200 Subject: [PATCH 01/12] =?UTF-8?q?feat:=E2=80=AFadd=20support=20for=20resiz?= =?UTF-8?q?ing=20tensors?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build.rs | 1 - shaders/stensor/utils/mat.slang | 0 src/lib.rs | 2 +- src/linalg/contiguous.rs | 8 +- src/linalg/gemm.rs | 8 +- src/linalg/gemv.rs | 8 +- src/linalg/op_assign.rs | 8 +- src/linalg/reduce.rs | 4 +- src/linalg/repeat.rs | 4 +- src/shapes.rs | 2 +- src/tensor.rs | 215 +++++++++++++++++++++++++++++--- 11 files changed, 218 insertions(+), 42 deletions(-) create mode 100644 shaders/stensor/utils/mat.slang diff --git a/build.rs b/build.rs index abd5388..ac0db9d 100644 --- a/build.rs +++ b/build.rs @@ -15,4 +15,3 @@ pub fn main() { slang.compile_all(target, "../shaders", "./src/autogen", &[]); } } - diff --git a/shaders/stensor/utils/mat.slang b/shaders/stensor/utils/mat.slang new file mode 100644 index 0000000..e69de29 diff --git a/src/lib.rs b/src/lib.rs index edcab79..2eed99e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,8 +10,8 @@ use minislang::SlangCompiler; pub mod geometry; pub mod linalg; -pub mod tensor; pub mod shapes; +pub mod tensor; // pub mod utils; diff --git a/src/linalg/contiguous.rs b/src/linalg/contiguous.rs index 32be4dd..d3bafaa 100644 --- a/src/linalg/contiguous.rs +++ b/src/linalg/contiguous.rs @@ -1,7 +1,7 @@ -use slang_hal::backend::Backend; -use slang_hal::function::GpuFunction; use crate::shapes::{MatrixOrdering, ViewShape, ViewShapeBuffers}; use crate::tensor::GpuTensorView; +use slang_hal::backend::Backend; +use slang_hal::function::GpuFunction; use slang_hal::{Shader, ShaderArgs}; #[derive(Shader)] @@ -58,13 +58,13 @@ impl Contiguous { #[cfg(test)] mod test { + use crate::shapes::ViewShapeBuffers; + use crate::tensor::GpuTensor; use minislang::SlangCompiler; use nalgebra::DMatrix; use slang_hal::Shader; use slang_hal::backend::WebGpu; use slang_hal::backend::{Backend, Encoder}; - use crate::shapes::ViewShapeBuffers; - use crate::tensor::GpuTensor; use wgpu::{BufferUsages, Features, Limits}; #[futures_test::test] diff --git a/src/linalg/gemm.rs b/src/linalg/gemm.rs index 95a522a..fa2f2b5 100644 --- a/src/linalg/gemm.rs +++ b/src/linalg/gemm.rs @@ -1,7 +1,7 @@ -use slang_hal::backend::Backend; -use slang_hal::function::GpuFunction; use crate::shapes::{ViewShape, ViewShapeBuffers}; use crate::tensor::GpuTensorView; +use slang_hal::backend::Backend; +use slang_hal::function::GpuFunction; use slang_hal::{Shader, ShaderArgs}; #[derive(Shader)] @@ -197,13 +197,13 @@ impl Gemm { #[cfg(test)] mod test { use crate::GemmVariant; + use crate::shapes::ViewShapeBuffers; + use crate::tensor::GpuTensor; use approx::relative_eq; use minislang::SlangCompiler; use nalgebra::DMatrix; use slang_hal::Shader; use slang_hal::backend::{Backend, Encoder, WebGpu}; - use crate::shapes::ViewShapeBuffers; - use crate::tensor::GpuTensor; use wgpu::{BufferUsages, Features, Limits}; #[futures_test::test] diff --git a/src/linalg/gemv.rs b/src/linalg/gemv.rs index 66e6c6b..bc4df73 100644 --- a/src/linalg/gemv.rs +++ b/src/linalg/gemv.rs @@ -1,7 +1,7 @@ -use slang_hal::backend::Backend; -use slang_hal::function::GpuFunction; use crate::shapes::{MatrixOrdering, ViewShape, ViewShapeBuffers}; use crate::tensor::GpuTensorView; +use slang_hal::backend::Backend; +use slang_hal::function::GpuFunction; use slang_hal::{Shader, ShaderArgs}; /// Indicates 
if a matrix needs to be considered as-is or as its transpose when running a matrix @@ -308,14 +308,14 @@ impl Gemv { #[cfg(test)] mod test { use crate::GemvVariant; + use crate::shapes::ViewShapeBuffers; + use crate::tensor::GpuTensor; use approx::assert_relative_eq; use minislang::SlangCompiler; use nalgebra::{DMatrix, DVector}; use slang_hal::Shader; use slang_hal::backend::WebGpu; use slang_hal::backend::{Backend, Encoder}; - use crate::shapes::ViewShapeBuffers; - use crate::tensor::GpuTensor; use wgpu::{BufferUsages, Features, Limits}; #[futures_test::test] diff --git a/src/linalg/op_assign.rs b/src/linalg/op_assign.rs index 5398556..97cb00d 100644 --- a/src/linalg/op_assign.rs +++ b/src/linalg/op_assign.rs @@ -1,7 +1,7 @@ -use slang_hal::backend::Backend; -use slang_hal::function::GpuFunction; use crate::shapes::{ViewShape, ViewShapeBuffers}; use crate::tensor::GpuTensorView; +use slang_hal::backend::Backend; +use slang_hal::function::GpuFunction; use slang_hal::{Shader, ShaderArgs}; #[derive(Copy, Clone, PartialEq, Eq, Debug)] @@ -155,13 +155,13 @@ impl OpAssign { #[cfg(test)] mod test { use super::{BinOpArgs, OpAssignVariant}; + use crate::shapes::ViewShapeBuffers; + use crate::tensor::GpuTensor; use minislang::SlangCompiler; use nalgebra::DVector; use slang_hal::backend::WebGpu; use slang_hal::backend::{Backend, Buffer, Encoder}; use slang_hal::shader::Shader; - use crate::shapes::ViewShapeBuffers; - use crate::tensor::GpuTensor; use wgpu::BufferUsages; #[futures_test::test] diff --git a/src/linalg/reduce.rs b/src/linalg/reduce.rs index a6f06bc..e39de6a 100644 --- a/src/linalg/reduce.rs +++ b/src/linalg/reduce.rs @@ -53,14 +53,14 @@ pub struct Reduce { #[cfg(test)] mod test { use super::ReduceVariant; + use crate::shapes::{ViewShape, ViewShapeBuffers}; + use crate::tensor::GpuTensor; use minislang::SlangCompiler; use nalgebra::DVector; use slang_hal::ShaderArgs; use slang_hal::backend::WebGpu; use slang_hal::backend::{Backend, Encoder}; use slang_hal::shader::Shader; - use crate::shapes::{ViewShape, ViewShapeBuffers}; - use crate::tensor::GpuTensor; use wgpu::BufferUsages; #[derive(ShaderArgs)] diff --git a/src/linalg/repeat.rs b/src/linalg/repeat.rs index 319c057..c5f297b 100644 --- a/src/linalg/repeat.rs +++ b/src/linalg/repeat.rs @@ -1,7 +1,7 @@ -use slang_hal::backend::Backend; -use slang_hal::function::GpuFunction; use crate::shapes::{ViewShape, ViewShapeBuffers}; use crate::tensor::GpuTensorView; +use slang_hal::backend::Backend; +use slang_hal::function::GpuFunction; use slang_hal::{Shader, ShaderArgs}; /// Slang module for replicating the content of a source tensor as many times as possible to fill diff --git a/src/shapes.rs b/src/shapes.rs index b9baae3..47619ef 100644 --- a/src/shapes.rs +++ b/src/shapes.rs @@ -268,7 +268,7 @@ impl ViewShapeBuffers { let mut recycled = self.recycled.lock().unwrap(); let buffer = if let Some(mut buffer) = recycled.pop() { - backend.write_buffer(&mut buffer, &[shape])?; + backend.write_buffer(&mut buffer, 0, &[shape])?; buffer } else { // println!("Couldn’t find recycling for {:?}", shape); diff --git a/src/tensor.rs b/src/tensor.rs index 6cbb2f4..c486ef6 100644 --- a/src/tensor.rs +++ b/src/tensor.rs @@ -3,19 +3,20 @@ // TODO: feels like this should be in stensor instead of slang-hal -use slang_hal::backend::{Backend, Buffer, DeviceValue, EncaseType, Encoder, ShaderBinding}; use crate::shapes::{GGML_IDS, MatrixOrdering, ViewShape}; use bytemuck::Pod; use encase::ShaderType; use nalgebra::{Dim, IsContiguous, Matrix, Storage}; +use 
slang_hal::backend::{Backend, Buffer, DeviceValue, EncaseType, Encoder, ShaderBinding}; +use std::ops::{Bound, RangeBounds}; use std::sync::Arc; use slang_hal::backend::WebGpu; use wgpu::BufferUsages; -use slang_hal::ShaderArgs; #[cfg(feature = "cuda")] use crate::cuda::Cuda; +use slang_hal::ShaderArgs; use slang_hal::shader::ShaderArgsError; /// Helper struct for creating gpu storage buffers (scalars, vectors, matrices, tensors). @@ -69,16 +70,12 @@ impl TensorBuilder { self } - /// Builds the gpu tensor. - /// - /// # Safety - /// - /// The returned buffer must be initialized before being read from. - pub unsafe fn build_uninit( + /// Builds the uninitialized gpu tensor. + pub fn build_uninit( self, backend: &B, ) -> Result, B::Error> { - let buffer = unsafe { backend.uninit_buffer(self.len() as usize, self.usage)? }; + let buffer = backend.uninit_buffer(self.len() as usize, self.usage)?; Ok(GpuTensor { shape: self.shape, buffer, @@ -91,11 +88,11 @@ impl TensorBuilder { /// # Safety /// /// The returned buffer must be initialized before being read from. - pub unsafe fn build_uninit_encased( + pub fn build_uninit_encased( self, backend: &B, ) -> Result, B::Error> { - let buffer = unsafe { backend.uninit_buffer_encased(self.len() as usize, self.usage)? }; + let buffer = backend.uninit_buffer_encased(self.len() as usize, self.usage)?; Ok(GpuTensor { shape: self.shape, buffer, @@ -215,6 +212,25 @@ impl GpuTensor { self.shape.into_iter().map(|s| s as u64).product() } + /// The maximum number of elements this tensor can hold without needing a resize of the + /// underlying GPU buffer. + pub fn capacity(&self) -> u64 + where T: Pod { + self.buffer.len() as u64 + } + + /// The maximum number of elements this tensor can hold without needing a resize of the + /// underlying GPU buffer. + pub fn capacity_encased(&self) -> u64 + where T: EncaseType { + self.buffer.len_encased() as u64 + } + + /// The tensor’s order (i.e. the number of dimensions with a size > 1). + pub fn order(&self) -> u8 { + self.shape.iter().map(|s| (*s > 1) as u8).sum() + } + /// Size of this tensor along the dimension `i`. pub fn size(&self, i: usize) -> u32 { self.shape[i] @@ -354,6 +370,20 @@ impl GpuTensor { } } + fn vector_dim(&self) -> usize { + let dim = match self.ordering { + MatrixOrdering::RowMajor => 1, + MatrixOrdering::ColumnMajor => 0, + }; + let mut required_shape = [1; 4]; + required_shape[dim] = self.shape[dim]; + assert_eq!( + required_shape, self.shape, + "Operation only supported on vector tensors." + ); + dim + } + // /// Reads the buffer’s content into a vector. // pub async fn read_bytes<'a>(&'a self, device: &'a Device) -> anyhow::Result> { // // TODO: could probably be optimized? @@ -486,7 +516,8 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { /// its underlying `GpuTensor`. /// /// If it matches, returns the tensor's matrix ordering. - pub fn is_entire_tensor(&self) -> Option { + pub fn is_entire_tensor(&self) -> Option + where T: Pod { if self.buffer.len() == self.len() as usize && self.offset == 0 { self.is_contiguous() } else { @@ -699,7 +730,8 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorViewMut<'a, T, B> { /// its underlying `GpuTensor`. /// /// If it matches, returns the tensor's matrix ordering. - pub fn is_entire_tensor(&self) -> Option { + pub fn is_entire_tensor(&self) -> Option + where T: Pod { self.as_ref().is_entire_tensor() } @@ -942,7 +974,7 @@ impl GpuTensor { /// # Safety /// /// The returned buffer must be initialized before being read from. 
- pub unsafe fn vector_uninit( + pub fn vector_uninit( backend: &B, len: u32, usage: BufferUsages, @@ -950,7 +982,7 @@ impl GpuTensor { where T: DeviceValue + Pod, { - unsafe { TensorBuilder::vector(len, usage).build_uninit(backend) } + TensorBuilder::vector(len, usage).build_uninit(backend) } /// Allocates a new vector on the gpu initialized from `vector`. @@ -972,7 +1004,7 @@ impl GpuTensor { /// # Safety /// /// The returned buffer must be initialized before being read from. - pub unsafe fn vector_uninit_encased( + pub fn vector_uninit_encased( backend: &B, len: u32, usage: BufferUsages, @@ -980,7 +1012,7 @@ impl GpuTensor { where T: DeviceValue + EncaseType, { - unsafe { TensorBuilder::vector(len, usage).build_uninit_encased(backend) } + TensorBuilder::vector(len, usage).build_uninit_encased(backend) } /// Allocates a new vector on the gpu initialized from `vector`. @@ -1017,11 +1049,11 @@ impl GpuTensor { /// # Safety /// /// The returned buffer must be initialized before being read from. - pub unsafe fn scalar_uninit_encased(backend: &B, usage: BufferUsages) -> Result + pub fn scalar_uninit_encased(backend: &B, usage: BufferUsages) -> Result where T: DeviceValue + EncaseType, { - unsafe { TensorBuilder::scalar(usage).build_uninit_encased(backend) } + TensorBuilder::scalar(usage).build_uninit_encased(backend) } /// Allocates a new gpu storage buffer with a single element initialized to `value`. @@ -1054,3 +1086,148 @@ impl<'b, B: Backend, T: DeviceValue> ShaderArgs<'b, B> for GpuTensor { self.buffer.write_arg(binding, name, dispatch) } } + + +macro_rules! append_and_remove( + ($append: ident, $shift_remove: ident, $TraitBound: ident, $capacity: ident, $copy_buffer_to_buffer: ident, $uninit_buffer: ident, $write_buffer: ident) => { + /// Append the `data` elements at the end of this tensor if it is a vector. + /// + /// Panics if the tensor isn’t a vector. The tensor is a vector if: + /// - It is a row-major tensor and is made of a single row. Its size is `[1, *, 1, 1]` (where + /// `*` is any non-zero positive integer). + /// - It is a column-major tensor and its size is made of a single column. Its size is + /// `[*, 1, 1, 1]` (where `*` is any non-zero positive integer). + /// + /// If the underlying GPU buffer is too small to contain the extra elements, it is automatically + /// resized. If a resize happens, the tensor’s capacity is the next power of two sufficient + /// to contain the appended data. + // TODO: broadcast automatically to generalize to any tensor order. + pub fn $append(&mut self, backend: &B, data: &[T]) -> Result<(), B::Error> + where + T: $TraitBound, + { + let dim_to_grow = self.vector_dim(); + let num_added = data.len(); + let curr_len = self.shape[dim_to_grow]; + let new_len = curr_len + num_added as u32; + + let mut encoder = backend.begin_encoding(); + + + if new_len as u64 >= self.$capacity() { + // We need to grow the buffer. + let new_capacity = new_len.next_power_of_two(); + // SAFETY: will be initialized by the buffer init. + let mut new_buffer = backend.$uninit_buffer( + new_capacity as usize, + self.buffer().usage() | BufferUsages::COPY_DST + )?; + + encoder.$copy_buffer_to_buffer( + &self.buffer, + 0, + &mut new_buffer, + 0, + curr_len as usize, + )?; + self.buffer = new_buffer; + } + + backend.$write_buffer(&mut self.buffer, curr_len as u64, data)?; + backend.submit(encoder)?; + self.shape[dim_to_grow] = new_len; + Ok(()) + } + + /// Removes a `range` of elements from this tensor if it is a vector, shifting back elements to + /// fill the gap. 
+ /// + /// Panics if the tensor isn’t a vector. The tensor is a vector if: + /// - It is a row-major tensor and is made of a single row. Its size is `[1, *, 1, 1]` (where + /// `*` is any non-zero positive integer). + /// - It is a column-major tensor and its size is made of a single column. Its size is + /// `[*, 1, 1, 1]` (where `*` is any non-zero positive integer). + /// + /// This method doesn’t change the tensor’s capacity so the internal GPU buffer isn’t resized. + /// + /// # Performance note + /// + /// This method is currently fairly expensive as it always involves the creation of a staging + /// buffer for copying the data being moved. The staging buffer size is equal to the number of + /// moved elements. + /// + /// # Panic + /// + /// Panics if `self` wasn’t created with the `BufferUsages::COPY_SRC | BufferUsages::COPY_DST` flags. + /// Panics if the range is out of the bounds of `self`. + /// + /// # Return + /// + /// If the operation suceeded, returns the number of removed elements. + // TODO: add a special case for targets capable of copying slices within the same buffer. + // TODO: it would be worth benchmarking with doing the shift with a compute shader instead. + pub fn $shift_remove( + &mut self, + backend: &B, + range: impl RangeBounds, + ) -> Result + where T: $TraitBound { + let dim_to_shrink = self.vector_dim(); + let curr_len = self.shape[dim_to_shrink] as usize; + let range_start = match range.start_bound() { + Bound::Included(i) => *i, + Bound::Excluded(i) => *i + 1, + Bound::Unbounded => 0, + }; + let range_end = match range.end_bound() { + Bound::Included(i) => *i + 1, + Bound::Excluded(i) => *i, + Bound::Unbounded => curr_len, + }; + + if range_end <= range_start { + // The range to remove is empty. + return Ok(0); + } + + assert!(range_end <= curr_len, "Range index out of bounds."); + let num_elements_to_move = curr_len - range_end; + + // NOTE: if `curr_end == range_end` we don’t actually need to move any data, shrinking + // the shape is sufficient. + if num_elements_to_move > 0 { + // SAFETY: will be initialized with a buffer-to-buffer copy. 
+ let mut staging = backend.$uninit_buffer( + num_elements_to_move, + BufferUsages::STORAGE | BufferUsages::COPY_DST | BufferUsages::COPY_SRC, + )?; + + let mut encoder = backend.begin_encoding(); + encoder.$copy_buffer_to_buffer( + &self.buffer, + range_end, + &mut staging, + 0, + num_elements_to_move, + )?; + encoder.$copy_buffer_to_buffer( + &staging, + 0, + &mut self.buffer, + range_start, + num_elements_to_move, + )?; + backend.submit(encoder)?; + } + + let num_removed = range_end - range_start; + self.shape[dim_to_shrink] -= num_removed as u32; + Ok(num_removed) + } + } +); + +impl GpuTensor { + append_and_remove!(append, shift_remove, Pod, capacity, copy_buffer_to_buffer, uninit_buffer, write_buffer); + append_and_remove!(append_encased, shift_remove_encased, EncaseType, capacity_encased, copy_buffer_to_buffer_encased, uninit_buffer_encased, write_buffer_encased); +} \ No newline at end of file From cd8e9300121460fc17af07cf05ca9842ee1e4a74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Wed, 8 Oct 2025 14:59:28 +0200 Subject: [PATCH 02/12] =?UTF-8?q?feat:=E2=80=AFadd=20diagonal=20and=20trac?= =?UTF-8?q?e=20functions=20for=20matrices?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- shaders/stensor/utils/mat.slang | 55 +++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/shaders/stensor/utils/mat.slang b/shaders/stensor/utils/mat.slang index e69de29..f62ec56 100644 --- a/shaders/stensor/utils/mat.slang +++ b/shaders/stensor/utils/mat.slang @@ -0,0 +1,55 @@ +module mat; + +/* + * Trace of a matrix. + */ + +/// The trace of a 2x2 matrix. +public func trace(m: float2x2) -> float { + return m[0][0] + m[1][1]; +} + +/// The trace of a 3x3 matrix. +public func trace(m: float3x3) -> float { + return m[0][0] + m[1][1] + m[2][2]; +} + +/// The trace of a 4x4 matrix. +public func trace(m: float4x4) -> float { + return m[0][0] + m[1][1] + m[2][2] + m[3][3]; +} + +/* + * Diagonal extraction and diagonal matrix init. + */ + +/// Initializes a diagonal 2x2 matrix. +public func diag(d: float2) -> float2x2 { + return float2x2(float2(d.x, 0.0), float2(0.0, d.y)); +} + +/// Initializes a diagonal 3x3 matrix. +public func diag(d: float3) -> float3x3 { + return float3x3(float3(d.x, 0.0, 0.0), float3(0.0, d.y, 0.0), float3(0.0, 0.0, d.z)); +} + +/// Initializes a diagonal 4x4 matrix. +public func diag(d: float4) -> float4x4 { + return float4x4(float4(d.x, 0.0, 0.0, 0.0), float4(0.0, d.y, 0.0, 0.0), float4(0.0, 0.0, d.z, 0.0), float4(0.0, 0.0, 0.0, d.w)); +} + + +/// Return the diagonal of a 2x2 matrix. +public func diag(m: float2x2) -> float2 { + return float2(m[0][0], m[1][1]); +} + +/// Return the diagonal of a 3x3 matrix. +public func diag(m: float3x3) -> float3 { + return float3(m[0][0], m[1][1], m[2][2]); +} + +/// Return the diagonal of a 4x4 matrix. 
+public func diag(m: float4x4) -> float4 { + return float4(m[0][0], m[1][1], m[2][2], m[3][3]); +} \ No newline at end of file From f00dd884297f3ea19e4e3e9c77ce7025b462ed0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Tue, 21 Oct 2025 10:19:59 +0200 Subject: [PATCH 03/12] =?UTF-8?q?feat:=E2=80=AFweaken=20trait=20requiremen?= =?UTF-8?q?ts=20for=20tensor=20read/init/writes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/tensor.rs | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/src/tensor.rs b/src/tensor.rs index c486ef6..2812c6c 100644 --- a/src/tensor.rs +++ b/src/tensor.rs @@ -4,7 +4,7 @@ // TODO: feels like this should be in stensor instead of slang-hal use crate::shapes::{GGML_IDS, MatrixOrdering, ViewShape}; -use bytemuck::Pod; +use bytemuck::{NoUninit}; use encase::ShaderType; use nalgebra::{Dim, IsContiguous, Matrix, Storage}; use slang_hal::backend::{Backend, Buffer, DeviceValue, EncaseType, Encoder, ShaderBinding}; @@ -71,7 +71,7 @@ impl TensorBuilder { } /// Builds the uninitialized gpu tensor. - pub fn build_uninit( + pub fn build_uninit( self, backend: &B, ) -> Result, B::Error> { @@ -127,7 +127,7 @@ impl TensorBuilder { // } /// Builds this tensor with an array of values given for its initial value. - pub fn build_init( + pub fn build_init( self, backend: &B, data: &[T], @@ -212,10 +212,15 @@ impl GpuTensor { self.shape.into_iter().map(|s| s as u64).product() } + // /// The tensor’s rank. + // pub fn rank(&self) -> u64 { + // self.shape.iter().filter(|i| **i != 1).count() as u64 + // } + /// The maximum number of elements this tensor can hold without needing a resize of the /// underlying GPU buffer. pub fn capacity(&self) -> u64 - where T: Pod { + where T: NoUninit { self.buffer.len() as u64 } @@ -293,7 +298,7 @@ impl GpuTensor { source: impl Into>, ) -> Result<(), B::Error> where - T: DeviceValue + Pod, + T: DeviceValue + NoUninit, { let source = source.into(); let copy_len = self.len(); @@ -517,7 +522,7 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { /// /// If it matches, returns the tensor's matrix ordering. pub fn is_entire_tensor(&self) -> Option - where T: Pod { + where T: NoUninit { if self.buffer.len() == self.len() as usize && self.offset == 0 { self.is_contiguous() } else { @@ -731,7 +736,7 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorViewMut<'a, T, B> { /// /// If it matches, returns the tensor's matrix ordering. pub fn is_entire_tensor(&self) -> Option - where T: Pod { + where T: NoUninit { self.as_ref().is_entire_tensor() } @@ -902,13 +907,13 @@ impl GpuTensor { } } -impl GpuTensor { +impl GpuTensor { /// Allocates a new matrix on the gpu with uninitialized elements. /// /// # Safety /// /// The returned buffer must be initialized before being read from. - pub unsafe fn matrix_uninit( + pub fn matrix_uninit( backend: &B, nrows: u32, ncols: u32, @@ -917,7 +922,7 @@ impl GpuTensor { where T: DeviceValue, { - unsafe { TensorBuilder::matrix(nrows, ncols, usage).build_uninit(backend) } + TensorBuilder::matrix(nrows, ncols, usage).build_uninit(backend) } // pub fn uninit_encased(device: &Device, nrows: u32, ncols: u32, usage: BufferUsages) -> Self @@ -980,21 +985,21 @@ impl GpuTensor { usage: BufferUsages, ) -> Result where - T: DeviceValue + Pod, + T: DeviceValue + NoUninit, { TensorBuilder::vector(len, usage).build_uninit(backend) } /// Allocates a new vector on the gpu initialized from `vector`. 
/// - /// If `T` does not implement `Pod`, use [`GpuMatrix::encase`] instead. + /// If `T` does not implement `NoUninit`, use [`GpuMatrix::encase`] instead. pub fn vector( backend: &B, vector: impl AsRef<[T]>, usage: BufferUsages, ) -> Result where - T: DeviceValue + Pod, + T: DeviceValue + NoUninit, { let v = vector.as_ref(); TensorBuilder::vector(v.len() as u32, usage).build_init(backend, v.as_ref()) @@ -1017,7 +1022,7 @@ impl GpuTensor { /// Allocates a new vector on the gpu initialized from `vector`. /// - /// If `T` does not implement `Pod`, use [`GpuMatrix::encase`] instead. + /// If `T` does not implement `NoUninit`, use [`GpuMatrix::encase`] instead. pub fn vector_encased( backend: &B, vector: impl AsRef<[T]>, @@ -1037,11 +1042,11 @@ impl GpuTensor { /// # Safety /// /// The returned buffer must be initialized before being read from. - pub unsafe fn scalar_uninit(backend: &B, usage: BufferUsages) -> Result + pub fn scalar_uninit(backend: &B, usage: BufferUsages) -> Result where - T: DeviceValue + Pod, + T: DeviceValue + NoUninit, { - unsafe { TensorBuilder::scalar(usage).build_uninit(backend) } + TensorBuilder::scalar(usage).build_uninit(backend) } /// Allocates a new gpu storage buffer with a single uninitialized element. @@ -1059,7 +1064,7 @@ impl GpuTensor { /// Allocates a new gpu storage buffer with a single element initialized to `value`. pub fn scalar(backend: &B, value: T, usage: BufferUsages) -> Result where - T: DeviceValue + Pod, + T: DeviceValue + NoUninit, { TensorBuilder::scalar(usage).build_init(backend, &[value]) } @@ -1228,6 +1233,6 @@ macro_rules! append_and_remove( ); impl GpuTensor { - append_and_remove!(append, shift_remove, Pod, capacity, copy_buffer_to_buffer, uninit_buffer, write_buffer); + append_and_remove!(append, shift_remove, NoUninit, capacity, copy_buffer_to_buffer, uninit_buffer, write_buffer); append_and_remove!(append_encased, shift_remove_encased, EncaseType, capacity_encased, copy_buffer_to_buffer_encased, uninit_buffer_encased, write_buffer_encased); } \ No newline at end of file From c092544a852bf1a7a7fde58d8923401d101a3406 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Mon, 27 Oct 2025 16:00:29 +0100 Subject: [PATCH 04/12] feat: update to wgpu 27 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index db060ba..9ba15e8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ cuda = [ "cudarc", "slang-hal/cuda" ] cublas = [ "slang-hal/cublas" ] [dependencies] -wgpu = "26" +wgpu = "27" encase = "0.12" bytemuck = "1" nalgebra = { version = "0.34", features = ["encase"] } From 682f79f235eeae1354ccc7960fa2488f67fd1bea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Mon, 27 Oct 2025 17:46:31 +0100 Subject: [PATCH 05/12] =?UTF-8?q?chore:=E2=80=AFadd=20CI=20workflow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/ci.yaml | 162 ++++++++++++++++++++++++++++ .github/workflows/download_slang.sh | 109 +++++++++++++++++++ 2 files changed, 271 insertions(+) create mode 100644 .github/workflows/ci.yaml create mode 100755 .github/workflows/download_slang.sh diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 0000000..fae9c22 --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,162 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +env: + CARGO_TERM_COLOR: always + RUSTFLAGS: --deny warnings + RUSTDOCFLAGS: 
--deny warnings + SLANG_TAG: 2025.18.2 + +jobs: + # Check formatting. + format: + name: Format + runs-on: ubuntu-latest + timeout-minutes: 30 + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + with: + components: rustfmt + + - name: Run cargo fmt + run: cargo fmt --all -- --check + setup-slang: + strategy: + matrix: + os: [ubuntu-latest] + runs-on: ${{ matrix.os }} + outputs: + slang-dir: ${{ steps.setup.outputs.slang-dir }} # Pass SLANG_DIR to dependent jobs + slang-cache-key: ${{ steps.setup.outputs.slang-cache-key }} # Pass SLANG_DIR to dependent jobs + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Cache Slang + id: cache-slang + uses: actions/cache/restore@v4 # Restore first + with: + path: | + ~/.cache/slang # Matches script's default OUTPUT_DIR + key: slang-v$SLANG_TAG-${{ runner.os }}-${{ runner.arch }} + + - name: Setup Slang + id: setup + run: | + echo "version=$SLANG_TAG" >> $GITHUB_OUTPUT # Output for cache key + SLANG_DIR=$(./.github/workflows/download_slang.sh --version $SLANG_TAG | grep '^SLANG_DIR=' | cut -d'=' -f2-) + echo "slang-dir=$SLANG_DIR" >> $GITHUB_OUTPUT # Output for dependents + echo "slang-cache-key=slang-v$SLANG_TAG-${{ runner.os }}-${{ runner.arch }}" >> $GITHUB_OUTPUT + echo "SLANG_DIR=$SLANG_DIR" >> $GITHUB_ENV # For this job if needed + + - name: Save Slang Cache + if: steps.cache-slang.outputs.cache-hit != 'true' # Only save on miss + uses: actions/cache/save@v4 + with: + path: ~/.cache/slang + key: ${{ steps.setup.outputs.slang-cache-key }} + # Run clippy lints. + clippy: + needs: setup-slang # Depends on setup-slang + name: Clippy + runs-on: ubuntu-latest + env: + SLANG_DIR: ${{ needs.setup-slang.outputs.slang-dir }} + timeout-minutes: 30 + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + with: + components: clippy + + - name: Install dependencies + run: sudo apt-get update; sudo apt-get install --no-install-recommends build-essential curl wget file libssl-dev + + - name: Retrieve Cache for Slang + uses: actions/cache/restore@v4 + with: + path: ~/.cache/slang + key: ${{ needs.setup-slang.outputs.slang-cache-key }} + + - name: Populate target directory from cache + uses: Leafwing-Studios/cargo-cache@v2 + with: + sweep-cache: true + + - name: Run clippy lints + run: SLANG_DIR=$SLANG_DIR cargo clippy --locked --workspace --all-targets -- --deny warnings + + # Check documentation. + doc: + needs: setup-slang # Depends on setup-slang + name: Docs + runs-on: ubuntu-latest + timeout-minutes: 30 + env: + SLANG_DIR: ${{ needs.setup-slang.outputs.slang-dir }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Install dependencies + run: sudo apt-get update; sudo apt-get install --no-install-recommends build-essential curl wget file libssl-dev + + - name: Retrieve Cache for Slang + uses: actions/cache/restore@v4 + with: + path: ~/.cache/slang + key: ${{ needs.setup-slang.outputs.slang-cache-key }} + + - name: Populate target directory from cache + uses: Leafwing-Studios/cargo-cache@v2 + with: + sweep-cache: true + + - name: Check documentation + run: SLANG_DIR=$SLANG_DIR cargo doc --locked --workspace --document-private-items --no-deps + # Testing. 
+ test: + needs: setup-slang # Depends on setup-slang + name: Tests + runs-on: ubuntu-latest + timeout-minutes: 30 + env: + SLANG_DIR: ${{ needs.setup-slang.outputs.slang-dir }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Install dependencies + run: sudo apt-get update; sudo apt-get install --no-install-recommends build-essential curl wget file libssl-dev + + - name: Retrieve Cache for Slang + uses: actions/cache/restore@v4 + with: + path: ~/.cache/slang + key: ${{ needs.setup-slang.outputs.slang-cache-key }} + + - name: Populate target directory from cache + uses: Leafwing-Studios/cargo-cache@v2 + with: + sweep-cache: true + - name: Run Cargo Tests + run: | + SLANG_DIR=$SLANG_DIR cargo test --verbose \ No newline at end of file diff --git a/.github/workflows/download_slang.sh b/.github/workflows/download_slang.sh new file mode 100755 index 0000000..0d1738f --- /dev/null +++ b/.github/workflows/download_slang.sh @@ -0,0 +1,109 @@ +#!/bin/bash + +# Default values +OS="" +OUTPUT_DIR="$HOME/.cache/slang" +SLANG_VERSION="" +SLANG_TAG="" +ASSET_SUFFIX="" +SLANG_URL_BASE="https://github.com/shader-slang/slang/releases/download" + +# Help message +usage() { + echo "Usage: $0 [--os ] [--output-dir ] [--version ]" + echo " --os: Target OS (default: auto-detect from current platform)" + echo " --output-dir: Directory to extract Slang (default: ~/.cache/slang)" + echo " --version: Slang version (e.g., 2025.18.2, default: latest)" + echo "Example: $0 --os linux --output-dir /tmp/slang" +} + +# Parse arguments +while [[ "$#" -gt 0 ]]; do + case $1 in + --os) OS="$2"; shift ;; + --output-dir) export OUTPUT_DIR="$2"; shift ;; + --version) export SLANG_VERSION="$2"; shift ;; + *) usage ; exit 1 ;; + esac + shift +done + +# Detect OS if not specified +if [[ -z "$OS" ]]; then + case "$(uname -s)" in + Linux*) OS="linux" ;; + Darwin*) + if [[ "$(uname -m)" == "arm64" ]]; then + OS="macos-aarch64" + else + OS="macos" + fi + ;; + CYGWIN*|MINGW*|MSYS*) OS="windows" ;; + *) echo "Error: Unable to detect OS. Specify --os (linux, macos, macos-arm64, windows)"; exit 1 ;; + esac +fi + +# Determine asset suffix based on OS +case "$OS" in + linux) ASSET_SUFFIX="linux-x86_64.zip" ;; + macos) ASSET_SUFFIX="macos-x86_64.zip" ;; + macos-aarch64) ASSET_SUFFIX="macos-aarch64.zip" ;; + windows) ASSET_SUFFIX="windows-x86_64.zip" ;; + *) echo "Error: Unsupported OS: $OS"; exit 1 ;; +esac + +# Get Slang version if not specified +if [[ -z "$SLANG_VERSION" ]]; then + export SLANG_TAG=$(curl -s https://api.github.com/repos/shader-slang/slang/releases/latest | grep '"tag_name"' | sed -E 's/.*"([^"]+)".*/\1/') + export SLANG_VERSION=$(echo "$SLANG_TAG" | sed 's/v//') # e.g., v2025.18.2 -> 2025.18.2 +else + export SLANG_TAG="v$SLANG_VERSION" +fi + +if [[ -z "$SLANG_VERSION" ]]; then + echo "Error: Could not determine Slang version" + exit 1 +fi + +# Set up paths +SLANG_DIR="$OUTPUT_DIR/slang-v$SLANG_VERSION-$OS" +ZIP_URL="$SLANG_URL_BASE/$SLANG_TAG/slang-$SLANG_VERSION-$ASSET_SUFFIX" +TEMP_ZIP="/tmp/slang-$SLANG_VERSION.zip" + +# Check if Slang is already extracted +if [[ -d "$SLANG_DIR" ]] && [[ -f "$SLANG_DIR/bin/slangc" || -f "$SLANG_DIR/bin/slangc.exe" ]]; then + echo "Using existing Slang at $SLANG_DIR" + echo "SLANG_DIR=$SLANG_DIR" + exit 0 +fi + +# Download Slang release +echo "Downloading Slang v$SLANG_VERSION for $OS from $ZIP_URL..." 
+mkdir -p "$OUTPUT_DIR" +curl -L -o "$TEMP_ZIP" "$ZIP_URL" || { echo "Error: Download failed for $ZIP_URL"; exit 1; } + +# Extract based on OS +echo "Extracting to $SLANG_DIR..." +if [[ "$OS" == "windows" ]]; then + # Windows: Assume 7z is available (or adjust for PowerShell/Expand-Archive) + 7z x "$TEMP_ZIP" -o"$SLANG_DIR" -y > /dev/null || { echo "Error: Extraction failed"; rm -f "$TEMP_ZIP"; exit 1; } +else + # Linux/macOS: Use unzip + unzip -q "$TEMP_ZIP" -d "$SLANG_DIR" || { echo "Error: Extraction failed"; rm -f "$TEMP_ZIP"; exit 1; } +fi + +# Clean up +rm -f "$TEMP_ZIP" + +# Verify extraction +if [[ ! -f "$SLANG_DIR/bin/slangc" && ! -f "$SLANG_DIR/bin/slangc.exe" ]]; then + echo "Error: Extraction incomplete, slangc not found in $SLANG_DIR/bin" + exit 1 +fi + +echo "Slang v$SLANG_VERSION extracted to $SLANG_DIR" +echo "SLANG_DIR=$SLANG_DIR" + +# For use in calling script +export SLANG_DIR \ No newline at end of file From 9fbe21007b7e0f73ea8405a44508405348755c4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Mon, 27 Oct 2025 17:46:44 +0100 Subject: [PATCH 06/12] feat: update to slang-hal/minislang v0.2 --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9ba15e8..d8a02d8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,8 +19,8 @@ nalgebra = { version = "0.34", features = ["encase"] } cudarc = { version = "0.16", optional = true } -minislang = "0.1" -slang-hal = { version = "0.1", features = ["derive"] } +minislang = "0.2" +slang-hal = { version = "0.2", features = ["derive"] } include_dir = "0.7" [dev-dependencies] From 3f8bdc2892633589139c7ffb6e95e3e0d923c387 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Mon, 27 Oct 2025 17:47:38 +0100 Subject: [PATCH 07/12] fead: fix svd3 for near-identity matrices --- shaders/stensor/geometry/svd3.slang | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/shaders/stensor/geometry/svd3.slang b/shaders/stensor/geometry/svd3.slang index 463eac9..fcce4c7 100644 --- a/shaders/stensor/geometry/svd3.slang +++ b/shaders/stensor/geometry/svd3.slang @@ -49,8 +49,10 @@ public struct Svd3 { // Constants used for calculation of givens quaternions static const float GAMMA = 5.828427124; // sqrt(8)+3; -static const float CSTAR = 0.923879532; // cos(pi/8) -static const float SSTAR = 0.3826834323; // sin(p/8) +static const float CSTAR = 1.0; // TODO: using no-identity values (below) breaks the SVD for near-identity matrices. +static const float SSTAR = 0.0; // TODO: using no-identity values (below) breaks the SVD for near-identity matrices. +//static const float CSTAR = 0.923879532; // cos(pi/8) +//static const float SSTAR = 0.3826834323; // sin(p/8) // Threshold value static const float SVD_EPSILON = 1e-6; // Iteration counts for Jacobi Eigenanalysis and reciprocal square root functions, influence precision From cc9253ed2013f2f4d70e1c5021d30458dd41f95f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Mon, 27 Oct 2025 17:54:58 +0100 Subject: [PATCH 08/12] Release v0.2.0 --- CHANGELOG.md | 4 ++++ Cargo.toml | 2 +- README.md | 2 +- src/shapes.rs | 21 ++++++++++++++++++++- src/tensor.rs | 52 +++++++++++++++++++++++++++++++++++++++++++++++---- 5 files changed, 74 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e69de29..31dd2fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -0,0 +1,4 @@ +# v0.2.0 (27 Oct. 2025) +- Update to slang-hal 0.2. +- Make rank-1 tensors resizeable. 
+- Fix svd3.slang retuning incorrect results for near-identity matrices. \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index d8a02d8..a9ace03 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ name = "stensor" authors = ["Sébastien Crozet "] description = "Cross-platform GPU tensor library with Slang and Rust." repository = "https://github.com/dimforge/stensor" -version = "0.1.1" +version = "0.2.0" edition = "2024" license = "Apache-2.0" diff --git a/README.md b/README.md index fae9cf6..fcff46d 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ gpu". It aims (but it isn’t there yet) to expose linear algebra operations (in operations) as well as geometric types (quaternions, similarities, etc.) as Slang shaders and kernels. > **Warning** -**stensor** is still very incomplete and under heavy development and is lacking many features. +> **stensor** is still very incomplete and under heavy development and is lacking many features. See also the README of [slang-hal](https://github.com/dimforge/slang-hal/blob/main/README.md) for information on supported platforms. diff --git a/src/shapes.rs b/src/shapes.rs index 47619ef..4e56e37 100644 --- a/src/shapes.rs +++ b/src/shapes.rs @@ -6,19 +6,25 @@ use std::collections::hash_map::Entry; use std::sync::Mutex; use wgpu::BufferUsages; +/// GGML dimension index mapping: converts between GGML and stensor dimension ordering. pub const GGML_IDS: [usize; 4] = [1, 0, 2, 3]; +/// GGML dimension index mapping (u32 version). pub const GGML_IDS_U32: [u32; 4] = [1, 0, 2, 3]; +/// Specifies the memory layout of matrices. #[derive(Copy, Clone, PartialEq, Eq, Default, Debug, Hash)] pub enum MatrixOrdering { + /// Column-major ordering: elements in the same column are contiguous in memory. #[default] ColumnMajor, + /// Row-major ordering: elements in the same row are contiguous in memory. RowMajor, // TODO: should we generalize this to a `MajorAxis(i)` where any // dimension of the tensor can be interpreted as the main one? } impl MatrixOrdering { + /// Returns the transposed matrix ordering. pub fn transpose(self) -> Self { match self { Self::ColumnMajor => Self::RowMajor, @@ -40,6 +46,7 @@ pub struct ViewShape { } impl ViewShape { + /// Creates a contiguous view shape with the given size and ordering. pub fn contiguous(size: [u32; 4], ordering: MatrixOrdering) -> Self { let stride = match ordering { MatrixOrdering::ColumnMajor => { @@ -52,19 +59,23 @@ impl ViewShape { Self { size, stride } } + /// Returns a transposed view of this shape. pub fn transpose(&self) -> Self { self.permute([1, 0, 2, 3]) } + /// Conditionally transposes the shape based on the `transpose` parameter. pub fn maybe_transpose(&self, transpose: bool) -> Self { if transpose { self.transpose() } else { *self } } + /// Permutes the dimensions according to GGML's dimension ordering convention. pub fn permute_ggml(&self, mut permutations: [usize; 4]) -> Self { permutations.swap(0, 1); self.permute(permutations.map(|i| GGML_IDS[i])) } + /// Permutes the dimensions according to the given permutation array. pub fn permute(&self, permutations: [usize; 4]) -> Self { // Check all the permutation indices are valid and without // duplicate. @@ -138,9 +149,10 @@ impl ViewShape { } } + /// Checks if each dimension of this shape is a multiple of the corresponding dimension in `of`. 
pub fn is_multiple_of(&self, of: Self) -> bool { for k in 0..4 { - if self.size[k] % of.size[k] != 0 { + if !self.size[k].is_multiple_of(of.size[k]) { return false; } } @@ -148,6 +160,7 @@ impl ViewShape { true } + /// Creates a view with the specified shape and strides within this shape. pub fn view(&self, shape: [u32; DIM2], stride: [Option; DIM2]) -> Self { assert!(DIM2 <= 4); @@ -225,10 +238,12 @@ impl ViewShape { } } + /// Checks if this shape contains zero elements. pub fn is_empty(&self) -> bool { self.len() == 0 } + /// Returns the total number of elements in this shape. pub fn len(&self) -> u64 { (self.size[0] * self.size[1] * self.size[2] * self.size[3]) as u64 } @@ -256,11 +271,13 @@ impl ViewShapeBuffers { } } + /// Clears temporary shape buffers and recycles them for reuse. pub fn clear_tmp(&mut self) { let mut recycled = self.recycled.lock().unwrap(); recycled.extend(self.tmp_buffers.drain().map(|(_, buf)| buf)); } + /// Stores a temporary shape buffer for the given shape, creating one if needed. pub fn put_tmp(&mut self, backend: &B, shape: ViewShape) -> Result<(), B::Error> { if self.contains(shape) { return Ok(()); @@ -293,10 +310,12 @@ impl ViewShapeBuffers { backend.init_buffer(&[shape], usage | BufferUsages::STORAGE) } + /// Checks if a buffer for the given shape exists (permanent or temporary). pub fn contains(&self, shape: ViewShape) -> bool { self.buffers.contains_key(&shape) || self.tmp_buffers.contains_key(&shape) } + /// Inserts or retrieves a mutable buffer for the given shape. pub fn insert( &mut self, backend: &B, diff --git a/src/tensor.rs b/src/tensor.rs index 2812c6c..88ccad2 100644 --- a/src/tensor.rs +++ b/src/tensor.rs @@ -59,6 +59,7 @@ impl TensorBuilder { self.shape.into_iter().map(|s| s as u64).product() } + /// Sets the matrix ordering for this tensor. pub fn ordering(mut self, ordering: MatrixOrdering) -> Self { self.ordering = ordering; self @@ -148,6 +149,7 @@ impl TensorBuilder { }) } + /// Builds this tensor with an array of encase-encoded values given for its initial value. pub fn build_encased( self, backend: &B, @@ -170,8 +172,11 @@ impl TensorBuilder { } } +/// Type alias for a vector stored on the GPU. pub type GpuVector = GpuTensor; +/// Type alias for a matrix stored on the GPU. pub type GpuMatrix = GpuTensor; +/// Type alias for a scalar stored on the GPU. pub type GpuScalar = GpuTensor; /// A tensor stored in the GPU. @@ -183,20 +188,24 @@ pub struct GpuTensor { ordering: MatrixOrdering, } +/// Type alias for a tensor stored on the WebGPU backend. pub type WgpuTensor = GpuTensor; #[cfg(feature = "cuda")] pub type CudaTensor = GpuTensor; impl GpuTensor { + /// Returns the matrix ordering of this tensor. pub fn ordering(&self) -> MatrixOrdering { self.ordering } + /// Returns a transposed version of this tensor. pub fn transposed(mut self) -> Self { self.transpose(); self } + /// Transposes this tensor in place. pub fn transpose(&mut self) { self.shape.swap(0, 1); self.ordering = self.ordering.transpose(); @@ -575,10 +584,12 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { self.view_shape.stride[GGML_IDS[i]] } + /// Returns a transposed view of this tensor. pub fn transposed(&self) -> Self { self.permute([1, 0, 2, 3]) } + /// Permutes the dimensions of this view according to the given permutation array. 
pub fn permute(&self, permutations: [usize; 4]) -> Self { Self { view_shape: self.view_shape.permute(permutations), @@ -587,6 +598,7 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { } } + /// Permutes the dimensions according to GGML's dimension ordering convention. pub fn permute_ggml(&self, permutations: [usize; 4]) -> Self { Self { view_shape: self.view_shape.permute_ggml(permutations), @@ -595,7 +607,9 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { } } - // Specify the ordering explicitly to avoid ambiguities if the original shape has 1 row and 1 col. + /// Reshapes this view with an explicit ordering to avoid ambiguities. + /// + /// This is useful when the original shape has 1 row and 1 column. pub fn reshape_with_ordering( &self, shape: [u32; DIM2], @@ -608,10 +622,12 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { self.view(0, shape4, view_shape.stride.map(Some)) } + /// Reshapes this view to the specified shape, preserving the matrix ordering. pub fn reshape(&self, shape: [u32; DIM2]) -> Self { self.view(0, shape, [None; DIM2]) } + /// Reshapes this view using GGML's dimension ordering convention. pub fn reshape_ggml(&self, mut shape: [u32; DIM2]) -> Self { shape.swap(0, 1); @@ -623,7 +639,9 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { } } - // Specify the ordering explicitly to avoid ambiguities if the original shape has 1 row and 1 col. + /// Reshapes this view using GGML's ordering with an explicit matrix ordering. + /// + /// This is useful to avoid ambiguities when the original shape has 1 row and 1 column. pub fn reshape_ggml_with_ordering( &self, mut shape: [u32; DIM2], @@ -633,6 +651,7 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { self.reshape_with_ordering(shape, ordering) } + /// Creates a view of a sub-tensor with the specified offset, shape, and optional strides. pub fn view( &self, mut offset: u32, @@ -656,6 +675,7 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { } } + /// Creates a view using GGML's dimension ordering convention. pub fn view_ggml( &self, offset: u32, @@ -667,6 +687,7 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { self.view(offset, shape, stride) } + /// Returns a view of the `matrix_id`-th matrix in this tensor. pub fn matrix(&self, matrix_id: u32) -> Self { let [nrows, ncols, nmats, ncubes] = self.view_shape.size; assert!(matrix_id < nmats); @@ -681,6 +702,7 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { } } + /// Returns a view containing `new_ncols` columns starting from `first_col`. pub fn columns(&self, first_col: u32, new_ncols: u32) -> Self { let [nrows, ncols, nmats, ncubes] = self.view_shape.size; assert!(first_col + new_ncols < ncols); @@ -694,10 +716,12 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { } } + /// Returns a view of the specified column. pub fn column(&self, col: u32) -> Self { self.columns(col, 1) } + /// Returns a view containing `new_nrows` rows starting from `first_row`. pub fn rows(&self, first_row: u32, new_nrows: u32) -> Self { let [nrows, ncols, nmats, ncubes] = self.view_shape.size; assert!(first_row + new_nrows < nrows); @@ -711,12 +735,14 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { } } + /// Returns a view of the specified row. pub fn row(&self, row: u32) -> Self { self.rows(row, 1) } } impl<'a, T: DeviceValue, B: Backend> GpuTensorViewMut<'a, T, B> { + /// Converts this mutable view into an immutable view. 
pub fn as_ref(&self) -> GpuTensorView<'_, T, B> { GpuTensorView { view_shape: self.view_shape, @@ -765,10 +791,12 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorViewMut<'a, T, B> { self.view_shape.len() } + /// Returns a transposed mutable view of this tensor. pub fn transposed(&mut self) -> GpuTensorViewMut<'_, T, B> { self.permute([1, 0, 2, 3]) } + /// Permutes the dimensions of this mutable view according to the given permutation array. pub fn permute(&mut self, permutations: [usize; 4]) -> GpuTensorViewMut<'_, T, B> { GpuTensorViewMut { view_shape: self.view_shape.permute(permutations), @@ -777,10 +805,12 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorViewMut<'a, T, B> { } } + /// Reshapes this mutable view to the specified shape. pub fn reshape(&mut self, shape: [u32; DIM2]) -> GpuTensorViewMut<'_, T, B> { self.view(0, shape, [None; DIM2]) } + /// Creates a mutable view of a sub-tensor with the specified offset, shape, and optional strides. pub fn view( &mut self, mut offset: u32, @@ -804,6 +834,7 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorViewMut<'a, T, B> { } } + /// Returns a mutable view of the `matrix_id`-th matrix in this tensor. pub fn matrix(&mut self, matrix_id: u32) -> GpuTensorViewMut<'_, T, B> { let [nrows, ncols, nmats, ncubes] = self.view_shape.size; assert!(matrix_id < nmats); @@ -818,6 +849,7 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorViewMut<'a, T, B> { } } + /// Returns a mutable view containing `new_ncols` columns starting from `first_col`. pub fn columns(&mut self, first_col: u32, new_ncols: u32) -> GpuTensorViewMut<'_, T, B> { let [nrows, ncols, nmats, ncubes] = self.view_shape.size; assert!(first_col + new_ncols < ncols); @@ -831,10 +863,12 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorViewMut<'a, T, B> { } } + /// Returns a mutable view of the specified column. pub fn column(&mut self, col: u32) -> GpuTensorViewMut<'_, T, B> { self.columns(col, 1) } + /// Returns a mutable view containing `new_nrows` rows starting from `first_row`. pub fn rows(&mut self, first_row: u32, new_nrows: u32) -> GpuTensorViewMut<'_, T, B> { let [nrows, ncols, nmats, ncubes] = self.view_shape.size; assert!(first_row + new_nrows < nrows); @@ -848,29 +882,35 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorViewMut<'a, T, B> { } } + /// Returns a mutable view of the specified row. pub fn row(&mut self, row: u32) -> GpuTensorViewMut<'_, T, B> { self.rows(row, 1) } } impl GpuTensor { + /// Reshapes this tensor to the specified shape. pub fn reshape(&self, shape: [u32; DIM2]) -> GpuTensorView<'_, T, B> { self.as_view().reshape_with_ordering(shape, self.ordering) } + /// Reshapes this tensor using GGML's dimension ordering convention. pub fn reshape_ggml(&self, shape: [u32; DIM2]) -> GpuTensorView<'_, T, B> { self.as_view() .reshape_ggml_with_ordering(shape, self.ordering) } + /// Permutes the dimensions of this tensor according to the given permutation array. pub fn permute(&self, permutations: [usize; 4]) -> GpuTensorView<'_, T, B> { self.as_view().permute(permutations) } + /// Permutes the dimensions according to GGML's dimension ordering convention. pub fn permute_ggml(&self, permutations: [usize; 4]) -> GpuTensorView<'_, T, B> { self.as_view().permute_ggml(permutations) } + /// Creates a view of a sub-tensor with the specified offset, shape, and optional strides. pub fn view( &self, offset: u32, @@ -880,6 +920,7 @@ impl GpuTensor { self.as_view().view(offset, shape, stride) } + /// Creates a view using GGML's dimension ordering convention. 
pub fn view_ggml( &self, offset: u32, @@ -894,14 +935,17 @@ impl GpuTensor { self.as_view().column(i) } + /// Returns a view containing `ncols` columns starting from `first_col`. pub fn columns(&self, first_col: u32, ncols: u32) -> GpuTensorView<'_, T, B> { self.as_view().columns(first_col, ncols) } + /// Returns a view of the specified row. pub fn row(&self, i: u32) -> GpuTensorView<'_, T, B> { self.as_view().row(i) } + /// Returns a view containing `nrows` rows starting from `first_row`. pub fn rows(&self, first_row: u32, nrows: u32) -> GpuTensorView<'_, T, B> { self.as_view().rows(first_row, nrows) } @@ -1099,7 +1143,7 @@ macro_rules! append_and_remove( /// /// Panics if the tensor isn’t a vector. The tensor is a vector if: /// - It is a row-major tensor and is made of a single row. Its size is `[1, *, 1, 1]` (where - /// `*` is any non-zero positive integer). + /// `*` is any non-zero positive integer). /// - It is a column-major tensor and its size is made of a single column. Its size is /// `[*, 1, 1, 1]` (where `*` is any non-zero positive integer). /// @@ -1149,7 +1193,7 @@ macro_rules! append_and_remove( /// /// Panics if the tensor isn’t a vector. The tensor is a vector if: /// - It is a row-major tensor and is made of a single row. Its size is `[1, *, 1, 1]` (where - /// `*` is any non-zero positive integer). + /// `*` is any non-zero positive integer). /// - It is a column-major tensor and its size is made of a single column. Its size is /// `[*, 1, 1, 1]` (where `*` is any non-zero positive integer). /// From ce3935732c7c103fbcaa8155433bede21c72f3ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Mon, 27 Oct 2025 17:55:55 +0100 Subject: [PATCH 09/12] chore: cargo fmt --- examples/gemm_bench.rs | 4 ++-- src/tensor.rs | 47 +++++++++++++++++++++++++++++------------- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/examples/gemm_bench.rs b/examples/gemm_bench.rs index 12954bc..2ca5225 100644 --- a/examples/gemm_bench.rs +++ b/examples/gemm_bench.rs @@ -5,9 +5,9 @@ use nalgebra::DMatrix; use slang_hal::Shader; use slang_hal::backend::WebGpu; use slang_hal::backend::{Backend, Encoder}; -use slang_hal::shapes::ViewShapeBuffers; -use slang_hal::tensor::{GpuTensor, TensorBuilder}; use stensor::linalg::{Gemm, GemmVariant}; +use stensor::shapes::ViewShapeBuffers; +use stensor::tensor::GpuTensor; use wgpu::{BufferUsages, Features, Limits}; #[async_std::main] diff --git a/src/tensor.rs b/src/tensor.rs index 88ccad2..22e51ce 100644 --- a/src/tensor.rs +++ b/src/tensor.rs @@ -4,7 +4,7 @@ // TODO: feels like this should be in stensor instead of slang-hal use crate::shapes::{GGML_IDS, MatrixOrdering, ViewShape}; -use bytemuck::{NoUninit}; +use bytemuck::NoUninit; use encase::ShaderType; use nalgebra::{Dim, IsContiguous, Matrix, Storage}; use slang_hal::backend::{Backend, Buffer, DeviceValue, EncaseType, Encoder, ShaderBinding}; @@ -229,14 +229,18 @@ impl GpuTensor { /// The maximum number of elements this tensor can hold without needing a resize of the /// underlying GPU buffer. pub fn capacity(&self) -> u64 - where T: NoUninit { + where + T: NoUninit, + { self.buffer.len() as u64 } /// The maximum number of elements this tensor can hold without needing a resize of the /// underlying GPU buffer. 
pub fn capacity_encased(&self) -> u64 - where T: EncaseType { + where + T: EncaseType, + { self.buffer.len_encased() as u64 } @@ -531,7 +535,9 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { /// /// If it matches, returns the tensor's matrix ordering. pub fn is_entire_tensor(&self) -> Option - where T: NoUninit { + where + T: NoUninit, + { if self.buffer.len() == self.len() as usize && self.offset == 0 { self.is_contiguous() } else { @@ -762,7 +768,9 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorViewMut<'a, T, B> { /// /// If it matches, returns the tensor's matrix ordering. pub fn is_entire_tensor(&self) -> Option - where T: NoUninit { + where + T: NoUninit, + { self.as_ref().is_entire_tensor() } @@ -1023,11 +1031,7 @@ impl GpuTensor { /// # Safety /// /// The returned buffer must be initialized before being read from. - pub fn vector_uninit( - backend: &B, - len: u32, - usage: BufferUsages, - ) -> Result + pub fn vector_uninit(backend: &B, len: u32, usage: BufferUsages) -> Result where T: DeviceValue + NoUninit, { @@ -1136,7 +1140,6 @@ impl<'b, B: Backend, T: DeviceValue> ShaderArgs<'b, B> for GpuTensor { } } - macro_rules! append_and_remove( ($append: ident, $shift_remove: ident, $TraitBound: ident, $capacity: ident, $copy_buffer_to_buffer: ident, $uninit_buffer: ident, $write_buffer: ident) => { /// Append the `data` elements at the end of this tensor if it is a vector. @@ -1277,6 +1280,22 @@ macro_rules! append_and_remove( ); impl GpuTensor { - append_and_remove!(append, shift_remove, NoUninit, capacity, copy_buffer_to_buffer, uninit_buffer, write_buffer); - append_and_remove!(append_encased, shift_remove_encased, EncaseType, capacity_encased, copy_buffer_to_buffer_encased, uninit_buffer_encased, write_buffer_encased); -} \ No newline at end of file + append_and_remove!( + append, + shift_remove, + NoUninit, + capacity, + copy_buffer_to_buffer, + uninit_buffer, + write_buffer + ); + append_and_remove!( + append_encased, + shift_remove_encased, + EncaseType, + capacity_encased, + copy_buffer_to_buffer_encased, + uninit_buffer_encased, + write_buffer_encased + ); +} From ad4dce8c7a846d918e541c41f85703776132bfe0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Mon, 27 Oct 2025 18:02:26 +0100 Subject: [PATCH 10/12] fix clippy & docs --- README.md | 2 +- examples/gemm_bench.rs | 3 +-- src/linalg/contiguous.rs | 3 ++- src/linalg/gemm.rs | 3 ++- src/linalg/gemv.rs | 3 ++- src/linalg/op_assign.rs | 3 ++- src/linalg/reduce.rs | 3 ++- src/tensor.rs | 4 +--- 8 files changed, 13 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index fcff46d..b06e227 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ supported platforms. ### Using Slang In order to compile and run any slang project, be sure to define the `SLANG_DIR` environment variable: -1. Download the Slang compiler libraries for your platform: https://github.com/shader-slang/slang/releases/tag/v2025.16 +1. Download the Slang compiler libraries for your platform: 2. Unzip the downloaded directory, and use its path as value to the `SLANG_DIR` environment variable: `SLANG_DIR=/path/to/slang`. Note that the variable must point to the root of the slang installation (i.e. the directory that contains `bin` and `lib`). We recommend adding that as a system-wide environment variables so that it also becomes available to your IDE. 
\ No newline at end of file diff --git a/examples/gemm_bench.rs b/examples/gemm_bench.rs index 2ca5225..b7cbd46 100644 --- a/examples/gemm_bench.rs +++ b/examples/gemm_bench.rs @@ -1,4 +1,3 @@ -use approx::assert_relative_eq; use indexmap::IndexMap; use minislang::SlangCompiler; use nalgebra::DMatrix; @@ -105,7 +104,7 @@ async fn run_gemm( drop(pass); // Ensure the pass is ended before the encoder is borrowed again. backend.submit(encoder)?; - backend.synchronize(); + backend.synchronize()?; timing[i] = t0.elapsed().as_secs_f32(); backend .slow_read_buffer(result.buffer(), gpu_result.as_mut_slice()) diff --git a/src/linalg/contiguous.rs b/src/linalg/contiguous.rs index d3bafaa..88d3c84 100644 --- a/src/linalg/contiguous.rs +++ b/src/linalg/contiguous.rs @@ -85,7 +85,8 @@ mod test { } async fn gpu_contiguous_generic(backend: impl Backend) { - let compiler = SlangCompiler::new(vec!["../../crates/stensor/shaders".into()]); + let mut compiler = SlangCompiler::new(vec![]); + crate::register_shaders(&mut compiler); let contiguous = super::Contiguous::from_backend(&backend, &compiler).unwrap(); let mut shapes = ViewShapeBuffers::new(&backend); diff --git a/src/linalg/gemm.rs b/src/linalg/gemm.rs index fa2f2b5..4add4e4 100644 --- a/src/linalg/gemm.rs +++ b/src/linalg/gemm.rs @@ -237,7 +237,8 @@ mod test { } async fn gpu_gemm_generic(backend: impl Backend) { - let compiler = SlangCompiler::new(vec!["../../crates/stensor/shaders".into()]); + let mut compiler = SlangCompiler::new(vec![]); + crate::register_shaders(&mut compiler); let gemm = super::Gemm::from_backend(&backend, &compiler).unwrap(); let mut shapes = ViewShapeBuffers::new(&backend); diff --git a/src/linalg/gemv.rs b/src/linalg/gemv.rs index bc4df73..4705782 100644 --- a/src/linalg/gemv.rs +++ b/src/linalg/gemv.rs @@ -349,7 +349,8 @@ mod test { } async fn gpu_gemv_generic(backend: impl Backend) { - let compiler = SlangCompiler::new(vec!["../../crates/stensor/shaders".into()]); + let mut compiler = SlangCompiler::new(vec![]); + crate::register_shaders(&mut compiler); let gemv = super::Gemv::from_backend(&backend, &compiler).unwrap(); let mut shapes = ViewShapeBuffers::new(&backend); diff --git a/src/linalg/op_assign.rs b/src/linalg/op_assign.rs index 97cb00d..55f0db2 100644 --- a/src/linalg/op_assign.rs +++ b/src/linalg/op_assign.rs @@ -187,7 +187,8 @@ mod test { OpAssignVariant::Div, OpAssignVariant::Copy, ]; - let compiler = SlangCompiler::new(vec!["../../crates/stensor/shaders".into()]); + let mut compiler = SlangCompiler::new(vec![]); + crate::register_shaders(&mut compiler); let op_assign = super::OpAssign::from_backend(&backend, &compiler).unwrap(); diff --git a/src/linalg/reduce.rs b/src/linalg/reduce.rs index e39de6a..ae5b908 100644 --- a/src/linalg/reduce.rs +++ b/src/linalg/reduce.rs @@ -93,7 +93,8 @@ mod test { ReduceVariant::Prod, ReduceVariant::SqNorm, ]; - let compiler = SlangCompiler::new(vec!["../../crates/stensor/shaders".into()]); + let mut compiler = SlangCompiler::new(vec![]); + crate::register_shaders(&mut compiler); let reduce = super::Reduce::from_backend(&backend, &compiler).unwrap(); diff --git a/src/tensor.rs b/src/tensor.rs index 22e51ce..c94262c 100644 --- a/src/tensor.rs +++ b/src/tensor.rs @@ -1040,7 +1040,7 @@ impl GpuTensor { /// Allocates a new vector on the gpu initialized from `vector`. /// - /// If `T` does not implement `NoUninit`, use [`GpuMatrix::encase`] instead. + /// If `T` does not implement `NoUninit`, use [`GpuTensor::vector_encased`] instead. 
pub fn vector( backend: &B, vector: impl AsRef<[T]>, @@ -1069,8 +1069,6 @@ impl GpuTensor { } /// Allocates a new vector on the gpu initialized from `vector`. - /// - /// If `T` does not implement `NoUninit`, use [`GpuMatrix::encase`] instead. pub fn vector_encased( backend: &B, vector: impl AsRef<[T]>, From 30c25eeb6a9f40fa39f07f3506572e2c8eacb5d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Mon, 27 Oct 2025 18:06:34 +0100 Subject: [PATCH 11/12] fix tests --- shaders/stensor/linalg/gemm.slang | 2 -- src/linalg/contiguous.rs | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/shaders/stensor/linalg/gemm.slang b/shaders/stensor/linalg/gemm.slang index a69c92c..315fbd4 100644 --- a/shaders/stensor/linalg/gemm.slang +++ b/shaders/stensor/linalg/gemm.slang @@ -26,8 +26,6 @@ void gemm_fast( ) { let local_id = local_id.y; - out[0] = 1.0; - for (var k = 0u; k < shape_m2.ncols; k += 4u) { var sum = float4x4(0.0); diff --git a/src/linalg/contiguous.rs b/src/linalg/contiguous.rs index 88d3c84..3cd66bc 100644 --- a/src/linalg/contiguous.rs +++ b/src/linalg/contiguous.rs @@ -112,8 +112,8 @@ mod test { &backend, &mut shapes, &mut pass, - gpu_tensor.as_view().transposed(), &gpu_output, + gpu_tensor.as_view().transposed(), ) .unwrap(); drop(pass); // Ensure the pass is ended before the encoder is borrowed again. From dca91bcba7136f97427dd7f92de3311f2e5b20bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Mon, 27 Oct 2025 18:09:46 +0100 Subject: [PATCH 12/12] fix CI --- .github/workflows/ci.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index fae9c22..f377112 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -145,7 +145,11 @@ jobs: uses: dtolnay/rust-toolchain@stable - name: Install dependencies - run: sudo apt-get update; sudo apt-get install --no-install-recommends build-essential curl wget file libssl-dev + run: | + sudo apt-get update + sudo apt-get install --no-install-recommends -y \ + build-essential curl wget file libssl-dev \ + libegl1-mesa-dev libgl1-mesa-dri libxcb-xfixes0-dev mesa-vulkan-drivers - name: Retrieve Cache for Slang uses: actions/cache/restore@v4 @@ -159,4 +163,4 @@ jobs: sweep-cache: true - name: Run Cargo Tests run: | - SLANG_DIR=$SLANG_DIR cargo test --verbose \ No newline at end of file + SLANG_DIR=$SLANG_DIR LIBGL_ALWAYS_SOFTWARE=1 cargo test --verbose \ No newline at end of file
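
---
Editor's note — a minimal usage sketch of the rank-1 resizing API introduced in PATCH 01 (and relaxed from `Pod` to `NoUninit` in PATCH 03). This is an illustration only, not part of the patch series: the `demo_resize` helper, the `f32` element type, and the exact usage flags are assumptions; the backend is whatever `slang_hal::backend::Backend` implementation the caller already has (e.g. `WebGpu`).

    use slang_hal::backend::Backend;
    use stensor::tensor::GpuVector;
    use wgpu::BufferUsages;

    /// Hypothetical helper showing `append` / `shift_remove` on a rank-1 tensor.
    fn demo_resize<B: Backend>(backend: &B) -> Result<(), B::Error> {
        // `shift_remove` (and `append` when it has to grow) perform buffer-to-buffer
        // copies, so the tensor must be created with COPY_SRC | COPY_DST in addition
        // to STORAGE.
        let usage = BufferUsages::STORAGE | BufferUsages::COPY_SRC | BufferUsages::COPY_DST;
        let mut v: GpuVector<f32, B> = GpuVector::vector(backend, [1.0, 2.0, 3.0], usage)?;

        // Appending grows the shape along the vector dimension; the underlying buffer
        // is only reallocated (to the next power of two) when the capacity is exceeded.
        v.append(backend, &[4.0, 5.0])?;
        assert_eq!(v.len(), 5);

        // Remove elements 1..3, shifting the tail back to fill the gap. The capacity
        // is unchanged; only the tensor's shape shrinks.
        let removed = v.shift_remove(backend, 1..3)?;
        assert_eq!(removed, 2);
        assert_eq!(v.len(), 3);
        Ok(())
    }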