From fe80efc55c38528e032ab9f671fa8bc420b31e4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Wed, 8 Oct 2025 14:58:59 +0200 Subject: [PATCH 01/12] =?UTF-8?q?feat:=E2=80=AFadd=20support=20for=20resiz?= =?UTF-8?q?ing=20tensors?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build.rs | 1 - shaders/stensor/utils/mat.slang | 0 src/lib.rs | 2 +- src/linalg/contiguous.rs | 8 +- src/linalg/gemm.rs | 8 +- src/linalg/gemv.rs | 8 +- src/linalg/op_assign.rs | 8 +- src/linalg/reduce.rs | 4 +- src/linalg/repeat.rs | 4 +- src/shapes.rs | 2 +- src/tensor.rs | 215 +++++++++++++++++++++++++++++--- 11 files changed, 218 insertions(+), 42 deletions(-) create mode 100644 shaders/stensor/utils/mat.slang diff --git a/build.rs b/build.rs index abd5388..ac0db9d 100644 --- a/build.rs +++ b/build.rs @@ -15,4 +15,3 @@ pub fn main() { slang.compile_all(target, "../shaders", "./src/autogen", &[]); } } - diff --git a/shaders/stensor/utils/mat.slang b/shaders/stensor/utils/mat.slang new file mode 100644 index 0000000..e69de29 diff --git a/src/lib.rs b/src/lib.rs index edcab79..2eed99e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,8 +10,8 @@ use minislang::SlangCompiler; pub mod geometry; pub mod linalg; -pub mod tensor; pub mod shapes; +pub mod tensor; // pub mod utils; diff --git a/src/linalg/contiguous.rs b/src/linalg/contiguous.rs index 32be4dd..d3bafaa 100644 --- a/src/linalg/contiguous.rs +++ b/src/linalg/contiguous.rs @@ -1,7 +1,7 @@ -use slang_hal::backend::Backend; -use slang_hal::function::GpuFunction; use crate::shapes::{MatrixOrdering, ViewShape, ViewShapeBuffers}; use crate::tensor::GpuTensorView; +use slang_hal::backend::Backend; +use slang_hal::function::GpuFunction; use slang_hal::{Shader, ShaderArgs}; #[derive(Shader)] @@ -58,13 +58,13 @@ impl Contiguous { #[cfg(test)] mod test { + use crate::shapes::ViewShapeBuffers; + use crate::tensor::GpuTensor; use minislang::SlangCompiler; use nalgebra::DMatrix; use slang_hal::Shader; use slang_hal::backend::WebGpu; use slang_hal::backend::{Backend, Encoder}; - use crate::shapes::ViewShapeBuffers; - use crate::tensor::GpuTensor; use wgpu::{BufferUsages, Features, Limits}; #[futures_test::test] diff --git a/src/linalg/gemm.rs b/src/linalg/gemm.rs index 95a522a..fa2f2b5 100644 --- a/src/linalg/gemm.rs +++ b/src/linalg/gemm.rs @@ -1,7 +1,7 @@ -use slang_hal::backend::Backend; -use slang_hal::function::GpuFunction; use crate::shapes::{ViewShape, ViewShapeBuffers}; use crate::tensor::GpuTensorView; +use slang_hal::backend::Backend; +use slang_hal::function::GpuFunction; use slang_hal::{Shader, ShaderArgs}; #[derive(Shader)] @@ -197,13 +197,13 @@ impl Gemm { #[cfg(test)] mod test { use crate::GemmVariant; + use crate::shapes::ViewShapeBuffers; + use crate::tensor::GpuTensor; use approx::relative_eq; use minislang::SlangCompiler; use nalgebra::DMatrix; use slang_hal::Shader; use slang_hal::backend::{Backend, Encoder, WebGpu}; - use crate::shapes::ViewShapeBuffers; - use crate::tensor::GpuTensor; use wgpu::{BufferUsages, Features, Limits}; #[futures_test::test] diff --git a/src/linalg/gemv.rs b/src/linalg/gemv.rs index 66e6c6b..bc4df73 100644 --- a/src/linalg/gemv.rs +++ b/src/linalg/gemv.rs @@ -1,7 +1,7 @@ -use slang_hal::backend::Backend; -use slang_hal::function::GpuFunction; use crate::shapes::{MatrixOrdering, ViewShape, ViewShapeBuffers}; use crate::tensor::GpuTensorView; +use slang_hal::backend::Backend; +use slang_hal::function::GpuFunction; use slang_hal::{Shader, ShaderArgs}; /// Indicates 
if a matrix needs to be considered as-is or as its transpose when running a matrix @@ -308,14 +308,14 @@ impl Gemv { #[cfg(test)] mod test { use crate::GemvVariant; + use crate::shapes::ViewShapeBuffers; + use crate::tensor::GpuTensor; use approx::assert_relative_eq; use minislang::SlangCompiler; use nalgebra::{DMatrix, DVector}; use slang_hal::Shader; use slang_hal::backend::WebGpu; use slang_hal::backend::{Backend, Encoder}; - use crate::shapes::ViewShapeBuffers; - use crate::tensor::GpuTensor; use wgpu::{BufferUsages, Features, Limits}; #[futures_test::test] diff --git a/src/linalg/op_assign.rs b/src/linalg/op_assign.rs index 5398556..97cb00d 100644 --- a/src/linalg/op_assign.rs +++ b/src/linalg/op_assign.rs @@ -1,7 +1,7 @@ -use slang_hal::backend::Backend; -use slang_hal::function::GpuFunction; use crate::shapes::{ViewShape, ViewShapeBuffers}; use crate::tensor::GpuTensorView; +use slang_hal::backend::Backend; +use slang_hal::function::GpuFunction; use slang_hal::{Shader, ShaderArgs}; #[derive(Copy, Clone, PartialEq, Eq, Debug)] @@ -155,13 +155,13 @@ impl OpAssign { #[cfg(test)] mod test { use super::{BinOpArgs, OpAssignVariant}; + use crate::shapes::ViewShapeBuffers; + use crate::tensor::GpuTensor; use minislang::SlangCompiler; use nalgebra::DVector; use slang_hal::backend::WebGpu; use slang_hal::backend::{Backend, Buffer, Encoder}; use slang_hal::shader::Shader; - use crate::shapes::ViewShapeBuffers; - use crate::tensor::GpuTensor; use wgpu::BufferUsages; #[futures_test::test] diff --git a/src/linalg/reduce.rs b/src/linalg/reduce.rs index a6f06bc..e39de6a 100644 --- a/src/linalg/reduce.rs +++ b/src/linalg/reduce.rs @@ -53,14 +53,14 @@ pub struct Reduce { #[cfg(test)] mod test { use super::ReduceVariant; + use crate::shapes::{ViewShape, ViewShapeBuffers}; + use crate::tensor::GpuTensor; use minislang::SlangCompiler; use nalgebra::DVector; use slang_hal::ShaderArgs; use slang_hal::backend::WebGpu; use slang_hal::backend::{Backend, Encoder}; use slang_hal::shader::Shader; - use crate::shapes::{ViewShape, ViewShapeBuffers}; - use crate::tensor::GpuTensor; use wgpu::BufferUsages; #[derive(ShaderArgs)] diff --git a/src/linalg/repeat.rs b/src/linalg/repeat.rs index 319c057..c5f297b 100644 --- a/src/linalg/repeat.rs +++ b/src/linalg/repeat.rs @@ -1,7 +1,7 @@ -use slang_hal::backend::Backend; -use slang_hal::function::GpuFunction; use crate::shapes::{ViewShape, ViewShapeBuffers}; use crate::tensor::GpuTensorView; +use slang_hal::backend::Backend; +use slang_hal::function::GpuFunction; use slang_hal::{Shader, ShaderArgs}; /// Slang module for replicating the content of a source tensor as many times as possible to fill diff --git a/src/shapes.rs b/src/shapes.rs index b9baae3..47619ef 100644 --- a/src/shapes.rs +++ b/src/shapes.rs @@ -268,7 +268,7 @@ impl ViewShapeBuffers { let mut recycled = self.recycled.lock().unwrap(); let buffer = if let Some(mut buffer) = recycled.pop() { - backend.write_buffer(&mut buffer, &[shape])?; + backend.write_buffer(&mut buffer, 0, &[shape])?; buffer } else { // println!("Couldn’t find recycling for {:?}", shape); diff --git a/src/tensor.rs b/src/tensor.rs index 6cbb2f4..c486ef6 100644 --- a/src/tensor.rs +++ b/src/tensor.rs @@ -3,19 +3,20 @@ // TODO: feels like this should be in stensor instead of slang-hal -use slang_hal::backend::{Backend, Buffer, DeviceValue, EncaseType, Encoder, ShaderBinding}; use crate::shapes::{GGML_IDS, MatrixOrdering, ViewShape}; use bytemuck::Pod; use encase::ShaderType; use nalgebra::{Dim, IsContiguous, Matrix, Storage}; +use 
slang_hal::backend::{Backend, Buffer, DeviceValue, EncaseType, Encoder, ShaderBinding}; +use std::ops::{Bound, RangeBounds}; use std::sync::Arc; use slang_hal::backend::WebGpu; use wgpu::BufferUsages; -use slang_hal::ShaderArgs; #[cfg(feature = "cuda")] use crate::cuda::Cuda; +use slang_hal::ShaderArgs; use slang_hal::shader::ShaderArgsError; /// Helper struct for creating gpu storage buffers (scalars, vectors, matrices, tensors). @@ -69,16 +70,12 @@ impl TensorBuilder { self } - /// Builds the gpu tensor. - /// - /// # Safety - /// - /// The returned buffer must be initialized before being read from. - pub unsafe fn build_uninit( + /// Builds the uninitialized gpu tensor. + pub fn build_uninit( self, backend: &B, ) -> Result, B::Error> { - let buffer = unsafe { backend.uninit_buffer(self.len() as usize, self.usage)? }; + let buffer = backend.uninit_buffer(self.len() as usize, self.usage)?; Ok(GpuTensor { shape: self.shape, buffer, @@ -91,11 +88,11 @@ impl TensorBuilder { /// # Safety /// /// The returned buffer must be initialized before being read from. - pub unsafe fn build_uninit_encased( + pub fn build_uninit_encased( self, backend: &B, ) -> Result, B::Error> { - let buffer = unsafe { backend.uninit_buffer_encased(self.len() as usize, self.usage)? }; + let buffer = backend.uninit_buffer_encased(self.len() as usize, self.usage)?; Ok(GpuTensor { shape: self.shape, buffer, @@ -215,6 +212,25 @@ impl GpuTensor { self.shape.into_iter().map(|s| s as u64).product() } + /// The maximum number of elements this tensor can hold without needing a resize of the + /// underlying GPU buffer. + pub fn capacity(&self) -> u64 + where T: Pod { + self.buffer.len() as u64 + } + + /// The maximum number of elements this tensor can hold without needing a resize of the + /// underlying GPU buffer. + pub fn capacity_encased(&self) -> u64 + where T: EncaseType { + self.buffer.len_encased() as u64 + } + + /// The tensor’s order (i.e. the number of dimensions with a size > 1). + pub fn order(&self) -> u8 { + self.shape.iter().map(|s| (*s > 1) as u8).sum() + } + /// Size of this tensor along the dimension `i`. pub fn size(&self, i: usize) -> u32 { self.shape[i] @@ -354,6 +370,20 @@ impl GpuTensor { } } + fn vector_dim(&self) -> usize { + let dim = match self.ordering { + MatrixOrdering::RowMajor => 1, + MatrixOrdering::ColumnMajor => 0, + }; + let mut required_shape = [1; 4]; + required_shape[dim] = self.shape[dim]; + assert_eq!( + required_shape, self.shape, + "Operation only supported on vector tensors." + ); + dim + } + // /// Reads the buffer’s content into a vector. // pub async fn read_bytes<'a>(&'a self, device: &'a Device) -> anyhow::Result> { // // TODO: could probably be optimized? @@ -486,7 +516,8 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { /// its underlying `GpuTensor`. /// /// If it matches, returns the tensor's matrix ordering. - pub fn is_entire_tensor(&self) -> Option { + pub fn is_entire_tensor(&self) -> Option + where T: Pod { if self.buffer.len() == self.len() as usize && self.offset == 0 { self.is_contiguous() } else { @@ -699,7 +730,8 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorViewMut<'a, T, B> { /// its underlying `GpuTensor`. /// /// If it matches, returns the tensor's matrix ordering. - pub fn is_entire_tensor(&self) -> Option { + pub fn is_entire_tensor(&self) -> Option + where T: Pod { self.as_ref().is_entire_tensor() } @@ -942,7 +974,7 @@ impl GpuTensor { /// # Safety /// /// The returned buffer must be initialized before being read from. 
- pub unsafe fn vector_uninit( + pub fn vector_uninit( backend: &B, len: u32, usage: BufferUsages, @@ -950,7 +982,7 @@ impl GpuTensor { where T: DeviceValue + Pod, { - unsafe { TensorBuilder::vector(len, usage).build_uninit(backend) } + TensorBuilder::vector(len, usage).build_uninit(backend) } /// Allocates a new vector on the gpu initialized from `vector`. @@ -972,7 +1004,7 @@ impl GpuTensor { /// # Safety /// /// The returned buffer must be initialized before being read from. - pub unsafe fn vector_uninit_encased( + pub fn vector_uninit_encased( backend: &B, len: u32, usage: BufferUsages, @@ -980,7 +1012,7 @@ impl GpuTensor { where T: DeviceValue + EncaseType, { - unsafe { TensorBuilder::vector(len, usage).build_uninit_encased(backend) } + TensorBuilder::vector(len, usage).build_uninit_encased(backend) } /// Allocates a new vector on the gpu initialized from `vector`. @@ -1017,11 +1049,11 @@ impl GpuTensor { /// # Safety /// /// The returned buffer must be initialized before being read from. - pub unsafe fn scalar_uninit_encased(backend: &B, usage: BufferUsages) -> Result + pub fn scalar_uninit_encased(backend: &B, usage: BufferUsages) -> Result where T: DeviceValue + EncaseType, { - unsafe { TensorBuilder::scalar(usage).build_uninit_encased(backend) } + TensorBuilder::scalar(usage).build_uninit_encased(backend) } /// Allocates a new gpu storage buffer with a single element initialized to `value`. @@ -1054,3 +1086,148 @@ impl<'b, B: Backend, T: DeviceValue> ShaderArgs<'b, B> for GpuTensor { self.buffer.write_arg(binding, name, dispatch) } } + + +macro_rules! append_and_remove( + ($append: ident, $shift_remove: ident, $TraitBound: ident, $capacity: ident, $copy_buffer_to_buffer: ident, $uninit_buffer: ident, $write_buffer: ident) => { + /// Append the `data` elements at the end of this tensor if it is a vector. + /// + /// Panics if the tensor isn’t a vector. The tensor is a vector if: + /// - It is a row-major tensor and is made of a single row. Its size is `[1, *, 1, 1]` (where + /// `*` is any non-zero positive integer). + /// - It is a column-major tensor and its size is made of a single column. Its size is + /// `[*, 1, 1, 1]` (where `*` is any non-zero positive integer). + /// + /// If the underlying GPU buffer is too small to contain the extra elements, it is automatically + /// resized. If a resize happens, the tensor’s capacity is the next power of two sufficient + /// to contain the appended data. + // TODO: broadcast automatically to generalize to any tensor order. + pub fn $append(&mut self, backend: &B, data: &[T]) -> Result<(), B::Error> + where + T: $TraitBound, + { + let dim_to_grow = self.vector_dim(); + let num_added = data.len(); + let curr_len = self.shape[dim_to_grow]; + let new_len = curr_len + num_added as u32; + + let mut encoder = backend.begin_encoding(); + + + if new_len as u64 >= self.$capacity() { + // We need to grow the buffer. + let new_capacity = new_len.next_power_of_two(); + // SAFETY: will be initialized by the buffer init. + let mut new_buffer = backend.$uninit_buffer( + new_capacity as usize, + self.buffer().usage() | BufferUsages::COPY_DST + )?; + + encoder.$copy_buffer_to_buffer( + &self.buffer, + 0, + &mut new_buffer, + 0, + curr_len as usize, + )?; + self.buffer = new_buffer; + } + + backend.$write_buffer(&mut self.buffer, curr_len as u64, data)?; + backend.submit(encoder)?; + self.shape[dim_to_grow] = new_len; + Ok(()) + } + + /// Removes a `range` of elements from this tensor if it is a vector, shifting back elements to + /// fill the gap. 
+ /// + /// Panics if the tensor isn’t a vector. The tensor is a vector if: + /// - It is a row-major tensor and is made of a single row. Its size is `[1, *, 1, 1]` (where + /// `*` is any non-zero positive integer). + /// - It is a column-major tensor and its size is made of a single column. Its size is + /// `[*, 1, 1, 1]` (where `*` is any non-zero positive integer). + /// + /// This method doesn’t change the tensor’s capacity so the internal GPU buffer isn’t resized. + /// + /// # Performance note + /// + /// This method is currently fairly expensive as it always involves the creation of a staging + /// buffer for copying the data being moved. The staging buffer size is equal to the number of + /// moved elements. + /// + /// # Panic + /// + /// Panics if `self` wasn’t created with the `BufferUsages::COPY_SRC | BufferUsages::COPY_DST` flags. + /// Panics if the range is out of the bounds of `self`. + /// + /// # Return + /// + /// If the operation suceeded, returns the number of removed elements. + // TODO: add a special case for targets capable of copying slices within the same buffer. + // TODO: it would be worth benchmarking with doing the shift with a compute shader instead. + pub fn $shift_remove( + &mut self, + backend: &B, + range: impl RangeBounds, + ) -> Result + where T: $TraitBound { + let dim_to_shrink = self.vector_dim(); + let curr_len = self.shape[dim_to_shrink] as usize; + let range_start = match range.start_bound() { + Bound::Included(i) => *i, + Bound::Excluded(i) => *i + 1, + Bound::Unbounded => 0, + }; + let range_end = match range.end_bound() { + Bound::Included(i) => *i + 1, + Bound::Excluded(i) => *i, + Bound::Unbounded => curr_len, + }; + + if range_end <= range_start { + // The range to remove is empty. + return Ok(0); + } + + assert!(range_end <= curr_len, "Range index out of bounds."); + let num_elements_to_move = curr_len - range_end; + + // NOTE: if `curr_end == range_end` we don’t actually need to move any data, shrinking + // the shape is sufficient. + if num_elements_to_move > 0 { + // SAFETY: will be initialized with a buffer-to-buffer copy. 
+ let mut staging = backend.$uninit_buffer( + num_elements_to_move, + BufferUsages::STORAGE | BufferUsages::COPY_DST | BufferUsages::COPY_SRC, + )?; + + let mut encoder = backend.begin_encoding(); + encoder.$copy_buffer_to_buffer( + &self.buffer, + range_end, + &mut staging, + 0, + num_elements_to_move, + )?; + encoder.$copy_buffer_to_buffer( + &staging, + 0, + &mut self.buffer, + range_start, + num_elements_to_move, + )?; + backend.submit(encoder)?; + } + + let num_removed = range_end - range_start; + self.shape[dim_to_shrink] -= num_removed as u32; + Ok(num_removed) + } + } +); + +impl GpuTensor { + append_and_remove!(append, shift_remove, Pod, capacity, copy_buffer_to_buffer, uninit_buffer, write_buffer); + append_and_remove!(append_encased, shift_remove_encased, EncaseType, capacity_encased, copy_buffer_to_buffer_encased, uninit_buffer_encased, write_buffer_encased); +} \ No newline at end of file From cd8e9300121460fc17af07cf05ca9842ee1e4a74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Wed, 8 Oct 2025 14:59:28 +0200 Subject: [PATCH 02/12] =?UTF-8?q?feat:=E2=80=AFadd=20diagonal=20and=20trac?= =?UTF-8?q?e=20functions=20for=20matrices?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- shaders/stensor/utils/mat.slang | 55 +++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/shaders/stensor/utils/mat.slang b/shaders/stensor/utils/mat.slang index e69de29..f62ec56 100644 --- a/shaders/stensor/utils/mat.slang +++ b/shaders/stensor/utils/mat.slang @@ -0,0 +1,55 @@ +module mat; + +/* + * Trace of a matrix. + */ + +/// The trace of a 2x2 matrix. +public func trace(m: float2x2) -> float { + return m[0][0] + m[1][1]; +} + +/// The trace of a 3x3 matrix. +public func trace(m: float3x3) -> float { + return m[0][0] + m[1][1] + m[2][2]; +} + +/// The trace of a 4x4 matrix. +public func trace(m: float4x4) -> float { + return m[0][0] + m[1][1] + m[2][2] + m[3][3]; +} + +/* + * Diagonal extraction and diagonal matrix init. + */ + +/// Initializes a diagonal 2x2 matrix. +public func diag(d: float2) -> float2x2 { + return float2x2(float2(d.x, 0.0), float2(0.0, d.y)); +} + +/// Initializes a diagonal 3x3 matrix. +public func diag(d: float3) -> float3x3 { + return float3x3(float3(d.x, 0.0, 0.0), float3(0.0, d.y, 0.0), float3(0.0, 0.0, d.z)); +} + +/// Initializes a diagonal 4x4 matrix. +public func diag(d: float4) -> float4x4 { + return float4x4(float4(d.x, 0.0, 0.0, 0.0), float4(0.0, d.y, 0.0, 0.0), float4(0.0, 0.0, d.z, 0.0), float4(0.0, 0.0, 0.0, d.w)); +} + + +/// Return the diagonal of a 2x2 matrix. +public func diag(m: float2x2) -> float2 { + return float2(m[0][0], m[1][1]); +} + +/// Return the diagonal of a 3x3 matrix. +public func diag(m: float3x3) -> float3 { + return float3(m[0][0], m[1][1], m[2][2]); +} + +/// Return the diagonal of a 4x4 matrix. 
+public func diag(m: float4x4) -> float4 { + return float4(m[0][0], m[1][1], m[2][2], m[3][3]); +} \ No newline at end of file From f00dd884297f3ea19e4e3e9c77ce7025b462ed0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Tue, 21 Oct 2025 10:19:59 +0200 Subject: [PATCH 03/12] =?UTF-8?q?feat:=E2=80=AFweaken=20trait=20requiremen?= =?UTF-8?q?ts=20for=20tensor=20read/init/writes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/tensor.rs | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/src/tensor.rs b/src/tensor.rs index c486ef6..2812c6c 100644 --- a/src/tensor.rs +++ b/src/tensor.rs @@ -4,7 +4,7 @@ // TODO: feels like this should be in stensor instead of slang-hal use crate::shapes::{GGML_IDS, MatrixOrdering, ViewShape}; -use bytemuck::Pod; +use bytemuck::{NoUninit}; use encase::ShaderType; use nalgebra::{Dim, IsContiguous, Matrix, Storage}; use slang_hal::backend::{Backend, Buffer, DeviceValue, EncaseType, Encoder, ShaderBinding}; @@ -71,7 +71,7 @@ impl TensorBuilder { } /// Builds the uninitialized gpu tensor. - pub fn build_uninit( + pub fn build_uninit( self, backend: &B, ) -> Result, B::Error> { @@ -127,7 +127,7 @@ impl TensorBuilder { // } /// Builds this tensor with an array of values given for its initial value. - pub fn build_init( + pub fn build_init( self, backend: &B, data: &[T], @@ -212,10 +212,15 @@ impl GpuTensor { self.shape.into_iter().map(|s| s as u64).product() } + // /// The tensor’s rank. + // pub fn rank(&self) -> u64 { + // self.shape.iter().filter(|i| **i != 1).count() as u64 + // } + /// The maximum number of elements this tensor can hold without needing a resize of the /// underlying GPU buffer. pub fn capacity(&self) -> u64 - where T: Pod { + where T: NoUninit { self.buffer.len() as u64 } @@ -293,7 +298,7 @@ impl GpuTensor { source: impl Into>, ) -> Result<(), B::Error> where - T: DeviceValue + Pod, + T: DeviceValue + NoUninit, { let source = source.into(); let copy_len = self.len(); @@ -517,7 +522,7 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { /// /// If it matches, returns the tensor's matrix ordering. pub fn is_entire_tensor(&self) -> Option - where T: Pod { + where T: NoUninit { if self.buffer.len() == self.len() as usize && self.offset == 0 { self.is_contiguous() } else { @@ -731,7 +736,7 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorViewMut<'a, T, B> { /// /// If it matches, returns the tensor's matrix ordering. pub fn is_entire_tensor(&self) -> Option - where T: Pod { + where T: NoUninit { self.as_ref().is_entire_tensor() } @@ -902,13 +907,13 @@ impl GpuTensor { } } -impl GpuTensor { +impl GpuTensor { /// Allocates a new matrix on the gpu with uninitialized elements. /// /// # Safety /// /// The returned buffer must be initialized before being read from. - pub unsafe fn matrix_uninit( + pub fn matrix_uninit( backend: &B, nrows: u32, ncols: u32, @@ -917,7 +922,7 @@ impl GpuTensor { where T: DeviceValue, { - unsafe { TensorBuilder::matrix(nrows, ncols, usage).build_uninit(backend) } + TensorBuilder::matrix(nrows, ncols, usage).build_uninit(backend) } // pub fn uninit_encased(device: &Device, nrows: u32, ncols: u32, usage: BufferUsages) -> Self @@ -980,21 +985,21 @@ impl GpuTensor { usage: BufferUsages, ) -> Result where - T: DeviceValue + Pod, + T: DeviceValue + NoUninit, { TensorBuilder::vector(len, usage).build_uninit(backend) } /// Allocates a new vector on the gpu initialized from `vector`. 
/// - /// If `T` does not implement `Pod`, use [`GpuMatrix::encase`] instead. + /// If `T` does not implement `NoUninit`, use [`GpuMatrix::encase`] instead. pub fn vector( backend: &B, vector: impl AsRef<[T]>, usage: BufferUsages, ) -> Result where - T: DeviceValue + Pod, + T: DeviceValue + NoUninit, { let v = vector.as_ref(); TensorBuilder::vector(v.len() as u32, usage).build_init(backend, v.as_ref()) @@ -1017,7 +1022,7 @@ impl GpuTensor { /// Allocates a new vector on the gpu initialized from `vector`. /// - /// If `T` does not implement `Pod`, use [`GpuMatrix::encase`] instead. + /// If `T` does not implement `NoUninit`, use [`GpuMatrix::encase`] instead. pub fn vector_encased( backend: &B, vector: impl AsRef<[T]>, @@ -1037,11 +1042,11 @@ impl GpuTensor { /// # Safety /// /// The returned buffer must be initialized before being read from. - pub unsafe fn scalar_uninit(backend: &B, usage: BufferUsages) -> Result + pub fn scalar_uninit(backend: &B, usage: BufferUsages) -> Result where - T: DeviceValue + Pod, + T: DeviceValue + NoUninit, { - unsafe { TensorBuilder::scalar(usage).build_uninit(backend) } + TensorBuilder::scalar(usage).build_uninit(backend) } /// Allocates a new gpu storage buffer with a single uninitialized element. @@ -1059,7 +1064,7 @@ impl GpuTensor { /// Allocates a new gpu storage buffer with a single element initialized to `value`. pub fn scalar(backend: &B, value: T, usage: BufferUsages) -> Result where - T: DeviceValue + Pod, + T: DeviceValue + NoUninit, { TensorBuilder::scalar(usage).build_init(backend, &[value]) } @@ -1228,6 +1233,6 @@ macro_rules! append_and_remove( ); impl GpuTensor { - append_and_remove!(append, shift_remove, Pod, capacity, copy_buffer_to_buffer, uninit_buffer, write_buffer); + append_and_remove!(append, shift_remove, NoUninit, capacity, copy_buffer_to_buffer, uninit_buffer, write_buffer); append_and_remove!(append_encased, shift_remove_encased, EncaseType, capacity_encased, copy_buffer_to_buffer_encased, uninit_buffer_encased, write_buffer_encased); } \ No newline at end of file From c092544a852bf1a7a7fde58d8923401d101a3406 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Mon, 27 Oct 2025 16:00:29 +0100 Subject: [PATCH 04/12] feat: update to wgpu 27 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index db060ba..9ba15e8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ cuda = [ "cudarc", "slang-hal/cuda" ] cublas = [ "slang-hal/cublas" ] [dependencies] -wgpu = "26" +wgpu = "27" encase = "0.12" bytemuck = "1" nalgebra = { version = "0.34", features = ["encase"] } From 682f79f235eeae1354ccc7960fa2488f67fd1bea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Mon, 27 Oct 2025 17:46:31 +0100 Subject: [PATCH 05/12] =?UTF-8?q?chore:=E2=80=AFadd=20CI=20workflow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/ci.yaml | 162 ++++++++++++++++++++++++++++ .github/workflows/download_slang.sh | 109 +++++++++++++++++++ 2 files changed, 271 insertions(+) create mode 100644 .github/workflows/ci.yaml create mode 100755 .github/workflows/download_slang.sh diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 0000000..fae9c22 --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,162 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +env: + CARGO_TERM_COLOR: always + RUSTFLAGS: --deny warnings + RUSTDOCFLAGS: 
--deny warnings + SLANG_TAG: 2025.18.2 + +jobs: + # Check formatting. + format: + name: Format + runs-on: ubuntu-latest + timeout-minutes: 30 + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + with: + components: rustfmt + + - name: Run cargo fmt + run: cargo fmt --all -- --check + setup-slang: + strategy: + matrix: + os: [ubuntu-latest] + runs-on: ${{ matrix.os }} + outputs: + slang-dir: ${{ steps.setup.outputs.slang-dir }} # Pass SLANG_DIR to dependent jobs + slang-cache-key: ${{ steps.setup.outputs.slang-cache-key }} # Pass SLANG_DIR to dependent jobs + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Cache Slang + id: cache-slang + uses: actions/cache/restore@v4 # Restore first + with: + path: | + ~/.cache/slang # Matches script's default OUTPUT_DIR + key: slang-v$SLANG_TAG-${{ runner.os }}-${{ runner.arch }} + + - name: Setup Slang + id: setup + run: | + echo "version=$SLANG_TAG" >> $GITHUB_OUTPUT # Output for cache key + SLANG_DIR=$(./.github/workflows/download_slang.sh --version $SLANG_TAG | grep '^SLANG_DIR=' | cut -d'=' -f2-) + echo "slang-dir=$SLANG_DIR" >> $GITHUB_OUTPUT # Output for dependents + echo "slang-cache-key=slang-v$SLANG_TAG-${{ runner.os }}-${{ runner.arch }}" >> $GITHUB_OUTPUT + echo "SLANG_DIR=$SLANG_DIR" >> $GITHUB_ENV # For this job if needed + + - name: Save Slang Cache + if: steps.cache-slang.outputs.cache-hit != 'true' # Only save on miss + uses: actions/cache/save@v4 + with: + path: ~/.cache/slang + key: ${{ steps.setup.outputs.slang-cache-key }} + # Run clippy lints. + clippy: + needs: setup-slang # Depends on setup-slang + name: Clippy + runs-on: ubuntu-latest + env: + SLANG_DIR: ${{ needs.setup-slang.outputs.slang-dir }} + timeout-minutes: 30 + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + with: + components: clippy + + - name: Install dependencies + run: sudo apt-get update; sudo apt-get install --no-install-recommends build-essential curl wget file libssl-dev + + - name: Retrieve Cache for Slang + uses: actions/cache/restore@v4 + with: + path: ~/.cache/slang + key: ${{ needs.setup-slang.outputs.slang-cache-key }} + + - name: Populate target directory from cache + uses: Leafwing-Studios/cargo-cache@v2 + with: + sweep-cache: true + + - name: Run clippy lints + run: SLANG_DIR=$SLANG_DIR cargo clippy --locked --workspace --all-targets -- --deny warnings + + # Check documentation. + doc: + needs: setup-slang # Depends on setup-slang + name: Docs + runs-on: ubuntu-latest + timeout-minutes: 30 + env: + SLANG_DIR: ${{ needs.setup-slang.outputs.slang-dir }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Install dependencies + run: sudo apt-get update; sudo apt-get install --no-install-recommends build-essential curl wget file libssl-dev + + - name: Retrieve Cache for Slang + uses: actions/cache/restore@v4 + with: + path: ~/.cache/slang + key: ${{ needs.setup-slang.outputs.slang-cache-key }} + + - name: Populate target directory from cache + uses: Leafwing-Studios/cargo-cache@v2 + with: + sweep-cache: true + + - name: Check documentation + run: SLANG_DIR=$SLANG_DIR cargo doc --locked --workspace --document-private-items --no-deps + # Testing. 
+ test: + needs: setup-slang # Depends on setup-slang + name: Tests + runs-on: ubuntu-latest + timeout-minutes: 30 + env: + SLANG_DIR: ${{ needs.setup-slang.outputs.slang-dir }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Install dependencies + run: sudo apt-get update; sudo apt-get install --no-install-recommends build-essential curl wget file libssl-dev + + - name: Retrieve Cache for Slang + uses: actions/cache/restore@v4 + with: + path: ~/.cache/slang + key: ${{ needs.setup-slang.outputs.slang-cache-key }} + + - name: Populate target directory from cache + uses: Leafwing-Studios/cargo-cache@v2 + with: + sweep-cache: true + - name: Run Cargo Tests + run: | + SLANG_DIR=$SLANG_DIR cargo test --verbose \ No newline at end of file diff --git a/.github/workflows/download_slang.sh b/.github/workflows/download_slang.sh new file mode 100755 index 0000000..0d1738f --- /dev/null +++ b/.github/workflows/download_slang.sh @@ -0,0 +1,109 @@ +#!/bin/bash + +# Default values +OS="" +OUTPUT_DIR="$HOME/.cache/slang" +SLANG_VERSION="" +SLANG_TAG="" +ASSET_SUFFIX="" +SLANG_URL_BASE="https://github.com/shader-slang/slang/releases/download" + +# Help message +usage() { + echo "Usage: $0 [--os ] [--output-dir ] [--version ]" + echo " --os: Target OS (default: auto-detect from current platform)" + echo " --output-dir: Directory to extract Slang (default: ~/.cache/slang)" + echo " --version: Slang version (e.g., 2025.18.2, default: latest)" + echo "Example: $0 --os linux --output-dir /tmp/slang" +} + +# Parse arguments +while [[ "$#" -gt 0 ]]; do + case $1 in + --os) OS="$2"; shift ;; + --output-dir) export OUTPUT_DIR="$2"; shift ;; + --version) export SLANG_VERSION="$2"; shift ;; + *) usage ; exit 1 ;; + esac + shift +done + +# Detect OS if not specified +if [[ -z "$OS" ]]; then + case "$(uname -s)" in + Linux*) OS="linux" ;; + Darwin*) + if [[ "$(uname -m)" == "arm64" ]]; then + OS="macos-aarch64" + else + OS="macos" + fi + ;; + CYGWIN*|MINGW*|MSYS*) OS="windows" ;; + *) echo "Error: Unable to detect OS. Specify --os (linux, macos, macos-arm64, windows)"; exit 1 ;; + esac +fi + +# Determine asset suffix based on OS +case "$OS" in + linux) ASSET_SUFFIX="linux-x86_64.zip" ;; + macos) ASSET_SUFFIX="macos-x86_64.zip" ;; + macos-aarch64) ASSET_SUFFIX="macos-aarch64.zip" ;; + windows) ASSET_SUFFIX="windows-x86_64.zip" ;; + *) echo "Error: Unsupported OS: $OS"; exit 1 ;; +esac + +# Get Slang version if not specified +if [[ -z "$SLANG_VERSION" ]]; then + export SLANG_TAG=$(curl -s https://api.github.com/repos/shader-slang/slang/releases/latest | grep '"tag_name"' | sed -E 's/.*"([^"]+)".*/\1/') + export SLANG_VERSION=$(echo "$SLANG_TAG" | sed 's/v//') # e.g., v2025.18.2 -> 2025.18.2 +else + export SLANG_TAG="v$SLANG_VERSION" +fi + +if [[ -z "$SLANG_VERSION" ]]; then + echo "Error: Could not determine Slang version" + exit 1 +fi + +# Set up paths +SLANG_DIR="$OUTPUT_DIR/slang-v$SLANG_VERSION-$OS" +ZIP_URL="$SLANG_URL_BASE/$SLANG_TAG/slang-$SLANG_VERSION-$ASSET_SUFFIX" +TEMP_ZIP="/tmp/slang-$SLANG_VERSION.zip" + +# Check if Slang is already extracted +if [[ -d "$SLANG_DIR" ]] && [[ -f "$SLANG_DIR/bin/slangc" || -f "$SLANG_DIR/bin/slangc.exe" ]]; then + echo "Using existing Slang at $SLANG_DIR" + echo "SLANG_DIR=$SLANG_DIR" + exit 0 +fi + +# Download Slang release +echo "Downloading Slang v$SLANG_VERSION for $OS from $ZIP_URL..." 
+mkdir -p "$OUTPUT_DIR" +curl -L -o "$TEMP_ZIP" "$ZIP_URL" || { echo "Error: Download failed for $ZIP_URL"; exit 1; } + +# Extract based on OS +echo "Extracting to $SLANG_DIR..." +if [[ "$OS" == "windows" ]]; then + # Windows: Assume 7z is available (or adjust for PowerShell/Expand-Archive) + 7z x "$TEMP_ZIP" -o"$SLANG_DIR" -y > /dev/null || { echo "Error: Extraction failed"; rm -f "$TEMP_ZIP"; exit 1; } +else + # Linux/macOS: Use unzip + unzip -q "$TEMP_ZIP" -d "$SLANG_DIR" || { echo "Error: Extraction failed"; rm -f "$TEMP_ZIP"; exit 1; } +fi + +# Clean up +rm -f "$TEMP_ZIP" + +# Verify extraction +if [[ ! -f "$SLANG_DIR/bin/slangc" && ! -f "$SLANG_DIR/bin/slangc.exe" ]]; then + echo "Error: Extraction incomplete, slangc not found in $SLANG_DIR/bin" + exit 1 +fi + +echo "Slang v$SLANG_VERSION extracted to $SLANG_DIR" +echo "SLANG_DIR=$SLANG_DIR" + +# For use in calling script +export SLANG_DIR \ No newline at end of file From 9fbe21007b7e0f73ea8405a44508405348755c4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Mon, 27 Oct 2025 17:46:44 +0100 Subject: [PATCH 06/12] feat: update to slang-hal/minislang v0.2 --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9ba15e8..d8a02d8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,8 +19,8 @@ nalgebra = { version = "0.34", features = ["encase"] } cudarc = { version = "0.16", optional = true } -minislang = "0.1" -slang-hal = { version = "0.1", features = ["derive"] } +minislang = "0.2" +slang-hal = { version = "0.2", features = ["derive"] } include_dir = "0.7" [dev-dependencies] From 3f8bdc2892633589139c7ffb6e95e3e0d923c387 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Mon, 27 Oct 2025 17:47:38 +0100 Subject: [PATCH 07/12] fead: fix svd3 for near-identity matrices --- shaders/stensor/geometry/svd3.slang | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/shaders/stensor/geometry/svd3.slang b/shaders/stensor/geometry/svd3.slang index 463eac9..fcce4c7 100644 --- a/shaders/stensor/geometry/svd3.slang +++ b/shaders/stensor/geometry/svd3.slang @@ -49,8 +49,10 @@ public struct Svd3 { // Constants used for calculation of givens quaternions static const float GAMMA = 5.828427124; // sqrt(8)+3; -static const float CSTAR = 0.923879532; // cos(pi/8) -static const float SSTAR = 0.3826834323; // sin(p/8) +static const float CSTAR = 1.0; // TODO: using no-identity values (below) breaks the SVD for near-identity matrices. +static const float SSTAR = 0.0; // TODO: using no-identity values (below) breaks the SVD for near-identity matrices. +//static const float CSTAR = 0.923879532; // cos(pi/8) +//static const float SSTAR = 0.3826834323; // sin(p/8) // Threshold value static const float SVD_EPSILON = 1e-6; // Iteration counts for Jacobi Eigenanalysis and reciprocal square root functions, influence precision From cc9253ed2013f2f4d70e1c5021d30458dd41f95f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Mon, 27 Oct 2025 17:54:58 +0100 Subject: [PATCH 08/12] Release v0.2.0 --- CHANGELOG.md | 4 ++++ Cargo.toml | 2 +- README.md | 2 +- src/shapes.rs | 21 ++++++++++++++++++++- src/tensor.rs | 52 +++++++++++++++++++++++++++++++++++++++++++++++---- 5 files changed, 74 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e69de29..31dd2fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -0,0 +1,4 @@ +# v0.2.0 (27 Oct. 2025) +- Update to slang-hal 0.2. +- Make rank-1 tensors resizeable. 
+- Fix svd3.slang retuning incorrect results for near-identity matrices. \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index d8a02d8..a9ace03 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ name = "stensor" authors = ["Sébastien Crozet "] description = "Cross-platform GPU tensor library with Slang and Rust." repository = "https://github.com/dimforge/stensor" -version = "0.1.1" +version = "0.2.0" edition = "2024" license = "Apache-2.0" diff --git a/README.md b/README.md index fae9cf6..fcff46d 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ gpu". It aims (but it isn’t there yet) to expose linear algebra operations (in operations) as well as geometric types (quaternions, similarities, etc.) as Slang shaders and kernels. > **Warning** -**stensor** is still very incomplete and under heavy development and is lacking many features. +> **stensor** is still very incomplete and under heavy development and is lacking many features. See also the README of [slang-hal](https://github.com/dimforge/slang-hal/blob/main/README.md) for information on supported platforms. diff --git a/src/shapes.rs b/src/shapes.rs index 47619ef..4e56e37 100644 --- a/src/shapes.rs +++ b/src/shapes.rs @@ -6,19 +6,25 @@ use std::collections::hash_map::Entry; use std::sync::Mutex; use wgpu::BufferUsages; +/// GGML dimension index mapping: converts between GGML and stensor dimension ordering. pub const GGML_IDS: [usize; 4] = [1, 0, 2, 3]; +/// GGML dimension index mapping (u32 version). pub const GGML_IDS_U32: [u32; 4] = [1, 0, 2, 3]; +/// Specifies the memory layout of matrices. #[derive(Copy, Clone, PartialEq, Eq, Default, Debug, Hash)] pub enum MatrixOrdering { + /// Column-major ordering: elements in the same column are contiguous in memory. #[default] ColumnMajor, + /// Row-major ordering: elements in the same row are contiguous in memory. RowMajor, // TODO: should we generalize this to a `MajorAxis(i)` where any // dimension of the tensor can be interpreted as the main one? } impl MatrixOrdering { + /// Returns the transposed matrix ordering. pub fn transpose(self) -> Self { match self { Self::ColumnMajor => Self::RowMajor, @@ -40,6 +46,7 @@ pub struct ViewShape { } impl ViewShape { + /// Creates a contiguous view shape with the given size and ordering. pub fn contiguous(size: [u32; 4], ordering: MatrixOrdering) -> Self { let stride = match ordering { MatrixOrdering::ColumnMajor => { @@ -52,19 +59,23 @@ impl ViewShape { Self { size, stride } } + /// Returns a transposed view of this shape. pub fn transpose(&self) -> Self { self.permute([1, 0, 2, 3]) } + /// Conditionally transposes the shape based on the `transpose` parameter. pub fn maybe_transpose(&self, transpose: bool) -> Self { if transpose { self.transpose() } else { *self } } + /// Permutes the dimensions according to GGML's dimension ordering convention. pub fn permute_ggml(&self, mut permutations: [usize; 4]) -> Self { permutations.swap(0, 1); self.permute(permutations.map(|i| GGML_IDS[i])) } + /// Permutes the dimensions according to the given permutation array. pub fn permute(&self, permutations: [usize; 4]) -> Self { // Check all the permutation indices are valid and without // duplicate. @@ -138,9 +149,10 @@ impl ViewShape { } } + /// Checks if each dimension of this shape is a multiple of the corresponding dimension in `of`. 
pub fn is_multiple_of(&self, of: Self) -> bool { for k in 0..4 { - if self.size[k] % of.size[k] != 0 { + if !self.size[k].is_multiple_of(of.size[k]) { return false; } } @@ -148,6 +160,7 @@ impl ViewShape { true } + /// Creates a view with the specified shape and strides within this shape. pub fn view(&self, shape: [u32; DIM2], stride: [Option; DIM2]) -> Self { assert!(DIM2 <= 4); @@ -225,10 +238,12 @@ impl ViewShape { } } + /// Checks if this shape contains zero elements. pub fn is_empty(&self) -> bool { self.len() == 0 } + /// Returns the total number of elements in this shape. pub fn len(&self) -> u64 { (self.size[0] * self.size[1] * self.size[2] * self.size[3]) as u64 } @@ -256,11 +271,13 @@ impl ViewShapeBuffers { } } + /// Clears temporary shape buffers and recycles them for reuse. pub fn clear_tmp(&mut self) { let mut recycled = self.recycled.lock().unwrap(); recycled.extend(self.tmp_buffers.drain().map(|(_, buf)| buf)); } + /// Stores a temporary shape buffer for the given shape, creating one if needed. pub fn put_tmp(&mut self, backend: &B, shape: ViewShape) -> Result<(), B::Error> { if self.contains(shape) { return Ok(()); @@ -293,10 +310,12 @@ impl ViewShapeBuffers { backend.init_buffer(&[shape], usage | BufferUsages::STORAGE) } + /// Checks if a buffer for the given shape exists (permanent or temporary). pub fn contains(&self, shape: ViewShape) -> bool { self.buffers.contains_key(&shape) || self.tmp_buffers.contains_key(&shape) } + /// Inserts or retrieves a mutable buffer for the given shape. pub fn insert( &mut self, backend: &B, diff --git a/src/tensor.rs b/src/tensor.rs index 2812c6c..88ccad2 100644 --- a/src/tensor.rs +++ b/src/tensor.rs @@ -59,6 +59,7 @@ impl TensorBuilder { self.shape.into_iter().map(|s| s as u64).product() } + /// Sets the matrix ordering for this tensor. pub fn ordering(mut self, ordering: MatrixOrdering) -> Self { self.ordering = ordering; self @@ -148,6 +149,7 @@ impl TensorBuilder { }) } + /// Builds this tensor with an array of encase-encoded values given for its initial value. pub fn build_encased( self, backend: &B, @@ -170,8 +172,11 @@ impl TensorBuilder { } } +/// Type alias for a vector stored on the GPU. pub type GpuVector = GpuTensor; +/// Type alias for a matrix stored on the GPU. pub type GpuMatrix = GpuTensor; +/// Type alias for a scalar stored on the GPU. pub type GpuScalar = GpuTensor; /// A tensor stored in the GPU. @@ -183,20 +188,24 @@ pub struct GpuTensor { ordering: MatrixOrdering, } +/// Type alias for a tensor stored on the WebGPU backend. pub type WgpuTensor = GpuTensor; #[cfg(feature = "cuda")] pub type CudaTensor = GpuTensor; impl GpuTensor { + /// Returns the matrix ordering of this tensor. pub fn ordering(&self) -> MatrixOrdering { self.ordering } + /// Returns a transposed version of this tensor. pub fn transposed(mut self) -> Self { self.transpose(); self } + /// Transposes this tensor in place. pub fn transpose(&mut self) { self.shape.swap(0, 1); self.ordering = self.ordering.transpose(); @@ -575,10 +584,12 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { self.view_shape.stride[GGML_IDS[i]] } + /// Returns a transposed view of this tensor. pub fn transposed(&self) -> Self { self.permute([1, 0, 2, 3]) } + /// Permutes the dimensions of this view according to the given permutation array. 
pub fn permute(&self, permutations: [usize; 4]) -> Self { Self { view_shape: self.view_shape.permute(permutations), @@ -587,6 +598,7 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { } } + /// Permutes the dimensions according to GGML's dimension ordering convention. pub fn permute_ggml(&self, permutations: [usize; 4]) -> Self { Self { view_shape: self.view_shape.permute_ggml(permutations), @@ -595,7 +607,9 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { } } - // Specify the ordering explicitly to avoid ambiguities if the original shape has 1 row and 1 col. + /// Reshapes this view with an explicit ordering to avoid ambiguities. + /// + /// This is useful when the original shape has 1 row and 1 column. pub fn reshape_with_ordering( &self, shape: [u32; DIM2], @@ -608,10 +622,12 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { self.view(0, shape4, view_shape.stride.map(Some)) } + /// Reshapes this view to the specified shape, preserving the matrix ordering. pub fn reshape(&self, shape: [u32; DIM2]) -> Self { self.view(0, shape, [None; DIM2]) } + /// Reshapes this view using GGML's dimension ordering convention. pub fn reshape_ggml(&self, mut shape: [u32; DIM2]) -> Self { shape.swap(0, 1); @@ -623,7 +639,9 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { } } - // Specify the ordering explicitly to avoid ambiguities if the original shape has 1 row and 1 col. + /// Reshapes this view using GGML's ordering with an explicit matrix ordering. + /// + /// This is useful to avoid ambiguities when the original shape has 1 row and 1 column. pub fn reshape_ggml_with_ordering( &self, mut shape: [u32; DIM2], @@ -633,6 +651,7 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { self.reshape_with_ordering(shape, ordering) } + /// Creates a view of a sub-tensor with the specified offset, shape, and optional strides. pub fn view( &self, mut offset: u32, @@ -656,6 +675,7 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { } } + /// Creates a view using GGML's dimension ordering convention. pub fn view_ggml( &self, offset: u32, @@ -667,6 +687,7 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { self.view(offset, shape, stride) } + /// Returns a view of the `matrix_id`-th matrix in this tensor. pub fn matrix(&self, matrix_id: u32) -> Self { let [nrows, ncols, nmats, ncubes] = self.view_shape.size; assert!(matrix_id < nmats); @@ -681,6 +702,7 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { } } + /// Returns a view containing `new_ncols` columns starting from `first_col`. pub fn columns(&self, first_col: u32, new_ncols: u32) -> Self { let [nrows, ncols, nmats, ncubes] = self.view_shape.size; assert!(first_col + new_ncols < ncols); @@ -694,10 +716,12 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { } } + /// Returns a view of the specified column. pub fn column(&self, col: u32) -> Self { self.columns(col, 1) } + /// Returns a view containing `new_nrows` rows starting from `first_row`. pub fn rows(&self, first_row: u32, new_nrows: u32) -> Self { let [nrows, ncols, nmats, ncubes] = self.view_shape.size; assert!(first_row + new_nrows < nrows); @@ -711,12 +735,14 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { } } + /// Returns a view of the specified row. pub fn row(&self, row: u32) -> Self { self.rows(row, 1) } } impl<'a, T: DeviceValue, B: Backend> GpuTensorViewMut<'a, T, B> { + /// Converts this mutable view into an immutable view. 
pub fn as_ref(&self) -> GpuTensorView<'_, T, B> { GpuTensorView { view_shape: self.view_shape, @@ -765,10 +791,12 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorViewMut<'a, T, B> { self.view_shape.len() } + /// Returns a transposed mutable view of this tensor. pub fn transposed(&mut self) -> GpuTensorViewMut<'_, T, B> { self.permute([1, 0, 2, 3]) } + /// Permutes the dimensions of this mutable view according to the given permutation array. pub fn permute(&mut self, permutations: [usize; 4]) -> GpuTensorViewMut<'_, T, B> { GpuTensorViewMut { view_shape: self.view_shape.permute(permutations), @@ -777,10 +805,12 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorViewMut<'a, T, B> { } } + /// Reshapes this mutable view to the specified shape. pub fn reshape(&mut self, shape: [u32; DIM2]) -> GpuTensorViewMut<'_, T, B> { self.view(0, shape, [None; DIM2]) } + /// Creates a mutable view of a sub-tensor with the specified offset, shape, and optional strides. pub fn view( &mut self, mut offset: u32, @@ -804,6 +834,7 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorViewMut<'a, T, B> { } } + /// Returns a mutable view of the `matrix_id`-th matrix in this tensor. pub fn matrix(&mut self, matrix_id: u32) -> GpuTensorViewMut<'_, T, B> { let [nrows, ncols, nmats, ncubes] = self.view_shape.size; assert!(matrix_id < nmats); @@ -818,6 +849,7 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorViewMut<'a, T, B> { } } + /// Returns a mutable view containing `new_ncols` columns starting from `first_col`. pub fn columns(&mut self, first_col: u32, new_ncols: u32) -> GpuTensorViewMut<'_, T, B> { let [nrows, ncols, nmats, ncubes] = self.view_shape.size; assert!(first_col + new_ncols < ncols); @@ -831,10 +863,12 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorViewMut<'a, T, B> { } } + /// Returns a mutable view of the specified column. pub fn column(&mut self, col: u32) -> GpuTensorViewMut<'_, T, B> { self.columns(col, 1) } + /// Returns a mutable view containing `new_nrows` rows starting from `first_row`. pub fn rows(&mut self, first_row: u32, new_nrows: u32) -> GpuTensorViewMut<'_, T, B> { let [nrows, ncols, nmats, ncubes] = self.view_shape.size; assert!(first_row + new_nrows < nrows); @@ -848,29 +882,35 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorViewMut<'a, T, B> { } } + /// Returns a mutable view of the specified row. pub fn row(&mut self, row: u32) -> GpuTensorViewMut<'_, T, B> { self.rows(row, 1) } } impl GpuTensor { + /// Reshapes this tensor to the specified shape. pub fn reshape(&self, shape: [u32; DIM2]) -> GpuTensorView<'_, T, B> { self.as_view().reshape_with_ordering(shape, self.ordering) } + /// Reshapes this tensor using GGML's dimension ordering convention. pub fn reshape_ggml(&self, shape: [u32; DIM2]) -> GpuTensorView<'_, T, B> { self.as_view() .reshape_ggml_with_ordering(shape, self.ordering) } + /// Permutes the dimensions of this tensor according to the given permutation array. pub fn permute(&self, permutations: [usize; 4]) -> GpuTensorView<'_, T, B> { self.as_view().permute(permutations) } + /// Permutes the dimensions according to GGML's dimension ordering convention. pub fn permute_ggml(&self, permutations: [usize; 4]) -> GpuTensorView<'_, T, B> { self.as_view().permute_ggml(permutations) } + /// Creates a view of a sub-tensor with the specified offset, shape, and optional strides. pub fn view( &self, offset: u32, @@ -880,6 +920,7 @@ impl GpuTensor { self.as_view().view(offset, shape, stride) } + /// Creates a view using GGML's dimension ordering convention. 
pub fn view_ggml( &self, offset: u32, @@ -894,14 +935,17 @@ impl GpuTensor { self.as_view().column(i) } + /// Returns a view containing `ncols` columns starting from `first_col`. pub fn columns(&self, first_col: u32, ncols: u32) -> GpuTensorView<'_, T, B> { self.as_view().columns(first_col, ncols) } + /// Returns a view of the specified row. pub fn row(&self, i: u32) -> GpuTensorView<'_, T, B> { self.as_view().row(i) } + /// Returns a view containing `nrows` rows starting from `first_row`. pub fn rows(&self, first_row: u32, nrows: u32) -> GpuTensorView<'_, T, B> { self.as_view().rows(first_row, nrows) } @@ -1099,7 +1143,7 @@ macro_rules! append_and_remove( /// /// Panics if the tensor isn’t a vector. The tensor is a vector if: /// - It is a row-major tensor and is made of a single row. Its size is `[1, *, 1, 1]` (where - /// `*` is any non-zero positive integer). + /// `*` is any non-zero positive integer). /// - It is a column-major tensor and its size is made of a single column. Its size is /// `[*, 1, 1, 1]` (where `*` is any non-zero positive integer). /// @@ -1149,7 +1193,7 @@ macro_rules! append_and_remove( /// /// Panics if the tensor isn’t a vector. The tensor is a vector if: /// - It is a row-major tensor and is made of a single row. Its size is `[1, *, 1, 1]` (where - /// `*` is any non-zero positive integer). + /// `*` is any non-zero positive integer). /// - It is a column-major tensor and its size is made of a single column. Its size is /// `[*, 1, 1, 1]` (where `*` is any non-zero positive integer). /// From ce3935732c7c103fbcaa8155433bede21c72f3ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Mon, 27 Oct 2025 17:55:55 +0100 Subject: [PATCH 09/12] chore: cargo fmt --- examples/gemm_bench.rs | 4 ++-- src/tensor.rs | 47 +++++++++++++++++++++++++++++------------- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/examples/gemm_bench.rs b/examples/gemm_bench.rs index 12954bc..2ca5225 100644 --- a/examples/gemm_bench.rs +++ b/examples/gemm_bench.rs @@ -5,9 +5,9 @@ use nalgebra::DMatrix; use slang_hal::Shader; use slang_hal::backend::WebGpu; use slang_hal::backend::{Backend, Encoder}; -use slang_hal::shapes::ViewShapeBuffers; -use slang_hal::tensor::{GpuTensor, TensorBuilder}; use stensor::linalg::{Gemm, GemmVariant}; +use stensor::shapes::ViewShapeBuffers; +use stensor::tensor::GpuTensor; use wgpu::{BufferUsages, Features, Limits}; #[async_std::main] diff --git a/src/tensor.rs b/src/tensor.rs index 88ccad2..22e51ce 100644 --- a/src/tensor.rs +++ b/src/tensor.rs @@ -4,7 +4,7 @@ // TODO: feels like this should be in stensor instead of slang-hal use crate::shapes::{GGML_IDS, MatrixOrdering, ViewShape}; -use bytemuck::{NoUninit}; +use bytemuck::NoUninit; use encase::ShaderType; use nalgebra::{Dim, IsContiguous, Matrix, Storage}; use slang_hal::backend::{Backend, Buffer, DeviceValue, EncaseType, Encoder, ShaderBinding}; @@ -229,14 +229,18 @@ impl GpuTensor { /// The maximum number of elements this tensor can hold without needing a resize of the /// underlying GPU buffer. pub fn capacity(&self) -> u64 - where T: NoUninit { + where + T: NoUninit, + { self.buffer.len() as u64 } /// The maximum number of elements this tensor can hold without needing a resize of the /// underlying GPU buffer. 
pub fn capacity_encased(&self) -> u64 - where T: EncaseType { + where + T: EncaseType, + { self.buffer.len_encased() as u64 } @@ -531,7 +535,9 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorView<'a, T, B> { /// /// If it matches, returns the tensor's matrix ordering. pub fn is_entire_tensor(&self) -> Option - where T: NoUninit { + where + T: NoUninit, + { if self.buffer.len() == self.len() as usize && self.offset == 0 { self.is_contiguous() } else { @@ -762,7 +768,9 @@ impl<'a, T: DeviceValue, B: Backend> GpuTensorViewMut<'a, T, B> { /// /// If it matches, returns the tensor's matrix ordering. pub fn is_entire_tensor(&self) -> Option - where T: NoUninit { + where + T: NoUninit, + { self.as_ref().is_entire_tensor() } @@ -1023,11 +1031,7 @@ impl GpuTensor { /// # Safety /// /// The returned buffer must be initialized before being read from. - pub fn vector_uninit( - backend: &B, - len: u32, - usage: BufferUsages, - ) -> Result + pub fn vector_uninit(backend: &B, len: u32, usage: BufferUsages) -> Result where T: DeviceValue + NoUninit, { @@ -1136,7 +1140,6 @@ impl<'b, B: Backend, T: DeviceValue> ShaderArgs<'b, B> for GpuTensor { } } - macro_rules! append_and_remove( ($append: ident, $shift_remove: ident, $TraitBound: ident, $capacity: ident, $copy_buffer_to_buffer: ident, $uninit_buffer: ident, $write_buffer: ident) => { /// Append the `data` elements at the end of this tensor if it is a vector. @@ -1277,6 +1280,22 @@ macro_rules! append_and_remove( ); impl GpuTensor { - append_and_remove!(append, shift_remove, NoUninit, capacity, copy_buffer_to_buffer, uninit_buffer, write_buffer); - append_and_remove!(append_encased, shift_remove_encased, EncaseType, capacity_encased, copy_buffer_to_buffer_encased, uninit_buffer_encased, write_buffer_encased); -} \ No newline at end of file + append_and_remove!( + append, + shift_remove, + NoUninit, + capacity, + copy_buffer_to_buffer, + uninit_buffer, + write_buffer + ); + append_and_remove!( + append_encased, + shift_remove_encased, + EncaseType, + capacity_encased, + copy_buffer_to_buffer_encased, + uninit_buffer_encased, + write_buffer_encased + ); +} From ad4dce8c7a846d918e541c41f85703776132bfe0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Mon, 27 Oct 2025 18:02:26 +0100 Subject: [PATCH 10/12] fix clippy & docs --- README.md | 2 +- examples/gemm_bench.rs | 3 +-- src/linalg/contiguous.rs | 3 ++- src/linalg/gemm.rs | 3 ++- src/linalg/gemv.rs | 3 ++- src/linalg/op_assign.rs | 3 ++- src/linalg/reduce.rs | 3 ++- src/tensor.rs | 4 +--- 8 files changed, 13 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index fcff46d..b06e227 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ supported platforms. ### Using Slang In order to compile and run any slang project, be sure to define the `SLANG_DIR` environment variable: -1. Download the Slang compiler libraries for your platform: https://github.com/shader-slang/slang/releases/tag/v2025.16 +1. Download the Slang compiler libraries for your platform: 2. Unzip the downloaded directory, and use its path as value to the `SLANG_DIR` environment variable: `SLANG_DIR=/path/to/slang`. Note that the variable must point to the root of the slang installation (i.e. the directory that contains `bin` and `lib`). We recommend adding that as a system-wide environment variables so that it also becomes available to your IDE. 
\ No newline at end of file diff --git a/examples/gemm_bench.rs b/examples/gemm_bench.rs index 2ca5225..b7cbd46 100644 --- a/examples/gemm_bench.rs +++ b/examples/gemm_bench.rs @@ -1,4 +1,3 @@ -use approx::assert_relative_eq; use indexmap::IndexMap; use minislang::SlangCompiler; use nalgebra::DMatrix; @@ -105,7 +104,7 @@ async fn run_gemm( drop(pass); // Ensure the pass is ended before the encoder is borrowed again. backend.submit(encoder)?; - backend.synchronize(); + backend.synchronize()?; timing[i] = t0.elapsed().as_secs_f32(); backend .slow_read_buffer(result.buffer(), gpu_result.as_mut_slice()) diff --git a/src/linalg/contiguous.rs b/src/linalg/contiguous.rs index d3bafaa..88d3c84 100644 --- a/src/linalg/contiguous.rs +++ b/src/linalg/contiguous.rs @@ -85,7 +85,8 @@ mod test { } async fn gpu_contiguous_generic(backend: impl Backend) { - let compiler = SlangCompiler::new(vec!["../../crates/stensor/shaders".into()]); + let mut compiler = SlangCompiler::new(vec![]); + crate::register_shaders(&mut compiler); let contiguous = super::Contiguous::from_backend(&backend, &compiler).unwrap(); let mut shapes = ViewShapeBuffers::new(&backend); diff --git a/src/linalg/gemm.rs b/src/linalg/gemm.rs index fa2f2b5..4add4e4 100644 --- a/src/linalg/gemm.rs +++ b/src/linalg/gemm.rs @@ -237,7 +237,8 @@ mod test { } async fn gpu_gemm_generic(backend: impl Backend) { - let compiler = SlangCompiler::new(vec!["../../crates/stensor/shaders".into()]); + let mut compiler = SlangCompiler::new(vec![]); + crate::register_shaders(&mut compiler); let gemm = super::Gemm::from_backend(&backend, &compiler).unwrap(); let mut shapes = ViewShapeBuffers::new(&backend); diff --git a/src/linalg/gemv.rs b/src/linalg/gemv.rs index bc4df73..4705782 100644 --- a/src/linalg/gemv.rs +++ b/src/linalg/gemv.rs @@ -349,7 +349,8 @@ mod test { } async fn gpu_gemv_generic(backend: impl Backend) { - let compiler = SlangCompiler::new(vec!["../../crates/stensor/shaders".into()]); + let mut compiler = SlangCompiler::new(vec![]); + crate::register_shaders(&mut compiler); let gemv = super::Gemv::from_backend(&backend, &compiler).unwrap(); let mut shapes = ViewShapeBuffers::new(&backend); diff --git a/src/linalg/op_assign.rs b/src/linalg/op_assign.rs index 97cb00d..55f0db2 100644 --- a/src/linalg/op_assign.rs +++ b/src/linalg/op_assign.rs @@ -187,7 +187,8 @@ mod test { OpAssignVariant::Div, OpAssignVariant::Copy, ]; - let compiler = SlangCompiler::new(vec!["../../crates/stensor/shaders".into()]); + let mut compiler = SlangCompiler::new(vec![]); + crate::register_shaders(&mut compiler); let op_assign = super::OpAssign::from_backend(&backend, &compiler).unwrap(); diff --git a/src/linalg/reduce.rs b/src/linalg/reduce.rs index e39de6a..ae5b908 100644 --- a/src/linalg/reduce.rs +++ b/src/linalg/reduce.rs @@ -93,7 +93,8 @@ mod test { ReduceVariant::Prod, ReduceVariant::SqNorm, ]; - let compiler = SlangCompiler::new(vec!["../../crates/stensor/shaders".into()]); + let mut compiler = SlangCompiler::new(vec![]); + crate::register_shaders(&mut compiler); let reduce = super::Reduce::from_backend(&backend, &compiler).unwrap(); diff --git a/src/tensor.rs b/src/tensor.rs index 22e51ce..c94262c 100644 --- a/src/tensor.rs +++ b/src/tensor.rs @@ -1040,7 +1040,7 @@ impl GpuTensor { /// Allocates a new vector on the gpu initialized from `vector`. /// - /// If `T` does not implement `NoUninit`, use [`GpuMatrix::encase`] instead. + /// If `T` does not implement `NoUninit`, use [`GpuTensor::vector_encased`] instead. 
pub fn vector( backend: &B, vector: impl AsRef<[T]>, @@ -1069,8 +1069,6 @@ impl GpuTensor { } /// Allocates a new vector on the gpu initialized from `vector`. - /// - /// If `T` does not implement `NoUninit`, use [`GpuMatrix::encase`] instead. pub fn vector_encased( backend: &B, vector: impl AsRef<[T]>, From 30c25eeb6a9f40fa39f07f3506572e2c8eacb5d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Mon, 27 Oct 2025 18:06:34 +0100 Subject: [PATCH 11/12] fix tests --- shaders/stensor/linalg/gemm.slang | 2 -- src/linalg/contiguous.rs | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/shaders/stensor/linalg/gemm.slang b/shaders/stensor/linalg/gemm.slang index a69c92c..315fbd4 100644 --- a/shaders/stensor/linalg/gemm.slang +++ b/shaders/stensor/linalg/gemm.slang @@ -26,8 +26,6 @@ void gemm_fast( ) { let local_id = local_id.y; - out[0] = 1.0; - for (var k = 0u; k < shape_m2.ncols; k += 4u) { var sum = float4x4(0.0); diff --git a/src/linalg/contiguous.rs b/src/linalg/contiguous.rs index 88d3c84..3cd66bc 100644 --- a/src/linalg/contiguous.rs +++ b/src/linalg/contiguous.rs @@ -112,8 +112,8 @@ mod test { &backend, &mut shapes, &mut pass, - gpu_tensor.as_view().transposed(), &gpu_output, + gpu_tensor.as_view().transposed(), ) .unwrap(); drop(pass); // Ensure the pass is ended before the encoder is borrowed again. From dca91bcba7136f97427dd7f92de3311f2e5b20bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Crozet?= Date: Mon, 27 Oct 2025 18:09:46 +0100 Subject: [PATCH 12/12] fix CI --- .github/workflows/ci.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index fae9c22..f377112 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -145,7 +145,11 @@ jobs: uses: dtolnay/rust-toolchain@stable - name: Install dependencies - run: sudo apt-get update; sudo apt-get install --no-install-recommends build-essential curl wget file libssl-dev + run: | + sudo apt-get update + sudo apt-get install --no-install-recommends -y \ + build-essential curl wget file libssl-dev \ + libegl1-mesa-dev libgl1-mesa-dri libxcb-xfixes0-dev mesa-vulkan-drivers - name: Retrieve Cache for Slang uses: actions/cache/restore@v4 @@ -159,4 +163,4 @@ jobs: sweep-cache: true - name: Run Cargo Tests run: | - SLANG_DIR=$SLANG_DIR cargo test --verbose \ No newline at end of file + SLANG_DIR=$SLANG_DIR LIBGL_ALWAYS_SOFTWARE=1 cargo test --verbose \ No newline at end of file
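
---
Editor's note — a minimal usage sketch of the rank-1 resizing API introduced in PATCH 01 (and relaxed from `Pod` to `NoUninit` in PATCH 03). This is an illustration only, not part of the patch series: the `demo_resize` helper, the `f32` element type, and the exact usage flags are assumptions; the backend is whatever `slang_hal::backend::Backend` implementation the caller already has (e.g. `WebGpu`).

    use slang_hal::backend::Backend;
    use stensor::tensor::GpuVector;
    use wgpu::BufferUsages;

    /// Hypothetical helper showing `append` / `shift_remove` on a rank-1 tensor.
    fn demo_resize<B: Backend>(backend: &B) -> Result<(), B::Error> {
        // `shift_remove` (and `append` when it has to grow) perform buffer-to-buffer
        // copies, so the tensor must be created with COPY_SRC | COPY_DST in addition
        // to STORAGE.
        let usage = BufferUsages::STORAGE | BufferUsages::COPY_SRC | BufferUsages::COPY_DST;
        let mut v: GpuVector<f32, B> = GpuVector::vector(backend, [1.0, 2.0, 3.0], usage)?;

        // Appending grows the shape along the vector dimension; the underlying buffer
        // is only reallocated (to the next power of two) when the capacity is exceeded.
        v.append(backend, &[4.0, 5.0])?;
        assert_eq!(v.len(), 5);

        // Remove elements 1..3, shifting the tail back to fill the gap. The capacity
        // is unchanged; only the tensor's shape shrinks.
        let removed = v.shift_remove(backend, 1..3)?;
        assert_eq!(removed, 2);
        assert_eq!(v.len(), 3);
        Ok(())
    }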