diff --git a/.gitignore b/.gitignore index 8f0ec505d..b9649e720 100644 --- a/.gitignore +++ b/.gitignore @@ -452,6 +452,18 @@ SERIAL/Sparse/sparse SERIAL/Stencil/stencil SERIAL/Synch_p2p/p2p SERIAL/Transpose/transpose +SWIFT/nstream +SWIFT/transpose +SWIFT/dgemm +SWIFT/dgemm-accelerate +SWIFT/p2p +SWIFT/nstream-metal +SWIFT/transpose-metal +SWIFT/dgemm-metal +SWIFT/p2p-metal +SWIFT/xgemm +SWIFT/xgemm-accelerate +SWIFT/xgemm-metal dgemm-vector.dSYM dgemm.dSYM nstream-opencl.dSYM diff --git a/SWIFT/Makefile b/SWIFT/Makefile new file mode 100644 index 000000000..aab91823b --- /dev/null +++ b/SWIFT/Makefile @@ -0,0 +1,172 @@ +# Swift Parallel Research Kernels Makefile +# +# This makefile builds Swift implementations of PRK benchmarks +# + +SWIFT := swift +SWIFTC := swiftc + +# Compiler flags for optimization +SWIFTFLAGS := -O -whole-module-optimization + +# Default target +all: nstream transpose dgemm dgemm-accelerate p2p nstream-metal transpose-metal dgemm-metal p2p-metal xgemm xgemm-accelerate xgemm-metal + +# Build nstream benchmark +nstream: nstream.swift + $(SWIFTC) $(SWIFTFLAGS) -o nstream nstream.swift + +# Build transpose benchmark +transpose: transpose.swift + $(SWIFTC) $(SWIFTFLAGS) -o transpose transpose.swift + +# Build dgemm benchmark +dgemm: dgemm.swift + $(SWIFTC) $(SWIFTFLAGS) -o dgemm dgemm.swift + +# Build dgemm-accelerate benchmark (using Accelerate framework) +dgemm-accelerate: dgemm-accelerate.swift + $(SWIFTC) $(SWIFTFLAGS) -o dgemm-accelerate dgemm-accelerate.swift + +# Build p2p benchmark +p2p: p2p.swift + $(SWIFTC) $(SWIFTFLAGS) -o p2p p2p.swift + +# Build Metal GPU benchmarks +nstream-metal: nstream-metal.swift + $(SWIFTC) $(SWIFTFLAGS) -o nstream-metal nstream-metal.swift + +transpose-metal: transpose-metal.swift + $(SWIFTC) $(SWIFTFLAGS) -o transpose-metal transpose-metal.swift + +dgemm-metal: dgemm-metal.swift + $(SWIFTC) $(SWIFTFLAGS) -o dgemm-metal dgemm-metal.swift + +p2p-metal: p2p-metal.swift + $(SWIFTC) $(SWIFTFLAGS) -o p2p-metal p2p-metal.swift + +# Build xgemm multi-precision benchmarks +xgemm: xgemm.swift + $(SWIFTC) $(SWIFTFLAGS) -o xgemm xgemm.swift + +xgemm-accelerate: xgemm-accelerate.swift + $(SWIFTC) $(SWIFTFLAGS) -o xgemm-accelerate xgemm-accelerate.swift + +xgemm-metal: xgemm-metal.swift + $(SWIFTC) $(SWIFTFLAGS) -o xgemm-metal xgemm-metal.swift + +# Run tests with default parameters +test: test-nstream test-transpose test-dgemm test-dgemm-accelerate test-p2p test-xgemm test-xgemm-accelerate + +# Run xgemm tests +test-xgemm: xgemm + @echo "Testing xgemm..." + ./xgemm 10 500 + +test-xgemm-accelerate: xgemm-accelerate + @echo "Testing xgemm-accelerate..." + ./xgemm-accelerate 10 500 + +test-xgemm-metal: xgemm-metal + @echo "Testing xgemm-metal..." + ./xgemm-metal 10 500 + +# Run Metal GPU tests +test-metal: test-nstream-metal test-transpose-metal test-dgemm-metal test-p2p-metal test-xgemm-metal + +test-nstream: nstream + @echo "Testing nstream..." + ./nstream 10 1000000 + +test-transpose: transpose + @echo "Testing transpose..." + ./transpose 10 500 + +test-dgemm: dgemm + @echo "Testing dgemm..." + ./dgemm 10 500 + +test-dgemm-accelerate: dgemm-accelerate + @echo "Testing dgemm-accelerate..." + ./dgemm-accelerate 10 500 + +test-p2p: p2p + @echo "Testing p2p..." + ./p2p 10 100 100 + +test-nstream-metal: nstream-metal + @echo "Testing nstream-metal..." + ./nstream-metal 10 1000000 + +test-transpose-metal: transpose-metal + @echo "Testing transpose-metal..." + ./transpose-metal 10 500 + +test-dgemm-metal: dgemm-metal + @echo "Testing dgemm-metal..." 
+ ./dgemm-metal 10 500 + +test-p2p-metal: p2p-metal + @echo "Testing p2p-metal..." + ./p2p-metal 10 100 100 + +# Run benchmarks with larger parameters +benchmark: benchmark-nstream benchmark-transpose benchmark-dgemm benchmark-dgemm-accelerate benchmark-p2p + +benchmark-nstream: nstream + @echo "Benchmarking nstream..." + ./nstream 10 100000000 + +benchmark-transpose: transpose + @echo "Benchmarking transpose..." + ./transpose 10 2000 + +benchmark-dgemm: dgemm + @echo "Benchmarking dgemm..." + ./dgemm 10 1000 + +benchmark-dgemm-accelerate: dgemm-accelerate + @echo "Benchmarking dgemm-accelerate..." + ./dgemm-accelerate 10 1000 + +benchmark-p2p: p2p + @echo "Benchmarking p2p..." + ./p2p 10 2000 2000 + +# Clean build artifacts +clean: + rm -f nstream transpose dgemm dgemm-accelerate p2p nstream-metal transpose-metal dgemm-metal p2p-metal xgemm xgemm-accelerate xgemm-metal + +# Install Swift (requires Homebrew) +install-swift: + @echo "Installing Swift via Homebrew..." + brew install swift + +# Check Swift installation +check-swift: + @echo "Checking Swift installation..." + @which swift >/dev/null 2>&1 && echo "Swift found at: $$(which swift)" || echo "Swift not found" + @swift --version 2>/dev/null || echo "Swift version check failed" + +# Help target +help: + @echo "Swift PRK Makefile" + @echo "" + @echo "Targets:" + @echo " all - Build all benchmarks" + @echo " nstream - Build nstream benchmark" + @echo " transpose - Build transpose benchmark" + @echo " test - Run tests with small parameters" + @echo " benchmark - Run benchmarks with larger parameters" + @echo " clean - Remove build artifacts" + @echo " install-swift - Install Swift via Homebrew" + @echo " check-swift - Check Swift installation" + @echo " help - Show this help message" + @echo "" + @echo "Usage examples:" + @echo " make all" + @echo " make test" + @echo " ./nstream 10 1000000" + @echo " ./transpose 10 1000" + +.PHONY: all test benchmark clean install-swift check-swift help test-nstream test-transpose test-dgemm test-dgemm-accelerate test-p2p test-xgemm test-xgemm-accelerate test-xgemm-metal benchmark-nstream benchmark-transpose benchmark-dgemm benchmark-dgemm-accelerate benchmark-p2p test-metal diff --git a/SWIFT/README.md b/SWIFT/README.md new file mode 100644 index 000000000..7b5fb1918 --- /dev/null +++ b/SWIFT/README.md @@ -0,0 +1,260 @@ +# Swift Parallel Research Kernels (PRK) + +This directory contains Swift implementations of the Parallel Research Kernels benchmarks, specifically the `nstream` and `transpose` kernels. + +## What is Swift? + +Swift is a powerful and modern programming language developed by Apple. Originally created for iOS and macOS development, Swift is now available on multiple platforms including Linux. It combines the performance of compiled languages with the expressiveness and safety of modern programming languages. + +## Prerequisites + +- macOS 10.15 (Catalina) or later +- Xcode 11.0 or later (for full Swift toolchain) +- OR Swift toolchain installed via Homebrew + +## Installation on macOS + +### Option 1: Install Xcode (Recommended) + +The easiest way to get Swift on macOS is through Xcode: + +1. **Install Xcode from App Store** + ```bash + # Open App Store and search for "Xcode" + # Or use the command line: + mas install 497799835 # Xcode + ``` + +2. **Install Xcode Command Line Tools** + ```bash + xcode-select --install + ``` + +3. 
**Verify Swift installation** + ```bash + swift --version + swiftc --version + ``` + +### Option 2: Install Swift via Homebrew + +If you prefer a lighter installation without the full Xcode: + +```bash +# Install Swift +brew install swift + +# Verify installation +swift --version +swiftc --version +``` + +### Option 3: Download Swift Toolchain + +Download the official Swift toolchain from [swift.org](https://swift.org/download/): + +1. Download the `.pkg` file for macOS +2. Run the installer +3. Add Swift to your PATH: + ```bash + export PATH="/Library/Developer/Toolchains/swift-latest.xctoolchain/usr/bin:$PATH" + ``` + +## Building the Benchmarks + +### Using Make + +```bash +# Navigate to the SWIFT directory +cd PRK/SWIFT + +# Check Swift installation +make check-swift + +# Build all benchmarks +make all + +# Or build individual benchmarks +make nstream +make transpose +``` + +### Manual Compilation + +```bash +# Compile with optimization +swiftc -O -whole-module-optimization -o nstream nstream.swift +swiftc -O -whole-module-optimization -o transpose transpose.swift + +# Compile for debugging +swiftc -g -o nstream nstream.swift +swiftc -g -o transpose transpose.swift +``` + +## Running the Benchmarks + +### NSTREAM (Stream Triad) + +The nstream benchmark measures memory bandwidth using the stream triad operation: `A = B + scalar * C` + +```bash +# Syntax: ./nstream + +# Quick test +./nstream 10 1000000 + +# Longer benchmark +./nstream 100 10000000 +``` + +**Example Output:** +``` +Parallel Research Kernels +Swift STREAM triad: A = B + scalar * C +Number of iterations = 10 +Vector length = 1000000 +Solution validates +Rate (MB/s): 8543.210987 Avg time (s): 0.003756 +``` + +### Transpose + +The transpose benchmark measures the time for matrix transpose: `B = A^T` + +```bash +# Syntax: ./transpose + +# Quick test +./transpose 10 1000 + +# Longer benchmark +./transpose 100 2000 +``` + +**Example Output:** +``` +Parallel Research Kernels +Swift Matrix transpose: B = A^T +Number of iterations = 10 +Matrix order = 1000 +Solution validates +Rate (MB/s): 2456.789123 Avg time (s): 0.006543 +``` + +## Using the Makefile + +The provided Makefile includes several convenient targets: + +```bash +# Build everything +make all + +# Run quick tests +make test + +# Run performance benchmarks +make benchmark + +# Clean build artifacts +make clean + +# Check Swift installation +make check-swift + +# Install Swift via Homebrew +make install-swift + +# Show help +make help +``` + +## Performance Characteristics + +### NSTREAM Performance Factors + +- **Vector Length**: Larger vectors generally show higher bandwidth (until memory limits) +- **Iterations**: More iterations provide more accurate timing measurements +- **Memory Hierarchy**: Performance depends on L1/L2/L3 cache sizes and main memory bandwidth + +### Transpose Performance Factors + +- **Matrix Size**: Larger matrices may show cache effects +- **Memory Access Pattern**: Transpose involves non-contiguous memory access +- **Cache Blocking**: For very large matrices, tiled algorithms perform better + +## Troubleshooting + +### Common Issues + +1. **"swift: command not found"** + ```bash + # Check if Swift is installed + which swift + + # Install Swift via Homebrew + brew install swift + + # Or install Xcode from App Store + ``` + +2. **"No such module 'Foundation'"** + - This usually means Swift is not properly installed + - Reinstall Swift or Xcode + +3. **Permission denied when running executables** + ```bash + chmod +x nstream transpose + ``` + +4. 
**Poor performance compared to C++** + - Ensure you're compiling with optimization flags: `-O -whole-module-optimization` + - Swift performance is generally competitive with C++ when optimized + +### Performance Tuning + +1. **Compile with optimization**: + ```bash + swiftc -O -whole-module-optimization -o nstream nstream.swift + ``` + +2. **For maximum performance**, also try: + ```bash + swiftc -O -whole-module-optimization -Xcc -O3 -o nstream nstream.swift + ``` + +3. **Profile your code**: + ```bash + # Compile with debug info for profiling + swiftc -O -g -o nstream nstream.swift + + # Use Instruments (macOS) for detailed profiling + instruments -t "Time Profiler" ./nstream 100 10000000 + ``` + +## Implementation Notes + +### Design Decisions + +1. **Array vs UnsafePointer**: Used Swift Arrays for safety and ease of use +2. **CFAbsoluteTime**: Used for high-precision timing measurements +3. **Functional Style**: Leveraged Swift's functional programming features where appropriate +4. **Memory Management**: Relied on Swift's automatic reference counting (ARC) + +### Comparison with Other Languages + +- **vs C++**: Swift provides similar performance with better safety guarantees +- **vs Python**: Swift is significantly faster, closer to C++ performance levels +- **vs Java**: Swift typically shows better performance and lower memory overhead + +## Further Reading + +- [Swift Programming Language Guide](https://swift.org/documentation/) +- [Swift Performance Tips](https://github.com/apple/swift/blob/main/docs/OptimizationTips.rst) +- [Parallel Research Kernels Project](https://github.com/ParRes/Kernels) + +## Files + +- `nstream.swift`: Swift implementation of the STREAM triad benchmark +- `transpose.swift`: Swift implementation of the matrix transpose benchmark +- `Makefile`: Build system for compiling and testing +- `README.md`: This documentation file diff --git a/SWIFT/dgemm-accelerate.swift b/SWIFT/dgemm-accelerate.swift new file mode 100644 index 000000000..15f5f943b --- /dev/null +++ b/SWIFT/dgemm-accelerate.swift @@ -0,0 +1,165 @@ +/// +/// Copyright (c) 2025, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: dgemm-accelerate +/// +/// PURPOSE: This program tests the efficiency with which a dense matrix +/// dense multiplication is carried out using Apple's Accelerate +/// framework BLAS implementation +/// +/// USAGE: The program takes as input the matrix order, +/// the number of times the matrix-matrix multiplication +/// is carried out. +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to Swift with Accelerate by Cursor AI, 2025. +/// +////////////////////////////////////////////////////////////////////// + +import Foundation +import Accelerate + +func main() { + print("Parallel Research Kernels") + print("Swift Dense matrix-matrix multiplication: C += A x B (Accelerate BLAS)") + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + let arguments = CommandLine.arguments + + guard arguments.count == 3 else { + print("Usage: swift dgemm-accelerate.swift <# iterations> ") + exit(1) + } + + guard let iterations = Int(arguments[1]), iterations >= 1 else { + print("ERROR: iterations must be >= 1") + exit(1) + } + + guard let order = Int(arguments[2]), order > 0 else { + print("ERROR: matrix order must be positive") + exit(1) + } + + print("Number of iterations = \(iterations)") + print("Matrix order = \(order)") + + ////////////////////////////////////////////////////////////////////// + // Allocate space for matrices + ////////////////////////////////////////////////////////////////////// + + var A = Array(repeating: 0.0, count: order * order) + var B = Array(repeating: 0.0, count: order * order) + var C = Array(repeating: 0.0, count: order * order) + + // Initialize matrices A and B + for i in 0.. <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to Swift with Metal by Cursor AI, 2025. 
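For reference, the heart of the Accelerate-based `dgemm-accelerate` variant above is a BLAS matrix-matrix multiply. The sketch below shows one hedged way to express `C += A x B` for square, row-major matrices with `cblas_dgemm`; the helper name `blasMultiplyAccumulate` and the tiny identity-matrix check are illustrative, not code taken from the benchmark sources.

```swift
import Accelerate

/// Hedged sketch: C += A x B for square, row-major matrices via BLAS.
/// `blasMultiplyAccumulate` is an illustrative name, not from the benchmark.
func blasMultiplyAccumulate(A: [Double], B: [Double], C: inout [Double], order: Int) {
    let n = Int32(order)
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                n, n, n,     // M, N, K
                1.0,         // alpha
                A, n,        // A, lda
                B, n,        // B, ldb
                1.0,         // beta = 1 accumulates into C
                &C, n)       // C, ldc
}

// Tiny check: multiplying by the 2x2 identity accumulates B into C.
var C = [0.0, 0.0, 0.0, 0.0]
blasMultiplyAccumulate(A: [1, 0, 0, 1], B: [1, 2, 3, 4], C: &C, order: 2)
print(C)  // [1.0, 2.0, 3.0, 4.0]
```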
+/// +////////////////////////////////////////////////////////////////////// + +import Foundation +import Metal + +let metalSource = """ +#include +using namespace metal; + +kernel void dgemm_kernel(device float* A [[buffer(0)]], + device float* B [[buffer(1)]], + device float* C [[buffer(2)]], + constant uint& order [[buffer(3)]], + uint2 gid [[thread_position_in_grid]]) { + if (gid.x >= order || gid.y >= order) return; + + uint i = gid.y; + uint j = gid.x; + + float sum = 0.0; + for (uint k = 0; k < order; k++) { + sum += A[i * order + k] * B[k * order + j]; + } + C[i * order + j] += sum; +} +""" + +func main() { + print("Parallel Research Kernels") + print("Swift Dense matrix-matrix multiplication: C += A x B (Metal GPU)") + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + let arguments = CommandLine.arguments + + guard arguments.count == 3 else { + print("Usage: swift dgemm-metal.swift <# iterations> ") + exit(1) + } + + guard let iterations = Int(arguments[1]), iterations >= 1 else { + print("ERROR: iterations must be >= 1") + exit(1) + } + + guard let order = Int(arguments[2]), order > 0 else { + print("ERROR: matrix order must be positive") + exit(1) + } + + print("Number of iterations = \(iterations)") + print("Matrix order = \(order)") + + ////////////////////////////////////////////////////////////////////// + // Setup Metal + ////////////////////////////////////////////////////////////////////// + + guard let device = MTLCreateSystemDefaultDevice() else { + print("ERROR: Metal is not supported on this device") + exit(1) + } + + guard let commandQueue = device.makeCommandQueue() else { + print("ERROR: Failed to create command queue") + exit(1) + } + + guard let library = try? device.makeLibrary(source: metalSource, options: nil) else { + print("ERROR: Failed to create Metal library") + exit(1) + } + + guard let function = library.makeFunction(name: "dgemm_kernel") else { + print("ERROR: Failed to find kernel function") + exit(1) + } + + guard let computePipelineState = try? 
device.makeComputePipelineState(function: function) else { + print("ERROR: Failed to create compute pipeline state") + exit(1) + } + + ////////////////////////////////////////////////////////////////////// + // Allocate space for matrices + ////////////////////////////////////////////////////////////////////// + + var A = Array(repeating: Float(0.0), count: order * order) + var B = Array(repeating: Float(0.0), count: order * order) + let C = Array(repeating: Float(0.0), count: order * order) + + // Initialize matrices A and B + for i in 0...size, options: [.storageModeShared]) else { + print("ERROR: Failed to create buffer A") + exit(1) + } + + guard let bufferB = device.makeBuffer(bytes: B, length: order * order * MemoryLayout.size, options: [.storageModeShared]) else { + print("ERROR: Failed to create buffer B") + exit(1) + } + + guard let bufferC = device.makeBuffer(bytes: C, length: order * order * MemoryLayout.size, options: [.storageModeShared]) else { + print("ERROR: Failed to create buffer C") + exit(1) + } + + guard let bufferOrder = device.makeBuffer(bytes: &orderConstant, length: MemoryLayout.size, options: [.storageModeShared]) else { + print("ERROR: Failed to create order buffer") + exit(1) + } + + ////////////////////////////////////////////////////////////////////// + // Execute computation + ////////////////////////////////////////////////////////////////////// + + var startTime = 0.0 + + for iter in 0...iterations { + + // Start timer after warmup iteration + if iter == 1 { + startTime = CFAbsoluteTimeGetCurrent() + } + + // Create command buffer + guard let commandBuffer = commandQueue.makeCommandBuffer() else { + print("ERROR: Failed to create command buffer") + exit(1) + } + + guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else { + print("ERROR: Failed to create compute encoder") + exit(1) + } + + // Set compute pipeline and buffers + computeEncoder.setComputePipelineState(computePipelineState) + computeEncoder.setBuffer(bufferA, offset: 0, index: 0) + computeEncoder.setBuffer(bufferB, offset: 0, index: 1) + computeEncoder.setBuffer(bufferC, offset: 0, index: 2) + computeEncoder.setBuffer(bufferOrder, offset: 0, index: 3) + + // Calculate thread group sizes for 2D dispatch + let threadsPerThreadgroup = MTLSize(width: 16, height: 16, depth: 1) + let threadgroupsPerGrid = MTLSize( + width: (order + threadsPerThreadgroup.width - 1) / threadsPerThreadgroup.width, + height: (order + threadsPerThreadgroup.height - 1) / threadsPerThreadgroup.height, + depth: 1 + ) + + // Dispatch threads + computeEncoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup) + computeEncoder.endEncoding() + + // Execute + commandBuffer.commit() + commandBuffer.waitUntilCompleted() + } + + let dgemmTime = CFAbsoluteTimeGetCurrent() - startTime + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + // Calculate average time + let dgemmAve = dgemmTime / Double(iterations) + + // Copy results back from GPU + let resultPointer = bufferC.contents().bindMemory(to: Float.self, capacity: order * order) + let results = Array(UnsafeBufferPointer(start: resultPointer, count: order * order)) + + // Calculate checksum + let checksum = results.reduce(0.0) { Double($0) + Double($1) } + + // Calculate reference checksum + let refChecksum = 0.25 * Double(order * order * order) * Double(order - 1) * Double(order - 1) * 
Double(iterations + 1) + + let epsilon = 1.0e-6 // Relaxed for Float32 + if abs(checksum - refChecksum) / refChecksum < epsilon { + print("Solution validates") + let nflops = 2.0 * Double(order * order * order) + print("nflops: \(nflops)") + print(String(format: "Rate: %.6f Avg time (s): %.6f", 1.0e-6 * nflops / dgemmAve, dgemmAve)) + } else { + print("ERROR: Checksum = \(checksum), Reference checksum = \(refChecksum)") + print("ERROR: solution did not validate") + exit(1) + } +} + +main() diff --git a/SWIFT/dgemm.swift b/SWIFT/dgemm.swift new file mode 100644 index 000000000..a904d6922 --- /dev/null +++ b/SWIFT/dgemm.swift @@ -0,0 +1,146 @@ +/// +/// Copyright (c) 2025, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: dgemm +/// +/// PURPOSE: This program tests the efficiency with which a dense matrix +/// dense multiplication is carried out +/// +/// USAGE: The program takes as input the matrix order, +/// the number of times the matrix-matrix multiplication +/// is carried out. +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to Swift by Cursor AI, 2025. 
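The plain Swift variant times a straightforward triple loop. Below is a minimal hedged sketch of that kernel using an i-k-j loop order, so the innermost loop streams through contiguous rows of B and C; the function name, the small driver, and the A(i,j) = B(i,j) = j initialization (consistent with the reference checksum used by the Metal variant above) are illustrative assumptions rather than the file's exact code.

```swift
/// Hedged sketch of the plain-Swift kernel this benchmark times:
/// C += A x B with an i-k-j loop order, so the innermost loop walks
/// contiguous rows of B and C in row-major storage.
func matmulAccumulate(A: [Double], B: [Double], C: inout [Double], order: Int) {
    for i in 0..<order {
        for k in 0..<order {
            let aik = A[i * order + k]
            for j in 0..<order {
                C[i * order + j] += aik * B[k * order + j]
            }
        }
    }
}

// PRK-style initialization consistent with the reference checksum above:
// A(i,j) = B(i,j) = j, C starts at zero.
let order = 4
var A = [Double](repeating: 0.0, count: order * order)
var B = [Double](repeating: 0.0, count: order * order)
var C = [Double](repeating: 0.0, count: order * order)
for i in 0..<order {
    for j in 0..<order {
        A[i * order + j] = Double(j)
        B[i * order + j] = Double(j)
    }
}
matmulAccumulate(A: A, B: B, C: &C, order: order)
// One multiplication gives checksum 0.25 * order^3 * (order - 1)^2 = 144.
print(C.reduce(0, +))
```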
+/// +////////////////////////////////////////////////////////////////////// + +import Foundation + +func main() { + print("Parallel Research Kernels") + print("Swift Dense matrix-matrix multiplication: C += A x B") + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + let arguments = CommandLine.arguments + + guard arguments.count == 3 else { + print("Usage: swift dgemm.swift <# iterations> ") + exit(1) + } + + guard let iterations = Int(arguments[1]), iterations >= 1 else { + print("ERROR: iterations must be >= 1") + exit(1) + } + + guard let order = Int(arguments[2]), order > 0 else { + print("ERROR: matrix order must be positive") + exit(1) + } + + print("Number of iterations = \(iterations)") + print("Matrix order = \(order)") + + ////////////////////////////////////////////////////////////////////// + // Allocate space for matrices + ////////////////////////////////////////////////////////////////////// + + var A = Array(repeating: 0.0, count: order * order) + var B = Array(repeating: 0.0, count: order * order) + var C = Array(repeating: 0.0, count: order * order) + + // Initialize matrices A and B + for i in 0.. <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to Swift with Metal by Cursor AI, 2025. +/// +////////////////////////////////////////////////////////////////////// + +import Foundation +import Metal + +let metalSource = """ +#include +using namespace metal; + +kernel void nstream_triad(device float* A [[buffer(0)]], + const device float* B [[buffer(1)]], + const device float* C [[buffer(2)]], + constant float& scalar [[buffer(3)]], + uint index [[thread_position_in_grid]]) { + A[index] += B[index] + scalar * C[index]; +} +""" + +func main() { + print("Parallel Research Kernels") + print("Swift STREAM triad: A = B + scalar * C (Metal GPU)") + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + let arguments = CommandLine.arguments + + guard arguments.count == 3 else { + print("Usage: swift nstream-metal.swift <# iterations> ") + exit(1) + } + + guard let iterations = Int(arguments[1]), iterations >= 1 else { + print("ERROR: iterations must be >= 1") + exit(1) + } + + guard let length = Int(arguments[2]), length > 0 else { + print("ERROR: vector length must be positive") + exit(1) + } + + print("Number of iterations = \(iterations)") + print("Vector length = \(length)") + + ////////////////////////////////////////////////////////////////////// + // Setup Metal + ////////////////////////////////////////////////////////////////////// + + guard let device = MTLCreateSystemDefaultDevice() else { + print("ERROR: Metal is not supported on this device") + exit(1) + } + + guard let commandQueue = device.makeCommandQueue() else { + print("ERROR: Failed to create command queue") + exit(1) + } + + guard let library = try? device.makeLibrary(source: metalSource, options: nil) else { + print("ERROR: Failed to create Metal library") + exit(1) + } + + guard let function = library.makeFunction(name: "nstream_triad") else { + print("ERROR: Failed to find kernel function") + exit(1) + } + + guard let computePipelineState = try? 
device.makeComputePipelineState(function: function) else { + print("ERROR: Failed to create compute pipeline state") + exit(1) + } + + ////////////////////////////////////////////////////////////////////// + // Allocate and initialize data + ////////////////////////////////////////////////////////////////////// + + // Use Float32 for Metal compatibility + let A = Array(repeating: Float(0.0), count: length) + let B = Array(repeating: Float(2.0), count: length) + let C = Array(repeating: Float(2.0), count: length) + var scalar = Float(3.0) + + // Create Metal buffers + guard let bufferA = device.makeBuffer(bytes: A, length: length * MemoryLayout.size, options: [.storageModeShared]) else { + print("ERROR: Failed to create buffer A") + exit(1) + } + + guard let bufferB = device.makeBuffer(bytes: B, length: length * MemoryLayout.size, options: [.storageModeShared]) else { + print("ERROR: Failed to create buffer B") + exit(1) + } + + guard let bufferC = device.makeBuffer(bytes: C, length: length * MemoryLayout.size, options: [.storageModeShared]) else { + print("ERROR: Failed to create buffer C") + exit(1) + } + + guard let bufferScalar = device.makeBuffer(bytes: &scalar, length: MemoryLayout.size, options: [.storageModeShared]) else { + print("ERROR: Failed to create scalar buffer") + exit(1) + } + + ////////////////////////////////////////////////////////////////////// + // Execute computation + ////////////////////////////////////////////////////////////////////// + + var startTime = 0.0 + + for iter in 0...iterations { + + // Start timer after warmup iteration + if iter == 1 { + startTime = CFAbsoluteTimeGetCurrent() + } + + // Create command buffer + guard let commandBuffer = commandQueue.makeCommandBuffer() else { + print("ERROR: Failed to create command buffer") + exit(1) + } + + guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else { + print("ERROR: Failed to create compute encoder") + exit(1) + } + + // Set compute pipeline and buffers + computeEncoder.setComputePipelineState(computePipelineState) + computeEncoder.setBuffer(bufferA, offset: 0, index: 0) + computeEncoder.setBuffer(bufferB, offset: 0, index: 1) + computeEncoder.setBuffer(bufferC, offset: 0, index: 2) + computeEncoder.setBuffer(bufferScalar, offset: 0, index: 3) + + // Calculate thread group sizes + let threadsPerThreadgroup = MTLSize(width: min(computePipelineState.maxTotalThreadsPerThreadgroup, length), height: 1, depth: 1) + let threadgroupsPerGrid = MTLSize(width: (length + threadsPerThreadgroup.width - 1) / threadsPerThreadgroup.width, height: 1, depth: 1) + + // Dispatch threads + computeEncoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup) + computeEncoder.endEncoding() + + // Execute + commandBuffer.commit() + commandBuffer.waitUntilCompleted() + } + + let nstreamTime = CFAbsoluteTimeGetCurrent() - startTime + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + // Copy results back from GPU + let resultPointer = bufferA.contents().bindMemory(to: Float.self, capacity: length) + let results = Array(UnsafeBufferPointer(start: resultPointer, count: length)) + + // Calculate reference result (using Double for precision) + var ar = 0.0 + let br = 2.0 + let cr = 2.0 + + for _ in 0...iterations { + ar += br + 3.0 * cr + } + + ar *= Double(length) + + // Calculate checksum + let asum = results.reduce(0.0) { Double($0) + 
Double(abs($1)) } + + let epsilon = 1.0e-6 // Relaxed for Float32 + if abs(ar - asum) / asum > epsilon { + print("Failed Validation on output array") + print(" Expected checksum: \(ar)") + print(" Observed checksum: \(asum)") + print("ERROR: solution did not validate") + exit(1) + } else { + print("Solution validates") + let avgtime = nstreamTime / Double(iterations) + let nbytes = 4.0 * Double(length) * 4.0 // 4 bytes per float + let rate = 1.0e-6 * nbytes / avgtime + print(String(format: "Rate (MB/s): %.6f Avg time (s): %.6f", rate, avgtime)) + } +} + +main() diff --git a/SWIFT/nstream.swift b/SWIFT/nstream.swift new file mode 100644 index 000000000..eb0616d4f --- /dev/null +++ b/SWIFT/nstream.swift @@ -0,0 +1,154 @@ +/// +/// Copyright (c) 2025, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors and +/// the length of the vectors. +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to Swift by Cursor AI, 2025. 
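As a compact illustration of the measurement pattern described above (one untimed warm-up pass, then timed triad updates, with bandwidth computed from four words of sizeof(double) bytes per element), here is a hedged, self-contained sketch; the function name and the parameter values in the driver are illustrative only, not taken from the benchmark source.

```swift
import Foundation

/// Hedged sketch of the timed nstream pattern: one untimed warm-up
/// iteration, then `iterations` timed repetitions of A += B + scalar * C.
func triadSketch(iterations: Int, length: Int) -> Double {
    var A = [Double](repeating: 0.0, count: length)
    let B = [Double](repeating: 2.0, count: length)
    let C = [Double](repeating: 2.0, count: length)
    let scalar = 3.0

    var start = 0.0
    for iter in 0...iterations {
        if iter == 1 { start = CFAbsoluteTimeGetCurrent() }  // skip warm-up
        for i in 0..<length {
            A[i] += B[i] + scalar * C[i]
        }
    }
    let elapsed = CFAbsoluteTimeGetCurrent() - start

    // Bandwidth: 3 reads + 1 write of 8-byte doubles per element.
    let bytes = 4.0 * Double(length) * 8.0
    return 1.0e-6 * bytes / (elapsed / Double(iterations))  // MB/s
}

print(String(format: "Rate (MB/s): %.3f", triadSketch(iterations: 10, length: 1_000_000)))
```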
+/// +////////////////////////////////////////////////////////////////////// + +import Foundation + +func main() { + print("Parallel Research Kernels") + print("Swift STREAM triad: A = B + scalar * C") + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + let arguments = CommandLine.arguments + + guard arguments.count == 3 else { + print("Usage: swift nstream.swift <# iterations> ") + exit(1) + } + + guard let iterations = Int(arguments[1]), iterations >= 1 else { + print("ERROR: iterations must be >= 1") + exit(1) + } + + guard let length = Int(arguments[2]), length > 0 else { + print("ERROR: vector length must be positive") + exit(1) + } + + print("Number of iterations = \(iterations)") + print("Vector length = \(length)") + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + var A = Array(repeating: 0.0, count: length) + let B = Array(repeating: 2.0, count: length) + let C = Array(repeating: 2.0, count: length) + + let scalar = 3.0 + var startTime = 0.0 + + for iter in 0...iterations { + + // Start timer after warmup iteration + if iter == 1 { + startTime = CFAbsoluteTimeGetCurrent() + } + + // Perform STREAM triad: A = B + scalar * C + for i in 0.. epsilon { + print("Failed Validation on output array") + print(" Expected checksum: \(ar)") + print(" Observed checksum: \(asum)") + print("ERROR: solution did not validate") + exit(1) + } else { + print("Solution validates") + let avgtime = nstreamTime / Double(iterations) + let nbytes = 4.0 * Double(length) * 8.0 // 8 bytes per double + let rate = 1.0e-6 * nbytes / avgtime + print(String(format: "Rate (MB/s): %.6f Avg time (s): %.6f", rate, avgtime)) + } +} + +main() diff --git a/SWIFT/p2p-metal.swift b/SWIFT/p2p-metal.swift new file mode 100644 index 000000000..edb69b794 --- /dev/null +++ b/SWIFT/p2p-metal.swift @@ -0,0 +1,265 @@ +/// +/// Copyright (c) 2025, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: p2p-metal +/// +/// PURPOSE: This program tests the efficiency with which point-to-point +/// synchronization can be carried out. It does so by executing +/// a pipelined algorithm on an m*n grid using Metal GPU compute. +/// +/// USAGE: The program takes as input the +/// dimensions of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to Swift with Metal by Cursor AI, 2025. +/// +////////////////////////////////////////////////////////////////////// + +import Foundation +import Metal + +let metalSource = """ +#include +using namespace metal; + +kernel void p2p_kernel(device float* grid [[buffer(0)]], + constant uint& n [[buffer(1)]], + constant uint& diagonal [[buffer(2)]], + uint thread_id [[thread_position_in_grid]]) { + uint j = thread_id + 1; // thread_id maps to j coordinate (1-based) + uint i = diagonal; + + // Check bounds for diagonal sweep (following OpenCL logic exactly) + if (j >= max(2u, i - n + 2) && j <= min(i, n)) { + uint x = i - j + 2 - 1; // Convert to 0-based x coordinate + uint y = j - 1; // Convert to 0-based y coordinate + + // Additional bounds check + if (x >= 1 && x < n && y >= 1 && y < n) { + grid[x * n + y] = grid[(x-1) * n + y] + + grid[x * n + (y-1)] + - grid[(x-1) * n + (y-1)]; + } + } +} +""" + +func main() { + print("Parallel Research Kernels") + print("Swift pipeline execution on 2D grid (Metal GPU)") + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + let arguments = CommandLine.arguments + + guard arguments.count == 4 else { + print("Usage: swift p2p-metal.swift <# iterations> ") + exit(1) + } + + guard let iterations = Int(arguments[1]), iterations >= 1 else { + print("ERROR: iterations must be >= 1") + exit(1) + } + + guard let m = Int(arguments[2]), m >= 1 else { + print("ERROR: array dimension must be >= 1") + exit(1) + } + + guard let n = Int(arguments[3]), n >= 1 else { + print("ERROR: array dimension must be >= 1") + exit(1) + } + + print("Grid sizes = \(m) * \(n)") + print("Number of iterations = \(iterations)") + + // For the OpenCL algorithm, we need to use the larger dimension + let gridSize = max(m, n) + + ////////////////////////////////////////////////////////////////////// + // Setup Metal + ////////////////////////////////////////////////////////////////////// + + guard let device = MTLCreateSystemDefaultDevice() else { + print("ERROR: Metal is not supported on this device") + exit(1) + } + + guard let commandQueue = device.makeCommandQueue() else { + print("ERROR: Failed to create command queue") + exit(1) + } + + guard let library = try? 
device.makeLibrary(source: metalSource, options: nil) else { + print("ERROR: Failed to create Metal library") + exit(1) + } + + guard let function = library.makeFunction(name: "p2p_kernel") else { + print("ERROR: Failed to find kernel function") + exit(1) + } + + guard let computePipelineState = try? device.makeComputePipelineState(function: function) else { + print("ERROR: Failed to create compute pipeline state") + exit(1) + } + + ////////////////////////////////////////////////////////////////////// + // Allocate space and initialize grid + ////////////////////////////////////////////////////////////////////// + + var grid = Array(repeating: Float(0.0), count: m * n) + + // Initialize grid boundaries + for j in 0...size, options: [.storageModeShared]) else { + print("ERROR: Failed to create grid buffer") + exit(1) + } + + guard let bufferN = device.makeBuffer(bytes: &nConstant, length: MemoryLayout.size, options: [.storageModeShared]) else { + print("ERROR: Failed to create n buffer") + exit(1) + } + + ////////////////////////////////////////////////////////////////////// + // Execute computation + ////////////////////////////////////////////////////////////////////// + + var startTime = 0.0 + + for k in 0...iterations { + + // Start timer after warmup iteration + if k == 1 { + startTime = CFAbsoluteTimeGetCurrent() + } + + // Execute pipeline algorithm using diagonal sweep pattern (like OpenCL version) + for i in 2...(2*gridSize-2) { + var diagonalConstant = UInt32(i) + + guard let bufferDiagonal = device.makeBuffer(bytes: &diagonalConstant, length: MemoryLayout.size, options: [.storageModeShared]) else { + print("ERROR: Failed to create diagonal buffer") + exit(1) + } + + // Create command buffer for this diagonal + guard let commandBuffer = commandQueue.makeCommandBuffer() else { + print("ERROR: Failed to create command buffer") + exit(1) + } + + guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else { + print("ERROR: Failed to create compute encoder") + exit(1) + } + + // Set compute pipeline and buffers + computeEncoder.setComputePipelineState(computePipelineState) + computeEncoder.setBuffer(bufferGrid, offset: 0, index: 0) + computeEncoder.setBuffer(bufferN, offset: 0, index: 1) + computeEncoder.setBuffer(bufferDiagonal, offset: 0, index: 2) + + // Calculate thread group sizes for 1D dispatch + let threadsPerThreadgroup = MTLSize(width: min(computePipelineState.maxTotalThreadsPerThreadgroup, gridSize), height: 1, depth: 1) + let threadgroupsPerGrid = MTLSize(width: (gridSize + threadsPerThreadgroup.width - 1) / threadsPerThreadgroup.width, height: 1, depth: 1) + + // Dispatch threads + computeEncoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup) + computeEncoder.endEncoding() + + // Execute and wait for completion (synchronization barrier) + commandBuffer.commit() + commandBuffer.waitUntilCompleted() + } + + // Copy top right corner value to bottom left corner to create dependency + let resultPointer = bufferGrid.contents().bindMemory(to: Float.self, capacity: m * n) + let topRightValue = -resultPointer[(m-1) * n + (n-1)] + resultPointer[0 * n + 0] = topRightValue + } + + let pipelineTime = CFAbsoluteTimeGetCurrent() - startTime + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + // Copy results back from GPU + let resultPointer = bufferGrid.contents().bindMemory(to: Float.self, capacity: m 
* n) + let results = Array(UnsafeBufferPointer(start: resultPointer, count: m * n)) + + let epsilon = 1.0e-6 // Relaxed for Float32 + + // Verify correctness, using top right value + let cornerVal = Double((iterations + 1) * (2 * gridSize - 2)) + let observedCornerVal = Double(results[(m-1) * n + (n-1)]) + + print("DEBUG: gridSize=\(gridSize), m=\(m), n=\(n)") + print("DEBUG: Expected corner value: \(cornerVal)") + print("DEBUG: Observed corner value: \(observedCornerVal)") + print("DEBUG: Grid corner values: [\(results[0]), \(results[n-1]), \(results[(m-1)*n]), \(results[(m-1)*n+(n-1)])]") + + if abs(observedCornerVal - cornerVal) / cornerVal < epsilon { + print("Solution validates") + let avgtime = pipelineTime / Double(iterations) + let rate = 1.0e-6 * 2.0 * Double(m-1) * Double(n-1) / avgtime + print(String(format: "Rate (MFlops/s): %.6f; Avg time (s): %.6f", rate, avgtime)) + } else { + print("ERROR: checksum \(observedCornerVal) does not match verification value \(cornerVal)") + print("ERROR: solution did not validate") + exit(1) + } +} + +main() diff --git a/SWIFT/p2p.swift b/SWIFT/p2p.swift new file mode 100644 index 000000000..c60116df0 --- /dev/null +++ b/SWIFT/p2p.swift @@ -0,0 +1,145 @@ +/// +/// Copyright (c) 2025, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Pipeline +/// +/// PURPOSE: This program tests the efficiency with which point-to-point +/// synchronization can be carried out. It does so by executing +/// a pipelined algorithm on an m*n grid. The first array dimension +/// is distributed among the threads (stripwise decomposition). +/// +/// USAGE: The program takes as input the +/// dimensions of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to Swift by Cursor AI, 2025. 
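A hedged sketch of the serial sweep this kernel times is shown below: each interior point is updated from its west, north, and north-west neighbours, and after every full sweep the negated top-right value is copied back to the origin so successive iterations stay coupled. The function name and grid sizes are illustrative, and the expected corner value (iterations + 1) * (m + n - 2) follows the usual PRK verification formula.

```swift
/// Hedged sketch of one pipeline sweep over an m x n grid.
func pipelineSweep(_ grid: inout [[Double]], m: Int, n: Int) {
    for i in 1..<m {
        for j in 1..<n {
            grid[i][j] = grid[i - 1][j] + grid[i][j - 1] - grid[i - 1][j - 1]
        }
    }
    // Feed the (negated) top-right result back to the origin.
    grid[0][0] = -grid[m - 1][n - 1]
}

// Boundary initialization as in the benchmark: first row/column hold indices.
let (m, n) = (100, 100)
var grid = Array(repeating: Array(repeating: 0.0, count: n), count: m)
for j in 0..<n { grid[0][j] = Double(j) }
for i in 0..<m { grid[i][0] = Double(i) }

let iterations = 10
for _ in 0...iterations { pipelineSweep(&grid, m: m, n: n) }

// The corner value should equal (iterations + 1) * (m + n - 2).
print(grid[m - 1][n - 1], Double((iterations + 1) * (m + n - 2)))
```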
+/// +////////////////////////////////////////////////////////////////////// + +import Foundation + +func main() { + print("Parallel Research Kernels") + print("Swift pipeline execution on 2D grid") + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + let arguments = CommandLine.arguments + + guard arguments.count == 4 else { + print("Usage: swift p2p.swift <# iterations> ") + exit(1) + } + + guard let iterations = Int(arguments[1]), iterations >= 1 else { + print("ERROR: iterations must be >= 1") + exit(1) + } + + guard let m = Int(arguments[2]), m >= 1 else { + print("ERROR: array dimension must be >= 1") + exit(1) + } + + guard let n = Int(arguments[3]), n >= 1 else { + print("ERROR: array dimension must be >= 1") + exit(1) + } + + print("Grid sizes = \(m) * \(n)") + print("Number of iterations = \(iterations)") + + ////////////////////////////////////////////////////////////////////// + // Allocate space and initialize grid + ////////////////////////////////////////////////////////////////////// + + var grid = Array(repeating: Array(repeating: 0.0, count: n), count: m) + + // Initialize grid boundaries + for j in 0.. <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to Swift with Metal by Cursor AI, 2025. +/// +////////////////////////////////////////////////////////////////////// + +import Foundation +import Metal + +let metalSource = """ +#include +using namespace metal; + +kernel void transpose_kernel(device float* A [[buffer(0)]], + device float* B [[buffer(1)]], + constant uint& order [[buffer(2)]], + uint2 gid [[thread_position_in_grid]]) { + if (gid.x >= order || gid.y >= order) return; + + uint i = gid.y; + uint j = gid.x; + + B[i * order + j] += A[j * order + i]; + A[j * order + i] += 1.0; +} +""" + +func main() { + print("Parallel Research Kernels") + print("Swift Matrix transpose: B = A^T (Metal GPU)") + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + let arguments = CommandLine.arguments + + guard arguments.count == 3 else { + print("Usage: swift transpose-metal.swift <# iterations> ") + exit(1) + } + + guard let iterations = Int(arguments[1]), iterations >= 1 else { + print("ERROR: iterations must be >= 1") + exit(1) + } + + guard let order = Int(arguments[2]), order > 0 else { + print("ERROR: matrix order must be positive") + exit(1) + } + + print("Number of iterations = \(iterations)") + print("Matrix order = \(order)") + + ////////////////////////////////////////////////////////////////////// + // Setup Metal + ////////////////////////////////////////////////////////////////////// + + guard let device = MTLCreateSystemDefaultDevice() else { + print("ERROR: Metal is not supported on this device") + exit(1) + } + + guard let commandQueue = device.makeCommandQueue() else { + print("ERROR: Failed to create command queue") + exit(1) + } + + guard let library = try? 
device.makeLibrary(source: metalSource, options: nil) else { + print("ERROR: Failed to create Metal library") + exit(1) + } + + guard let function = library.makeFunction(name: "transpose_kernel") else { + print("ERROR: Failed to find kernel function") + exit(1) + } + + guard let computePipelineState = try? device.makeComputePipelineState(function: function) else { + print("ERROR: Failed to create compute pipeline state") + exit(1) + } + + ////////////////////////////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + ////////////////////////////////////////////////////////////////////// + + // Initialize matrices as 1D arrays using Float32 + var A = Array(repeating: Float(0.0), count: order * order) + let B = Array(repeating: Float(0.0), count: order * order) + + // Initialize matrix A with sequence values + for i in 0...size, options: [.storageModeShared]) else { + print("ERROR: Failed to create buffer A") + exit(1) + } + + guard let bufferB = device.makeBuffer(bytes: B, length: order * order * MemoryLayout.size, options: [.storageModeShared]) else { + print("ERROR: Failed to create buffer B") + exit(1) + } + + guard let bufferOrder = device.makeBuffer(bytes: &orderConstant, length: MemoryLayout.size, options: [.storageModeShared]) else { + print("ERROR: Failed to create order buffer") + exit(1) + } + + ////////////////////////////////////////////////////////////////////// + // Execute computation + ////////////////////////////////////////////////////////////////////// + + var startTime = 0.0 + + for iter in 0...iterations { + + // Start timer after warmup iteration + if iter == 1 { + startTime = CFAbsoluteTimeGetCurrent() + } + + // Create command buffer + guard let commandBuffer = commandQueue.makeCommandBuffer() else { + print("ERROR: Failed to create command buffer") + exit(1) + } + + guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else { + print("ERROR: Failed to create compute encoder") + exit(1) + } + + // Set compute pipeline and buffers + computeEncoder.setComputePipelineState(computePipelineState) + computeEncoder.setBuffer(bufferA, offset: 0, index: 0) + computeEncoder.setBuffer(bufferB, offset: 0, index: 1) + computeEncoder.setBuffer(bufferOrder, offset: 0, index: 2) + + // Calculate thread group sizes for 2D dispatch + let threadsPerThreadgroup = MTLSize(width: 16, height: 16, depth: 1) + let threadgroupsPerGrid = MTLSize( + width: (order + threadsPerThreadgroup.width - 1) / threadsPerThreadgroup.width, + height: (order + threadsPerThreadgroup.height - 1) / threadsPerThreadgroup.height, + depth: 1 + ) + + // Dispatch threads + computeEncoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup) + computeEncoder.endEncoding() + + // Execute + commandBuffer.commit() + commandBuffer.waitUntilCompleted() + } + + let transTime = CFAbsoluteTimeGetCurrent() - startTime + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + // Copy results back from GPU + let resultPointerB = bufferB.contents().bindMemory(to: Float.self, capacity: order * order) + let resultsB = Array(UnsafeBufferPointer(start: resultPointerB, count: order * order)) + + // Calculate additive term for validation + let addit = Double(iterations * (iterations + 1)) / 2.0 + var abserr = 0.0 + + for i in 0.. 
<# iterations> +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to Swift by Cursor AI, 2025. +/// +////////////////////////////////////////////////////////////////////// + +import Foundation + +func main() { + print("Parallel Research Kernels") + print("Swift Matrix transpose: B = A^T") + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + let arguments = CommandLine.arguments + + guard arguments.count == 3 else { + print("Usage: swift transpose.swift <# iterations> ") + exit(1) + } + + guard let iterations = Int(arguments[1]), iterations >= 1 else { + print("ERROR: iterations must be >= 1") + exit(1) + } + + guard let order = Int(arguments[2]), order > 0 else { + print("ERROR: matrix order must be positive") + exit(1) + } + + print("Number of iterations = \(iterations)") + print("Matrix order = \(order)") + + ////////////////////////////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + ////////////////////////////////////////////////////////////////////// + + // Initialize matrices as 1D arrays for better performance + var A = Array(repeating: 0.0, count: order * order) + var B = Array(repeating: 0.0, count: order * order) + + // Initialize matrix A with sequence values + for i in 0.. <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to Swift with Accelerate by AI Assistant, December 2024. 
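Accelerate's CBLAS interfaces cover Float and Double but not Float16, so the benchmark falls back to a plain generic loop for half precision. Below is a hedged sketch of such a generic kernel; the `T: FloatingPoint` constraint, the `genericGEMM` name, and the Float usage example are assumptions made for this illustration rather than the file's exact code.

```swift
/// Hedged sketch of a generic fallback GEMM for element types CBLAS does not
/// cover: C = alpha * A * B + beta * C for square, row-major matrices.
func genericGEMM<T: FloatingPoint>(A: [T], B: [T], C: inout [T],
                                   order: Int, alpha: T, beta: T) {
    for i in 0..<order {
        for j in 0..<order {
            var sum = T.zero
            for k in 0..<order {
                sum += A[i * order + k] * B[k * order + j]
            }
            C[i * order + j] = alpha * sum + beta * C[i * order + j]
        }
    }
}

// Demonstrated with Float for portability; the benchmark applies the same
// idea to Float16, which is only available on newer macOS / Apple silicon.
var C: [Float] = [0, 0, 0, 0]
genericGEMM(A: [1, 0, 0, 1] as [Float], B: [1, 2, 3, 4], C: &C,
            order: 2, alpha: 1, beta: 0)
print(C)  // [1.0, 2.0, 3.0, 4.0]
```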
+/// +////////////////////////////////////////////////////////////////////// + +import Foundation +import Accelerate + +// Matrix multiplication using Accelerate for Float32 +func accelerateMatrixMultiplyFloat( + A: [Float], B: [Float], C: inout [Float], + order: Int, alpha: Float, beta: Float +) { + cblas_sgemm( + CblasRowMajor, // Layout + CblasNoTrans, // TransA + CblasNoTrans, // TransB + Int32(order), // M + Int32(order), // N + Int32(order), // K + alpha, // alpha + A, Int32(order), // A, lda + B, Int32(order), // B, ldb + beta, // beta + &C, Int32(order) // C, ldc + ) +} + +// Matrix multiplication using Accelerate for Float64 +func accelerateMatrixMultiplyDouble( + A: [Double], B: [Double], C: inout [Double], + order: Int, alpha: Double, beta: Double +) { + cblas_dgemm( + CblasRowMajor, // Layout + CblasNoTrans, // TransA + CblasNoTrans, // TransB + Int32(order), // M + Int32(order), // N + Int32(order), // K + alpha, // alpha + A, Int32(order), // A, lda + B, Int32(order), // B, ldb + beta, // beta + &C, Int32(order) // C, ldc + ) +} + +// Fallback naive implementation for Float16 (Accelerate doesn't support it directly) +func naiveMatrixMultiply( + A: [T], B: [T], C: inout [T], + order: Int, alpha: T, beta: T +) { + for i in 0..(repeating: Float16(0), count: nelems) + var B = Array(repeating: Float16(0), count: nelems) + var C = Array(repeating: Float16(0), count: nelems) + + for i in 0..(repeating: Float(0), count: nelems) + var B = Array(repeating: Float(0), count: nelems) + var C = Array(repeating: Float(0), count: nelems) + + for i in 0..(repeating: Double(0), count: nelems) + var B = Array(repeating: Double(0), count: nelems) + var C = Array(repeating: Double(0), count: nelems) + + for i in 0..= 3 else { + print("Usage: \(arguments[0]) <# iterations> ") + exit(1) + } + + guard let iterations = Int(arguments[1]), iterations >= 1 else { + print("ERROR: iterations must be >= 1") + exit(1) + } + + guard let order = Int(arguments[2]), order > 0 else { + print("ERROR: Matrix Order must be greater than 0") + exit(1) + } + + guard order <= 2000 else { + print("ERROR: matrix dimension too large - overflow risk") + exit(1) + } + + print("Number of iterations = \(iterations)") + print("Matrix order = \(order)") + + // Test all supported precision types + if #available(macOS 11.0, *) { + runBenchmarkFloat16(iterations: iterations, order: order) + } else { + print("Float16 not available on this macOS version") + } + + runBenchmarkFloat32(iterations: iterations, order: order) + runBenchmarkFloat64(iterations: iterations, order: order) +} + +main() diff --git a/SWIFT/xgemm-metal.swift b/SWIFT/xgemm-metal.swift new file mode 100644 index 000000000..483b2f2d3 --- /dev/null +++ b/SWIFT/xgemm-metal.swift @@ -0,0 +1,413 @@ +/// +/// Copyright (c) 2020, Intel Corporation +/// Copyright (c) 2023, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. 
+/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: xgemm-metal +/// +/// PURPOSE: This program tests the efficiency with which a dense matrix +/// dense multiplication is carried out using Apple's Metal +/// framework with multiple precision types +/// +/// USAGE: The program takes as input the matrix order, +/// the number of times the matrix-matrix multiplication +/// is carried out +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to Swift with Metal by AI Assistant, December 2024. +/// +////////////////////////////////////////////////////////////////////// + +import Foundation +import Metal + +let metalSource16 = """ +#include +using namespace metal; + +kernel void gemm16_kernel(const device half* A [[buffer(0)]], + const device half* B [[buffer(1)]], + device half* C [[buffer(2)]], + constant uint& N [[buffer(3)]], + constant half& alpha [[buffer(4)]], + constant half& beta [[buffer(5)]], + uint2 gid [[thread_position_in_grid]]) { + uint row = gid.y; + uint col = gid.x; + + if (row >= N || col >= N) return; + + half sum = 0.0h; + for (uint k = 0; k < N; k++) { + sum += A[row * N + k] * B[k * N + col]; + } + C[row * N + col] = alpha * sum + beta * C[row * N + col]; +} +""" + +let metalSource32 = """ +#include +using namespace metal; + +kernel void gemm32_kernel(const device float* A [[buffer(0)]], + const device float* B [[buffer(1)]], + device float* C [[buffer(2)]], + constant uint& N [[buffer(3)]], + constant float& alpha [[buffer(4)]], + constant float& beta [[buffer(5)]], + uint2 gid [[thread_position_in_grid]]) { + uint row = gid.y; + uint col = gid.x; + + if (row >= N || col >= N) return; + + float sum = 0.0f; + for (uint k = 0; k < N; k++) { + sum += A[row * N + k] * B[k * N + col]; + } + C[row * N + col] = alpha * sum + beta * C[row * N + col]; +} +""" + +// Function to run benchmark for Float16 with Metal +@available(macOS 11.0, *) +func runBenchmarkFloat16Metal(iterations: Int, order: Int) { + print("Testing precision: FP16 (Metal GPU)") + + guard let device = MTLCreateSystemDefaultDevice() else { + print("ERROR: Metal is not supported on this device") + return + } + + guard let commandQueue = device.makeCommandQueue() else { + print("ERROR: Failed to create command queue") + return + } + + guard let library = try? 
device.makeLibrary(source: metalSource16, options: nil) else { + print("ERROR: Failed to create Metal library") + return + } + + guard let function = library.makeFunction(name: "gemm16_kernel") else { + print("ERROR: Failed to find kernel function") + return + } + + guard let computePipelineState = try? device.makeComputePipelineState(function: function) else { + print("ERROR: Failed to create compute pipeline state") + return + } + + var dgemmTime: Double = 0 + let nelems = order * order + + // Initialize matrices on CPU + var A = Array(repeating: Float16(0), count: nelems) + var B = Array(repeating: Float16(0), count: nelems) + var C = Array(repeating: Float16(0), count: nelems) + + for i in 0...size, options: [.storageModeShared]) else { + print("ERROR: Failed to create A buffer") + return + } + guard let bufferB = device.makeBuffer(bytes: B, length: nelems * MemoryLayout.size, options: [.storageModeShared]) else { + print("ERROR: Failed to create B buffer") + return + } + let bufferC = device.makeBuffer(bytes: C, length: nelems * MemoryLayout.size, options: [.storageModeShared])! + + var orderConstant = UInt32(order) + var alphaConstant = Float16(1.0) + var betaConstant = Float16(1.0) + + let bufferOrder = device.makeBuffer(bytes: &orderConstant, length: MemoryLayout.size, options: [.storageModeShared])! + let bufferAlpha = device.makeBuffer(bytes: &alphaConstant, length: MemoryLayout.size, options: [.storageModeShared])! + let bufferBeta = device.makeBuffer(bytes: &betaConstant, length: MemoryLayout.size, options: [.storageModeShared])! + + var startTime: Double = 0 + + // Benchmark loop + for k in 0...iterations { + if k == 1 { + startTime = CFAbsoluteTimeGetCurrent() + } + + guard let commandBuffer = commandQueue.makeCommandBuffer() else { + print("ERROR: Failed to create command buffer") + return + } + + guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else { + print("ERROR: Failed to create compute encoder") + return + } + + computeEncoder.setComputePipelineState(computePipelineState) + computeEncoder.setBuffer(bufferA, offset: 0, index: 0) + computeEncoder.setBuffer(bufferB, offset: 0, index: 1) + computeEncoder.setBuffer(bufferC, offset: 0, index: 2) + computeEncoder.setBuffer(bufferOrder, offset: 0, index: 3) + computeEncoder.setBuffer(bufferAlpha, offset: 0, index: 4) + computeEncoder.setBuffer(bufferBeta, offset: 0, index: 5) + + let threadsPerThreadgroup = MTLSize(width: 16, height: 16, depth: 1) + let threadgroupsPerGrid = MTLSize( + width: (order + threadsPerThreadgroup.width - 1) / threadsPerThreadgroup.width, + height: (order + threadsPerThreadgroup.height - 1) / threadsPerThreadgroup.height, + depth: 1 + ) + + computeEncoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup) + computeEncoder.endEncoding() + + commandBuffer.commit() + commandBuffer.waitUntilCompleted() + } + + dgemmTime = CFAbsoluteTimeGetCurrent() - startTime + let dgemmAve = dgemmTime / Double(iterations) + + ////////////////////////////////////////////////////////////////////// + // Analyze and output results + ////////////////////////////////////////////////////////////////////// + + let resultPointer = bufferC.contents().bindMemory(to: Float16.self, capacity: nelems) + let results = Array(UnsafeBufferPointer(start: resultPointer, count: nelems)) + + let checksum = results.reduce(Float16(0), +) + let forder = Double(order) + let refChecksum = Float16(0.25 * forder * forder * forder * (forder - 1.0) * (forder - 1.0) * Double(iterations + 1)) + + 
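    // The reference value above is the closed-form checksum for this GEMM
    // configuration: 0.25 * n^3 * (n - 1)^2 * (iterations + 1). A small
    // illustrative helper (the name is ours, not part of the benchmark) that
    // evaluates the same relative residuum entirely in Double, so the
    // comparison itself is not affected by Float16 rounding.
    func gemmResiduum(checksum: Double, order: Int, iterations: Int) -> Double {
        let n = Double(order)
        let reference = 0.25 * n * n * n * (n - 1.0) * (n - 1.0) * Double(iterations + 1)
        return abs(checksum - reference) / reference
    }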
let epsilon: Double = 1.0e-3 // Relaxed for 16-bit + let residuum = abs(Double(checksum) - Double(refChecksum)) / Double(refChecksum) + + if residuum < epsilon { + print("Solution validates") + let nflops = 2.0 * forder * forder * forder + let rate = 1.0e-6 * nflops / dgemmAve + print(String(format: "FP16 Rate (MF/s): %.6f; Avg time (s): %.6f", rate, dgemmAve)) + } else { + print(String(format: "ERROR: Checksum = %.6f; Reference = %.6f; Residuum = %.6e", + Double(checksum), Double(refChecksum), residuum)) + } +} + +// Function to run benchmark for Float32 with Metal +func runBenchmarkFloat32Metal(iterations: Int, order: Int) { + print("Testing precision: FP32 (Metal GPU)") + + guard let device = MTLCreateSystemDefaultDevice() else { + print("ERROR: Metal is not supported on this device") + return + } + + guard let commandQueue = device.makeCommandQueue() else { + print("ERROR: Failed to create command queue") + return + } + + guard let library = try? device.makeLibrary(source: metalSource32, options: nil) else { + print("ERROR: Failed to create Metal library") + return + } + + guard let function = library.makeFunction(name: "gemm32_kernel") else { + print("ERROR: Failed to find kernel function") + return + } + + guard let computePipelineState = try? device.makeComputePipelineState(function: function) else { + print("ERROR: Failed to create compute pipeline state") + return + } + + var dgemmTime: Double = 0 + let nelems = order * order + + // Initialize matrices on CPU + var A = Array(repeating: Float32(0), count: nelems) + var B = Array(repeating: Float32(0), count: nelems) + var C = Array(repeating: Float32(0), count: nelems) + + for i in 0...size, options: [.storageModeShared]) else { + print("ERROR: Failed to create A buffer") + return + } + guard let bufferB = device.makeBuffer(bytes: B, length: nelems * MemoryLayout.size, options: [.storageModeShared]) else { + print("ERROR: Failed to create B buffer") + return + } + let bufferC = device.makeBuffer(bytes: C, length: nelems * MemoryLayout.size, options: [.storageModeShared])! + + var orderConstant = UInt32(order) + var alphaConstant = Float32(1.0) + var betaConstant = Float32(1.0) + + let bufferOrder = device.makeBuffer(bytes: &orderConstant, length: MemoryLayout.size, options: [.storageModeShared])! + let bufferAlpha = device.makeBuffer(bytes: &alphaConstant, length: MemoryLayout.size, options: [.storageModeShared])! + let bufferBeta = device.makeBuffer(bytes: &betaConstant, length: MemoryLayout.size, options: [.storageModeShared])! 
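    // Timing convention used throughout these benchmarks (and in the loop
    // below): iterations + 1 passes are issued and the clock starts at pass 1,
    // so pass 0 is an untimed warmup that absorbs one-time costs such as
    // pipeline caching and first use of the shared buffers. A condensed,
    // illustrative sketch of that pattern; `body` stands in for the
    // encode/commit/wait sequence and is not part of this file.
    func averageTime(iterations: Int, body: () -> Void) -> Double {
        var start = CFAbsoluteTimeGetCurrent()
        for k in 0...iterations {
            if k == 1 { start = CFAbsoluteTimeGetCurrent() }
            body()
        }
        return (CFAbsoluteTimeGetCurrent() - start) / Double(iterations)
    }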
+ + var startTime: Double = 0 + + // Benchmark loop + for k in 0...iterations { + if k == 1 { + startTime = CFAbsoluteTimeGetCurrent() + } + + guard let commandBuffer = commandQueue.makeCommandBuffer() else { + print("ERROR: Failed to create command buffer") + return + } + + guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else { + print("ERROR: Failed to create compute encoder") + return + } + + computeEncoder.setComputePipelineState(computePipelineState) + computeEncoder.setBuffer(bufferA, offset: 0, index: 0) + computeEncoder.setBuffer(bufferB, offset: 0, index: 1) + computeEncoder.setBuffer(bufferC, offset: 0, index: 2) + computeEncoder.setBuffer(bufferOrder, offset: 0, index: 3) + computeEncoder.setBuffer(bufferAlpha, offset: 0, index: 4) + computeEncoder.setBuffer(bufferBeta, offset: 0, index: 5) + + let threadsPerThreadgroup = MTLSize(width: 16, height: 16, depth: 1) + let threadgroupsPerGrid = MTLSize( + width: (order + threadsPerThreadgroup.width - 1) / threadsPerThreadgroup.width, + height: (order + threadsPerThreadgroup.height - 1) / threadsPerThreadgroup.height, + depth: 1 + ) + + computeEncoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup) + computeEncoder.endEncoding() + + commandBuffer.commit() + commandBuffer.waitUntilCompleted() + } + + dgemmTime = CFAbsoluteTimeGetCurrent() - startTime + let dgemmAve = dgemmTime / Double(iterations) + + ////////////////////////////////////////////////////////////////////// + // Analyze and output results + ////////////////////////////////////////////////////////////////////// + + let resultPointer = bufferC.contents().bindMemory(to: Float32.self, capacity: nelems) + let results = Array(UnsafeBufferPointer(start: resultPointer, count: nelems)) + + let checksum = results.reduce(Float32(0), +) + let forder = Double(order) + let refChecksum = Float32(0.25 * forder * forder * forder * (forder - 1.0) * (forder - 1.0) * Double(iterations + 1)) + + let epsilon: Double = 1.0e-6 // Standard for 32-bit + let residuum = abs(Double(checksum) - Double(refChecksum)) / Double(refChecksum) + + if residuum < epsilon { + print("Solution validates") + let nflops = 2.0 * forder * forder * forder + let rate = 1.0e-6 * nflops / dgemmAve + print(String(format: "FP32 Rate (MF/s): %.6f; Avg time (s): %.6f", rate, dgemmAve)) + } else { + print(String(format: "ERROR: Checksum = %.6f; Reference = %.6f; Residuum = %.6e", + Double(checksum), Double(refChecksum), residuum)) + } +} + +func main() { + print("Parallel Research Kernels") + print("Swift Dense matrix-matrix multiplication: C += A x B (Multi-precision Metal GPU)") + + let arguments = CommandLine.arguments + guard arguments.count >= 3 else { + print("Usage: \(arguments[0]) <# iterations> ") + exit(1) + } + + guard let iterations = Int(arguments[1]), iterations >= 1 else { + print("ERROR: iterations must be >= 1") + exit(1) + } + + guard let order = Int(arguments[2]), order > 0 else { + print("ERROR: Matrix Order must be greater than 0") + exit(1) + } + + guard order <= 2000 else { + print("ERROR: matrix dimension too large - overflow risk") + exit(1) + } + + print("Number of iterations = \(iterations)") + print("Matrix order = \(order)") + + // Test supported precision types + if #available(macOS 11.0, *) { + runBenchmarkFloat16Metal(iterations: iterations, order: order) + } else { + print("Float16 not available on this macOS version") + } + + runBenchmarkFloat32Metal(iterations: iterations, order: order) + + // Note: Metal doesn't have native FP64 
support on most hardware + print("Note: FP64 not supported in Metal on most Apple GPUs") +} + +main() + diff --git a/SWIFT/xgemm.swift b/SWIFT/xgemm.swift new file mode 100644 index 000000000..9f1dc8c65 --- /dev/null +++ b/SWIFT/xgemm.swift @@ -0,0 +1,202 @@ +/// +/// Copyright (c) 2020, Intel Corporation +/// Copyright (c) 2023, NVIDIA +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: xgemm +/// +/// PURPOSE: This program tests the efficiency with which a dense matrix +/// dense multiplication is carried out using multiple precision types +/// +/// USAGE: The program takes as input the matrix order, +/// the number of times the matrix-matrix multiplication +/// is carried out +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to Swift by AI Assistant, December 2024. 
+/// +////////////////////////////////////////////////////////////////////// + +import Foundation + +// Generic matrix multiplication function +func matrixMultiply( + A: [T], B: [T], C: inout [T], + order: Int, alpha: T, beta: T +) { + for i in 0..( + type: T.Type, iterations: Int, order: Int +) { + print("Testing precision: \(getPrecisionName(type))") + + var dgemmTime: Double = 0 + let nelems = order * order + + // Initialize matrices + var A = Array(repeating: T(0), count: nelems) + var B = Array(repeating: T(0), count: nelems) + var C = Array(repeating: T(0), count: nelems) + + for i in 0..(_ type: T.Type) -> String { + switch type { + case is Float16.Type: + return "FP16" + case is Float.Type: + return "FP32" + case is Double.Type: + return "FP64" + default: + return "Unknown" + } +} + +// Helper function to get appropriate epsilon based on precision +func getSizeBasedEpsilon(_ type: T.Type) -> Double { + switch type { + case is Float16.Type: + return 1.0e-3 // Relaxed for 16-bit + case is Float.Type: + return 1.0e-6 // Standard for 32-bit + case is Double.Type: + return 1.0e-8 // Strict for 64-bit + default: + return 1.0e-6 + } +} + +func main() { + print("Parallel Research Kernels") + print("Swift Dense matrix-matrix multiplication: C += A x B (Multi-precision)") + + let arguments = CommandLine.arguments + guard arguments.count >= 3 else { + print("Usage: \(arguments[0]) <# iterations> ") + exit(1) + } + + guard let iterations = Int(arguments[1]), iterations >= 1 else { + print("ERROR: iterations must be >= 1") + exit(1) + } + + guard let order = Int(arguments[2]), order > 0 else { + print("ERROR: Matrix Order must be greater than 0") + exit(1) + } + + guard order <= 2000 else { + print("ERROR: matrix dimension too large - overflow risk") + exit(1) + } + + print("Number of iterations = \(iterations)") + print("Matrix order = \(order)") + + // Test all supported precision types + // Skip Float16 for now due to segfault issues + print("Float16 temporarily disabled due to implementation issues") + + runBenchmark(type: Float.self, iterations: iterations, order: order) + runBenchmark(type: Double.self, iterations: iterations, order: order) +} + +main()
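// Illustrative use of the generic helpers above: pair the precision-dependent
// tolerance with the precision name when reporting a result. This function is
// a sketch (assuming the same floating-point generic constraint used by
// runBenchmark), not part of the benchmark's output path.
func reportValidation<T: BinaryFloatingPoint>(_ type: T.Type, checksum: Double, reference: Double) {
    let epsilon = getSizeBasedEpsilon(type)
    let residuum = abs(checksum - reference) / reference
    if residuum < epsilon {
        print("\(getPrecisionName(type)): Solution validates (residuum \(residuum))")
    } else {
        print("\(getPrecisionName(type)): ERROR residuum \(residuum) exceeds tolerance \(epsilon)")
    }
}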