Review notes (text before the first "diff --git" line is ignored by git apply / patch):
- Fixed: the center was written with _mm_store_ps into a plain `float centerArr[4]`, which is
  only guaranteed 4-byte alignment; the aligned store may fault. Now uses _mm_storeu_ps.
- Fixed comments: mask 0x77 of _mm_dp_ps writes the sum to lanes 0-2 (not all four); the
  4-float load reads one float past x/y/z and that lane is never used in the result.
- `~3ull` replaced by a size_t-typed mask so the expression has the same width as vertexCount.
- NOTE(review): text inside angle brackets (header names, template arguments) is missing from
  this copy of the patch and was left as found; restore it from the upstream commit before applying.
- NOTE(review): _mm_dp_ps is an SSE4.1 intrinsic, FLT_MAX comes from <cfloat>, std::sqrt from
  <cmath>; confirm the include hunk and compiler flags provide them.
- The blob id on the "index" line predates these review changes.

diff --git a/Sources/Overload/OvRendering/src/OvRendering/Resources/Mesh.cpp b/Sources/Overload/OvRendering/src/OvRendering/Resources/Mesh.cpp
index 466adf216..e4ac7ae01 100644
--- a/Sources/Overload/OvRendering/src/OvRendering/Resources/Mesh.cpp
+++ b/Sources/Overload/OvRendering/src/OvRendering/Resources/Mesh.cpp
@@ -5,6 +5,7 @@
 */
 
 #include
+#include
 
 #include
 #include
@@ -83,36 +84,102 @@ void OvRendering::Resources::Mesh::Upload(std::span p_ve
 
 void OvRendering::Resources::Mesh::ComputeBoundingSphere(std::span p_vertices)
 {
-	m_boundingSphere.position = OvMaths::FVector3::Zero;
-	m_boundingSphere.radius = 0.0f;
+	const size_t vertexCount = p_vertices.size();
 
-	if (!p_vertices.empty())
+	if (vertexCount == 0)
 	{
-		float minX = std::numeric_limits::max();
-		float minY = std::numeric_limits::max();
-		float minZ = std::numeric_limits::max();
+		m_boundingSphere = {
+			.position = OvMaths::FVector3::Zero,
+			.radius = 0.0f
+		};
 
-		float maxX = std::numeric_limits::min();
-		float maxY = std::numeric_limits::min();
-		float maxZ = std::numeric_limits::min();
+		return;
+	}
 
-		for (const auto& vertex : p_vertices)
-		{
-			minX = std::min(minX, vertex.position[0]);
-			minY = std::min(minY, vertex.position[1]);
-			minZ = std::min(minZ, vertex.position[2]);
+	// Seed the min/max registers with the first vertex (x, y, z); the w lane is a don't-care
+	__m128 vMinXYZ = _mm_setr_ps(p_vertices[0].position[0], p_vertices[0].position[1], p_vertices[0].position[2], FLT_MAX);
+	__m128 vMaxXYZ = _mm_setr_ps(p_vertices[0].position[0], p_vertices[0].position[1], p_vertices[0].position[2], -FLT_MAX);
 
-			maxX = std::max(maxX, vertex.position[0]);
-			maxY = std::max(maxY, vertex.position[1]);
-			maxZ = std::max(maxZ, vertex.position[2]);
-		}
+	// Single pass over the remaining vertices to find the per-axis min/max
+	for (size_t i = 1; i < vertexCount; ++i)
+	{
+		// Loads 4 floats: x, y, z plus one trailing float (unused w lane) - assumes it is readable inside the vertex, TODO confirm
+		const float* posPtr = p_vertices[i].position;
+		__m128 vPos = _mm_loadu_ps(posPtr); // Unaligned load: no 16-byte alignment is assumed for the vertex data
 
-		m_boundingSphere.position = OvMaths::FVector3{ minX + maxX, minY + maxY, minZ + maxZ } / 2.0f;
+		// Update min and max in one pass
+		vMinXYZ = _mm_min_ps(vMinXYZ, vPos);
+		vMaxXYZ = _mm_max_ps(vMaxXYZ, vPos);
+	}
 
-		for (const auto& vertex : p_vertices)
-		{
-			const auto& position = reinterpret_cast(vertex.position);
-			m_boundingSphere.radius = std::max(m_boundingSphere.radius, OvMaths::FVector3::Distance(m_boundingSphere.position, position));
-		}
+	// Calculate center = (min + max) * 0.5
+	__m128 vCenter = _mm_mul_ps(_mm_add_ps(vMinXYZ, vMaxXYZ), _mm_set1_ps(0.5f));
+
+	// Store center position (unaligned store: centerArr is only guaranteed float alignment)
+	float centerArr[4];
+	_mm_storeu_ps(centerArr, vCenter);
+	auto center = OvMaths::FVector3{ centerArr[0], centerArr[1], centerArr[2] };
+
+	// Calculate radius - track the max squared distance, take a single sqrt at the end
+	__m128 vMaxDistSq = _mm_setzero_ps();
+
+	// Pre-load center vector once outside the loop
+	const __m128 vCenterXYZ = _mm_setr_ps(
+		center.x,
+		center.y,
+		center.z,
+		0.0f
+	);
+
+	// Unroll the loop by 4 for better throughput
+	size_t i = 0;
+	const size_t unrollCount = vertexCount & ~static_cast<size_t>(3); // Round down to multiple of 4
+
+	for (; i < unrollCount; i += 4)
+	{
+		// Load 4 vertices at once
+		const float* pos0 = p_vertices[i].position;
+		const float* pos1 = p_vertices[i + 1].position;
+		const float* pos2 = p_vertices[i + 2].position;
+		const float* pos3 = p_vertices[i + 3].position;
+
+		__m128 vPos0 = _mm_loadu_ps(pos0);
+		__m128 vDiff0 = _mm_sub_ps(vPos0, vCenterXYZ);
+		__m128 vDistSq0 = _mm_dp_ps(vDiff0, vDiff0, 0x77); // Mask 0x77: multiply x, y, z only and write the sum to lanes 0-2 (w lane = 0)
+		vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq0);
+
+		__m128 vPos1 = _mm_loadu_ps(pos1);
+		__m128 vDiff1 = _mm_sub_ps(vPos1, vCenterXYZ);
+		__m128 vDistSq1 = _mm_dp_ps(vDiff1, vDiff1, 0x77);
+		vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq1);
+
+		__m128 vPos2 = _mm_loadu_ps(pos2);
+		__m128 vDiff2 = _mm_sub_ps(vPos2, vCenterXYZ);
+		__m128 vDistSq2 = _mm_dp_ps(vDiff2, vDiff2, 0x77);
+		vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq2);
+
+		__m128 vPos3 = _mm_loadu_ps(pos3);
+		__m128 vDiff3 = _mm_sub_ps(vPos3, vCenterXYZ);
+		__m128 vDistSq3 = _mm_dp_ps(vDiff3, vDiff3, 0x77);
+		vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq3);
 	}
+
+	// Handle remaining vertices
+	for (; i < vertexCount; ++i)
+	{
+		const float* pos = p_vertices[i].position;
+		__m128 vPos = _mm_loadu_ps(pos);
+		__m128 vDiff = _mm_sub_ps(vPos, vCenterXYZ);
+		__m128 vDistSq = _mm_dp_ps(vDiff, vDiff, 0x77);
+		vMaxDistSq = _mm_max_ps(vMaxDistSq, vDistSq);
+	}
+
+	// Extract radius (sqrt of max squared distance, read from lane 0)
+	float maxDistSq;
+	_mm_store_ss(&maxDistSq, vMaxDistSq);
+
+	m_boundingSphere = {
+		.position = center,
+		.radius = std::sqrt(maxDistSq)
+	};
 }