@@ -82,159 +82,104 @@ void OvRendering::Resources::Mesh::Upload(std::span<const Geometry::Vertex> p_ve
8282 }
8383}
8484
85- namespace
85+ void OvRendering::Resources::Mesh::ComputeBoundingSphere (std::span< const Geometry::Vertex> p_vertices)
8686{
87- OvRendering::Geometry::BoundingSphere ComputeBoundingSphereSIMD (std::span<const OvRendering::Geometry::Vertex> p_vertices)
88- {
89- const size_t vertexCount = p_vertices.size ();
90-
91- if (vertexCount == 0 )
92- {
93- return {
94- .position = OvMaths::FVector3::Zero,
95- .radius = 0 .0f
96- };
97- }
87+ const size_t vertexCount = p_vertices.size ();
9888
99- // Initialize SIMD registers for min/max with first vertex values
100- __m128 vMinXYZ = _mm_setr_ps (p_vertices[0 ].position [0 ], p_vertices[0 ].position [1 ], p_vertices[0 ].position [2 ], FLT_MAX);
101- __m128 vMaxXYZ = _mm_setr_ps (p_vertices[0 ].position [0 ], p_vertices[0 ].position [1 ], p_vertices[0 ].position [2 ], -FLT_MAX);
89+ if (vertexCount == 0 )
90+ {
91+ m_boundingSphere = {
92+ .position = OvMaths::FVector3::Zero,
93+ .radius = 0 .0f
94+ };
10295
103- // Process all vertices in one loop to find min/max
104- for (size_t i = 1 ; i < vertexCount; ++i)
105- {
106- // Load vertex position directly - assumes position is aligned properly
107- const float * posPtr = p_vertices[i].position ;
108- __m128 vPos = _mm_loadu_ps (posPtr); // Using loadu in case it's not 16-byte aligned
96+ return ;
97+ }
10998
110- // Update min and max in one pass
111- vMinXYZ = _mm_min_ps (vMinXYZ, vPos);
112- vMaxXYZ = _mm_max_ps (vMaxXYZ, vPos);
113- }
99+ // Initialize SIMD registers for min/max with first vertex values
100+ __m128 vMinXYZ = _mm_setr_ps (p_vertices[0 ].position [0 ], p_vertices[0 ].position [1 ], p_vertices[0 ].position [2 ], FLT_MAX);
101+ __m128 vMaxXYZ = _mm_setr_ps (p_vertices[0 ].position [0 ], p_vertices[0 ].position [1 ], p_vertices[0 ].position [2 ], -FLT_MAX);
114102
115- // Calculate center = (min + max) * 0.5
116- __m128 vCenter = _mm_mul_ps (_mm_add_ps (vMinXYZ, vMaxXYZ), _mm_set1_ps (0 .5f ));
103+ // Process all vertices in one loop to find min/max
104+ for (size_t i = 1 ; i < vertexCount; ++i)
105+ {
106+ // Load vertex position directly - assumes position is aligned properly
107+ const float * posPtr = p_vertices[i].position ;
108+ __m128 vPos = _mm_loadu_ps (posPtr); // Using loadu in case it's not 16-byte aligned
117109
118- // Store center position
119- float centerArr[ 4 ] ;
120- _mm_store_ps (centerArr, vCenter );
121- auto center = OvMaths::FVector3{ centerArr[ 0 ], centerArr[ 1 ], centerArr[ 2 ] };
110+ // Update min and max in one pass
111+ vMinXYZ = _mm_min_ps (vMinXYZ, vPos) ;
112+ vMaxXYZ = _mm_max_ps (vMaxXYZ, vPos );
113+ }
122114
123- // Calculate radius - use dot product for distance calculation
124- __m128 vMaxDistSq = _mm_setzero_ps ( );
115+ // Calculate center = (min + max) * 0.5
116+ __m128 vCenter = _mm_mul_ps ( _mm_add_ps (vMinXYZ, vMaxXYZ), _mm_set1_ps ( 0 . 5f ) );
125117
126- // Pre-load center vector once outside the loop
127- const __m128 vCenterXYZ = _mm_setr_ps (
128- center.x ,
129- center.y ,
130- center.z ,
131- 0 .0f
132- );
118+ // Store center position
119+ float centerArr[4 ];
120+ _mm_store_ps (centerArr, vCenter);
121+ auto center = OvMaths::FVector3{ centerArr[0 ], centerArr[1 ], centerArr[2 ] };
133122
134- // Unroll the loop by 4 for better throughput
135- size_t i = 0 ;
136- const size_t unrollCount = vertexCount & ~3ull ; // Round down to multiple of 4
123+ // Calculate radius - use dot product for distance calculation
124+ __m128 vMaxDistSq = _mm_setzero_ps ();
137125
138- for (; i < unrollCount; i += 4 )
139- {
140- // Load 4 vertices at once
141- const float * pos0 = p_vertices[i].position ;
142- const float * pos1 = p_vertices[i + 1 ].position ;
143- const float * pos2 = p_vertices[i + 2 ].position ;
144- const float * pos3 = p_vertices[i + 3 ].position ;
145-
146- __m128 vPos0 = _mm_loadu_ps (pos0);
147- __m128 vDiff0 = _mm_sub_ps (vPos0, vCenterXYZ);
148- __m128 vDistSq0 = _mm_dp_ps (vDiff0, vDiff0, 0x77 ); // Dot product with mask 0x77 (sum xyz, store in all)
149- vMaxDistSq = _mm_max_ps (vMaxDistSq, vDistSq0);
150-
151- __m128 vPos1 = _mm_loadu_ps (pos1);
152- __m128 vDiff1 = _mm_sub_ps (vPos1, vCenterXYZ);
153- __m128 vDistSq1 = _mm_dp_ps (vDiff1, vDiff1, 0x77 );
154- vMaxDistSq = _mm_max_ps (vMaxDistSq, vDistSq1);
155-
156- __m128 vPos2 = _mm_loadu_ps (pos2);
157- __m128 vDiff2 = _mm_sub_ps (vPos2, vCenterXYZ);
158- __m128 vDistSq2 = _mm_dp_ps (vDiff2, vDiff2, 0x77 );
159- vMaxDistSq = _mm_max_ps (vMaxDistSq, vDistSq2);
160-
161- __m128 vPos3 = _mm_loadu_ps (pos3);
162- __m128 vDiff3 = _mm_sub_ps (vPos3, vCenterXYZ);
163- __m128 vDistSq3 = _mm_dp_ps (vDiff3, vDiff3, 0x77 );
164- vMaxDistSq = _mm_max_ps (vMaxDistSq, vDistSq3);
165- }
126+ // Pre-load center vector once outside the loop
127+ const __m128 vCenterXYZ = _mm_setr_ps (
128+ center.x ,
129+ center.y ,
130+ center.z ,
131+ 0 .0f
132+ );
166133
167- // Handle remaining vertices
168- for (; i < vertexCount; ++i)
169- {
170- const float * pos = p_vertices[i].position ;
171- __m128 vPos = _mm_loadu_ps (pos);
172- __m128 vDiff = _mm_sub_ps (vPos, vCenterXYZ);
173- __m128 vDistSq = _mm_dp_ps (vDiff, vDiff, 0x77 );
174- vMaxDistSq = _mm_max_ps (vMaxDistSq, vDistSq);
175- }
134+ // Unroll the loop by 4 for better throughput
135+ size_t i = 0 ;
136+ const size_t unrollCount = vertexCount & ~3ull ; // Round down to multiple of 4
176137
177- // Extract radius (sqrt of max squared distance)
178- float maxDistSq;
179- _mm_store_ss (&maxDistSq, vMaxDistSq);
180-
181- return {
182- .position = center,
183- .radius = std::sqrt (maxDistSq)
184- };
138+ for (; i < unrollCount; i += 4 )
139+ {
140+ // Load 4 vertices at once
141+ const float * pos0 = p_vertices[i].position ;
142+ const float * pos1 = p_vertices[i + 1 ].position ;
143+ const float * pos2 = p_vertices[i + 2 ].position ;
144+ const float * pos3 = p_vertices[i + 3 ].position ;
145+
146+ __m128 vPos0 = _mm_loadu_ps (pos0);
147+ __m128 vDiff0 = _mm_sub_ps (vPos0, vCenterXYZ);
148+ __m128 vDistSq0 = _mm_dp_ps (vDiff0, vDiff0, 0x77 ); // Dot product with mask 0x77 (sum xyz, store in all)
149+ vMaxDistSq = _mm_max_ps (vMaxDistSq, vDistSq0);
150+
151+ __m128 vPos1 = _mm_loadu_ps (pos1);
152+ __m128 vDiff1 = _mm_sub_ps (vPos1, vCenterXYZ);
153+ __m128 vDistSq1 = _mm_dp_ps (vDiff1, vDiff1, 0x77 );
154+ vMaxDistSq = _mm_max_ps (vMaxDistSq, vDistSq1);
155+
156+ __m128 vPos2 = _mm_loadu_ps (pos2);
157+ __m128 vDiff2 = _mm_sub_ps (vPos2, vCenterXYZ);
158+ __m128 vDistSq2 = _mm_dp_ps (vDiff2, vDiff2, 0x77 );
159+ vMaxDistSq = _mm_max_ps (vMaxDistSq, vDistSq2);
160+
161+ __m128 vPos3 = _mm_loadu_ps (pos3);
162+ __m128 vDiff3 = _mm_sub_ps (vPos3, vCenterXYZ);
163+ __m128 vDistSq3 = _mm_dp_ps (vDiff3, vDiff3, 0x77 );
164+ vMaxDistSq = _mm_max_ps (vMaxDistSq, vDistSq3);
185165 }
186166
187- OvRendering::Geometry::BoundingSphere ComputeBoundingSphereRegular (std::span<const OvRendering::Geometry::Vertex> p_vertices)
167+ // Handle remaining vertices
168+ for (; i < vertexCount; ++i)
188169 {
189- auto result = OvRendering::Geometry::BoundingSphere{
190- .position = OvMaths::FVector3::Zero,
191- .radius = 0 .0f
192- };
193-
194- if (!p_vertices.empty ())
195- {
196- float minX = std::numeric_limits<float >::max ();
197- float minY = std::numeric_limits<float >::max ();
198- float minZ = std::numeric_limits<float >::max ();
199-
200- float maxX = std::numeric_limits<float >::min ();
201- float maxY = std::numeric_limits<float >::min ();
202- float maxZ = std::numeric_limits<float >::min ();
203-
204- for (const auto & vertex : p_vertices)
205- {
206- minX = std::min (minX, vertex.position [0 ]);
207- minY = std::min (minY, vertex.position [1 ]);
208- minZ = std::min (minZ, vertex.position [2 ]);
209-
210- maxX = std::max (maxX, vertex.position [0 ]);
211- maxY = std::max (maxY, vertex.position [1 ]);
212- maxZ = std::max (maxZ, vertex.position [2 ]);
213- }
214-
215- result.position = OvMaths::FVector3{ minX + maxX, minY + maxY, minZ + maxZ } / 2 .0f ;
216-
217- for (const auto & vertex : p_vertices)
218- {
219- const auto & position = reinterpret_cast <const OvMaths::FVector3&>(vertex.position );
220- result.radius = std::max (result.radius , OvMaths::FVector3::Distance (result.position , position));
221- }
222- }
223-
224- return result;
170+ const float * pos = p_vertices[i].position ;
171+ __m128 vPos = _mm_loadu_ps (pos);
172+ __m128 vDiff = _mm_sub_ps (vPos, vCenterXYZ);
173+ __m128 vDistSq = _mm_dp_ps (vDiff, vDiff, 0x77 );
174+ vMaxDistSq = _mm_max_ps (vMaxDistSq, vDistSq);
225175 }
226- }
227176
228- void OvRendering::Resources::Mesh::ComputeBoundingSphere (std::span< const Geometry::Vertex> p_vertices )
229- {
230- constexpr bool useSIMD = true ;
177+ // Extract radius (sqrt of max squared distance )
178+ float maxDistSq;
179+ _mm_store_ss (&maxDistSq, vMaxDistSq) ;
231180
232- if constexpr (useSIMD)
233- {
234- m_boundingSphere = ComputeBoundingSphereSIMD (p_vertices);
235- }
236- else
237- {
238- m_boundingSphere = ComputeBoundingSphereRegular (p_vertices);
239- }
181+ m_boundingSphere = {
182+ .position = center,
183+ .radius = std::sqrt (maxDistSq)
184+ };
240185}
0 commit comments