@@ -530,7 +530,7 @@ void dxt_hc::determine_color_endpoint_codebook_task(uint64 data, void*) {
530530 uint b = blocks[i];
531531 uint weight = (uint)(math::clamp<uint>(endpoint_weight * m_block_weights[b], 1 , 2048 ) * encoding_weight[m_block_encodings[b]]);
532532 uint32 selector = 0 ;
533- for (uint sh = 0 , p = 0 ; p < 16 ; p++, sh += 2 ) {
533+ for (uint p = 0 ; p < 16 ; p++) {
534534 uint error_best = cUINT32_MAX;
535535 uint8 s_best = 0 ;
536536 for (uint8 t = 0 ; t < 4 ; t++) {
@@ -541,9 +541,9 @@ void dxt_hc::determine_color_endpoint_codebook_task(uint64 data, void*) {
541541 error_best = error;
542542 }
543543 }
544- selector |= s_best << sh ;
544+ selector = selector << 2 | s_best ;
545545 }
546- m_block_selectors[cColor][b] = selector | (uint64)weight << 32 ;
546+ m_block_selectors[cColor][b] = (uint64)selector << 32 | weight ;
547547 }
548548
549549 dxt_endpoint_refiner::params refinerParams;
@@ -609,7 +609,7 @@ void dxt_hc::determine_color_endpoint_codebook_task_etc(uint64 data, void*) {
609609 uint b = blocks[i];
610610 uint weight = (uint)(math::clamp<uint>(0x8000 * endpoint_weight * m_block_weights[b] * (m_block_encodings[b] ? 0 .972f : 1 .0f ), 1 , 0xFFFF ));
611611 uint32 selector = 0 ;
612- for (uint sh = 0 , p = 0 ; p < 8 ; p++, sh += 2 ) {
612+ for (uint p = 0 ; p < 8 ; p++) {
613613 uint error_best = cUINT32_MAX;
614614 uint8 s_best = 0 ;
615615 for (uint8 s = 0 ; s < 4 ; s++) {
@@ -619,9 +619,9 @@ void dxt_hc::determine_color_endpoint_codebook_task_etc(uint64 data, void*) {
619619 error_best = error;
620620 }
621621 }
622- selector |= s_best << sh ;
622+ selector = selector << 2 | s_best ;
623623 }
624- m_block_selectors[cColor][b] = selector | (uint64)weight << 32 ;
624+ m_block_selectors[cColor][b] = (uint64)selector << ((b & 1 ) ? 32 : 48 ) | weight ;
625625 }
626626 }
627627 }
@@ -663,13 +663,59 @@ void dxt_hc::determine_color_endpoint_clusters_task(uint64 data, void* pData_ptr
663663}
664664
665665void dxt_hc::determine_color_endpoints () {
666- tree_clusterizer<vec6F> vq;
666+ uint num_tasks = m_pTask_pool->get_num_threads () + 1 ;
667+ crnlib::vector<std::pair<vec6F, uint> > endpoints;
667668 for (uint t = 0 ; t < m_tiles.size (); t++) {
668669 if (m_tiles[t].pixels .size ())
669- vq. add_training_vec ( m_tiles[t].color_endpoint , (uint)(m_tiles[t].pixels .size () * m_tiles[t].weight ));
670+ endpoints. push_back ( std::make_pair ( m_tiles[t].color_endpoint , (uint)(m_tiles[t].pixels .size () * m_tiles[t].weight ) ));
670671 }
671672
672- vq.generate_codebook (math::minimum<uint>(m_num_tiles, m_params.m_color_endpoint_codebook_size ), true , m_pTask_pool);
673+ struct Node {
674+ std::pair<vec6F, uint> *p, *pEnd;
675+ Node (std::pair<vec6F, uint>* begin, std::pair<vec6F, uint>* end) : p(begin), pEnd(end) {}
676+ bool operator <(const Node& other) const { return *p > *other.p ; }
677+ static void sort_task (uint64 data, void * ptr) { std::sort (((Node*)ptr)->p , ((Node*)ptr)->pEnd ); }
678+ };
679+
680+ crnlib::vector<Node> nodes;
681+ Node node (0 , endpoints.get_ptr ());
682+ for (uint i = 0 ; i < num_tasks; i++) {
683+ node.p = node.pEnd ;
684+ node.pEnd = endpoints.get_ptr () + endpoints.size () * (i + 1 ) / num_tasks;
685+ if (node.p != node.pEnd )
686+ nodes.push_back (node);
687+ }
688+
689+ for (uint i = 0 ; i < nodes.size (); i++)
690+ m_pTask_pool->queue_task (&Node::sort_task, i, &nodes[i]);
691+ m_pTask_pool->join ();
692+
693+ std::priority_queue<Node> queue;
694+ for (uint i = 0 ; i < nodes.size (); i++)
695+ queue.push (nodes[i]);
696+
697+ crnlib::vector<vec6F> vectors;
698+ crnlib::vector<uint> weights;
699+ vectors.reserve (endpoints.size ());
700+ weights.reserve (endpoints.size ());
701+ while (queue.size ()) {
702+ Node node = queue.top ();
703+ std::pair<vec6F, uint>* endpoint = node.p ++;
704+ queue.pop ();
705+ if (node.p != node.pEnd )
706+ queue.push (node);
707+ if (!vectors.size () || endpoint->first != vectors.back ()) {
708+ vectors.push_back (endpoint->first );
709+ weights.push_back (endpoint->second );
710+ } else if (weights.back () > UINT_MAX - endpoint->second ) {
711+ weights.back () = UINT_MAX;
712+ } else {
713+ weights.back () += endpoint->second ;
714+ }
715+ }
716+
717+ tree_clusterizer<vec6F> vq;
718+ vq.generate_codebook (vectors.get_ptr (), weights.get_ptr (), vectors.size (), math::minimum<uint>(m_num_tiles, m_params.m_color_endpoint_codebook_size ), true , m_pTask_pool);
673719 m_color_clusters.resize (vq.get_codebook_size ());
674720
675721 for (uint i = 0 ; i <= m_pTask_pool->get_num_threads (); i++)
@@ -757,7 +803,7 @@ void dxt_hc::determine_alpha_endpoint_codebook_task(uint64 data, void*) {
757803 uint b = blocks[i];
758804 uint weight = encoding_weight[m_block_encodings[b]];
759805 uint64 selector = 0 ;
760- for (uint sh = 0 , p = 0 ; p < 16 ; p++, sh += 3 ) {
806+ for (uint p = 0 ; p < 16 ; p++) {
761807 uint error_best = cUINT32_MAX;
762808 uint8 s_best = 0 ;
763809 for (uint8 t = 0 ; t < 8 ; t++) {
@@ -769,9 +815,9 @@ void dxt_hc::determine_alpha_endpoint_codebook_task(uint64 data, void*) {
769815 error_best = error;
770816 }
771817 }
772- selector |= (uint64)s_best << sh ;
818+ selector = selector << 3 | s_best ;
773819 }
774- m_block_selectors[cAlpha0 + a][b] = selector | (uint64)weight << 48 ;
820+ m_block_selectors[cAlpha0 + a][b] = selector << 16 | weight ;
775821 }
776822 }
777823
@@ -823,18 +869,64 @@ void dxt_hc::determine_alpha_endpoint_clusters_task(uint64 data, void* pData_ptr
823869}
824870
// Prepares the training set for the alpha endpoint codebook and runs the
// alpha endpoint cluster/codebook tasks on the task pool.
// In the version introduced by the '+' lines of this diff, the (endpoint, weight)
// pairs are first collected, sorted in parallel chunks, merged in ascending order
// with equal endpoints collapsed into one weighted vector, and only then passed
// to tree_clusterizer::generate_codebook. The '-' lines show the previous version,
// which added every training vector to the clusterizer one by one.
825871void dxt_hc::determine_alpha_endpoints () {
826- tree_clusterizer<vec2F> vq;
// Number of chunks/tasks: pool thread count plus one
// (presumably the extra one accounts for the calling thread - confirm).
872+ uint num_tasks = m_pTask_pool->get_num_threads () + 1 ;
873+ crnlib::vector<std::pair<vec2F, uint> > endpoints;
// Gather one (alpha endpoint, pixel count) pair per alpha block for every tile
// that has at least one pixel; the pixel count is used as the weight.
827874 for (uint a = 0 ; a < m_num_alpha_blocks; a++) {
828875 for (uint t = 0 ; t < m_tiles.size (); t++) {
829876 if (m_tiles[t].pixels .size ())
830- vq. add_training_vec ( m_tiles[t].alpha_endpoints [a], m_tiles[t].pixels .size ());
877+ endpoints. push_back ( std::make_pair ( m_tiles[t].alpha_endpoints [a], m_tiles[t].pixels .size () ));
831878 }
832879 }
833880
834- vq.generate_codebook (math::minimum<uint>(m_num_tiles, m_params.m_alpha_endpoint_codebook_size ), false , m_pTask_pool);
// Cursor over one chunk [p, pEnd) of the endpoints array.
// operator< is intentionally inverted (it uses '>') so that std::priority_queue,
// which keeps its greatest element on top, yields the node whose current pair
// *p is the smallest. sort_task sorts the chunk in ascending order; its 'data'
// argument is unused and 'ptr' must point at a Node.
881+ struct Node {
882+ std::pair<vec2F, uint> *p, *pEnd;
883+ Node (std::pair<vec2F, uint>* begin, std::pair<vec2F, uint>* end) : p(begin), pEnd(end) {}
884+ bool operator <(const Node& other) const { return *p > *other.p ; }
885+ static void sort_task (uint64 data, void * ptr) { std::sort (((Node*)ptr)->p , ((Node*)ptr)->pEnd ); }
886+ };
887+
// Split the array into at most num_tasks contiguous chunks. The seed node starts
// with pEnd at the beginning of the array, so the first chunk begins there; each
// chunk starts where the previous one ended, and empty chunks are dropped.
888+ crnlib::vector<Node> nodes;
889+ Node node (0 , endpoints.get_ptr ());
890+ for (uint i = 0 ; i < num_tasks; i++) {
891+ node.p = node.pEnd ;
892+ node.pEnd = endpoints.get_ptr () + endpoints.size () * (i + 1 ) / num_tasks;
893+ if (node.p != node.pEnd )
894+ nodes.push_back (node);
895+ }
896+
// Sort every chunk via the task pool and wait for all sort tasks to finish.
897+ for (uint i = 0 ; i < nodes.size (); i++)
898+ m_pTask_pool->queue_task (&Node::sort_task, i, &nodes[i]);
899+ m_pTask_pool->join ();
900+
// Seed the k-way merge with the head of every sorted chunk.
901+ std::priority_queue<Node> queue;
902+ for (uint i = 0 ; i < nodes.size (); i++)
903+ queue.push (nodes[i]);
904+
905+ crnlib::vector<vec2F> vectors;
906+ crnlib::vector<uint> weights;
907+ vectors.reserve (endpoints.size ());
908+ weights.reserve (endpoints.size ());
// Merge: repeatedly take the smallest remaining pair, advance its chunk cursor and
// re-queue the chunk if it still has elements. A pair whose endpoint equals the
// last emitted vector only adds its weight to that entry; the sum saturates at
// UINT_MAX instead of wrapping around.
909+ while (queue.size ()) {
910+ Node node = queue.top ();
911+ std::pair<vec2F, uint>* endpoint = node.p ++;
912+ queue.pop ();
913+ if (node.p != node.pEnd )
914+ queue.push (node);
915+ if (!vectors.size () || endpoint->first != vectors.back ()) {
916+ vectors.push_back (endpoint->first );
917+ weights.push_back (endpoint->second );
918+ } else if (weights.back () > UINT_MAX - endpoint->second ) {
919+ weights.back () = UINT_MAX;
920+ } else {
921+ weights.back () += endpoint->second ;
922+ }
923+ }
924+
// Build the codebook from the de-duplicated vectors; the requested size is the
// smaller of m_num_tiles and m_params.m_alpha_endpoint_codebook_size.
925+ tree_clusterizer<vec2F> vq;
926+ vq.generate_codebook (vectors.get_ptr (), weights.get_ptr (), vectors.size (), math::minimum<uint>(m_num_tiles, m_params.m_alpha_endpoint_codebook_size ), false , m_pTask_pool);
835927 m_alpha_clusters.resize (vq.get_codebook_size ());
836928
// Queue num_tasks cluster tasks (task index i, clusterizer passed as user data)
// and wait for them.
837- for (uint i = 0 ; i <= m_pTask_pool-> get_num_threads () ; i++)
929+ for (uint i = 0 ; i < num_tasks ; i++)
838930 m_pTask_pool->queue_object_task (this , &dxt_hc::determine_alpha_endpoint_clusters_task, i, &vq);
839931 m_pTask_pool->join ();
840932
// Diff hunk boundary: the function's lines between the two hunks are not shown here.
@@ -859,7 +951,7 @@ void dxt_hc::determine_alpha_endpoints() {
859951 }
860952 }
861953
// Queue num_tasks codebook tasks (no user data) and wait for them.
862- for (uint i = 0 ; i <= m_pTask_pool-> get_num_threads () ; i++)
954+ for (uint i = 0 ; i < num_tasks ; i++)
863955 m_pTask_pool->queue_object_task (this , &dxt_hc::determine_alpha_endpoint_codebook_task, i, NULL );
864956 m_pTask_pool->join ();
865957}
@@ -911,16 +1003,68 @@ void dxt_hc::create_color_selector_codebook_task(uint64 data, void* pData_ptr) {
9111003 }
9121004}
9131005
// Cursor over one contiguous run [p, pEnd) of 64-bit selector words.
// Used both as the argument of a parallel sort task and as the element type of a
// std::priority_queue that merges the sorted runs.
1006+ struct SelectorNode {
// p is the current position inside the run, pEnd is one past its last element.
1007+ uint64 *p, *pEnd;
1008+ SelectorNode (uint64* begin, uint64* end) : p(begin), pEnd(end) {}
// Intentionally inverted (uses '>'): std::priority_queue keeps its greatest element
// on top, so with this ordering the top node is the one whose current word *p is
// the smallest. Dereferences p, so both nodes must be non-empty (p != pEnd).
1009+ bool operator <(const SelectorNode& other) const { return *p > *other.p ; }
// Task callback: sorts the run in ascending order. 'data' is unused;
// 'ptr' must point at a SelectorNode.
1010+ static void sort_task (uint64 data, void * ptr) { std::sort (((SelectorNode*)ptr)->p , ((SelectorNode*)ptr)->pEnd ); }
1011+ };
1012+
9141013void dxt_hc::create_color_selector_codebook () {
915- tree_clusterizer<vec16F> selector_vq;
916- vec16F v;
917- for (uint n = m_has_etc_color_blocks ? m_num_blocks >> 1 : m_num_blocks, b = 0 ; b < n; b++) {
918- uint64 selector = m_has_etc_color_blocks ? m_block_selectors[cColor][b << 1 ] | m_block_selectors[cColor][b << 1 | 1 ] << 16 : m_block_selectors[cColor][b];
919- for (uint8 p = 0 ; p < 16 ; p++, selector >>= 2 )
920- v[p] = ((selector & 3 ) + 0 .5f ) * 0 .25f ;
921- selector_vq.add_training_vec (v, m_has_etc_color_blocks ? (selector & 0xFFFF ) + (selector >> 16 ) : selector);
1014+ uint num_tasks = m_pTask_pool->get_num_threads () + 1 ;
1015+ crnlib::vector<uint64> selectors (m_has_etc_color_blocks ? m_num_blocks >> 1 : m_num_blocks);
1016+ for (uint i = 0 , b = 0 , step = m_has_etc_color_blocks ? 2 : 1 ; b < m_num_blocks; b += step)
1017+ selectors[i++] = m_block_selectors[cColor][b] + (m_has_etc_color_blocks ? m_block_selectors[cColor][b + 1 ] : 0 );
1018+
1019+ crnlib::vector<SelectorNode> nodes;
1020+ SelectorNode node (0 , selectors.get_ptr ());
1021+ for (uint i = 0 ; i < num_tasks; i++) {
1022+ node.p = node.pEnd ;
1023+ node.pEnd = selectors.get_ptr () + selectors.size () * (i + 1 ) / num_tasks;
1024+ if (node.p != node.pEnd )
1025+ nodes.push_back (node);
9221026 }
923- selector_vq.generate_codebook (m_params.m_color_selector_codebook_size , false , m_pTask_pool);
1027+
1028+ for (uint i = 0 ; i < nodes.size (); i++)
1029+ m_pTask_pool->queue_task (&SelectorNode::sort_task, i, &nodes[i]);
1030+ m_pTask_pool->join ();
1031+
1032+ std::priority_queue<SelectorNode> queue;
1033+ for (uint i = 0 ; i < nodes.size (); i++)
1034+ queue.push (nodes[i]);
1035+
1036+ float v[4 ];
1037+ for (uint s = 0 ; s < 4 ; s++)
1038+ v[s] = (s + 0 .5f ) * 0 .25f ;
1039+
1040+ crnlib::vector<vec16F> vectors;
1041+ crnlib::vector<uint> weights;
1042+ vectors.reserve (selectors.size ());
1043+ weights.reserve (selectors.size ());
1044+ for (uint64 prev_selector = 0 ; queue.size ();) {
1045+ SelectorNode node = queue.top ();
1046+ uint64 selector = *node.p ++;
1047+ queue.pop ();
1048+ if (node.p != node.pEnd )
1049+ queue.push (node);
1050+ uint weight = (uint)selector;
1051+ selector >>= 32 ;
1052+ if (!vectors.size () || selector != prev_selector) {
1053+ prev_selector = selector;
1054+ vec16F vector;
1055+ for (uint p = 0 ; p < 16 ; p++, selector >>= 2 )
1056+ vector[15 - p] = v[selector & 3 ];
1057+ vectors.push_back (vector);
1058+ weights.push_back (weight);
1059+ } else if (weights.back () > UINT_MAX - weight) {
1060+ weights.back () = UINT_MAX;
1061+ } else {
1062+ weights.back () += weight;
1063+ }
1064+ }
1065+
1066+ tree_clusterizer<vec16F> selector_vq;
1067+ selector_vq.generate_codebook (vectors.get_ptr (), weights.get_ptr (), vectors.size (), m_params.m_color_selector_codebook_size , false , m_pTask_pool);
9241068 m_color_selectors.resize (selector_vq.get_codebook_size ());
9251069 m_color_selectors_used.resize (selector_vq.get_codebook_size ());
9261070 for (uint i = 0 ; i < selector_vq.get_codebook_size (); i++) {
@@ -930,7 +1074,6 @@ void dxt_hc::create_color_selector_codebook() {
9301074 m_color_selectors[i] |= (uint)(v[j] * 4 .0f ) << sh;
9311075 }
9321076
933- uint num_tasks = m_pTask_pool->get_num_threads () + 1 ;
9341077 crnlib::vector<crnlib::vector<color_selector_details> > selector_details (num_tasks);
9351078 for (uint t = 0 ; t < num_tasks; t++) {
9361079 selector_details[t].resize (m_color_selectors.size ());
@@ -1024,17 +1167,62 @@ void dxt_hc::create_alpha_selector_codebook_task(uint64 data, void* pData_ptr) {
10241167}
10251168
10261169void dxt_hc::create_alpha_selector_codebook () {
1027- tree_clusterizer<vec16F> selector_vq;
1028- vec16F v;
1029- for (uint c = cAlpha0; c < cAlpha0 + m_num_alpha_blocks; c++) {
1030- for (uint b = 0 ; b < m_num_blocks; b += m_has_etc_color_blocks ? 2 : 1 ) {
1031- uint64 selector = m_block_selectors[c][b];
1032- for (uint8 p = 0 ; p < 16 ; p++, selector >>= 3 )
1033- v[p] = ((selector & 7 ) + 0 .5f ) * 0 .125f ;
1034- selector_vq.add_training_vec (v, selector);
1170+ uint num_tasks = m_pTask_pool->get_num_threads () + 1 ;
1171+ crnlib::vector<uint64> selectors (m_num_alpha_blocks * (m_has_etc_color_blocks ? m_num_blocks >> 1 : m_num_blocks));
1172+ for (uint i = 0 , c = cAlpha0; c < cAlpha0 + m_num_alpha_blocks; c++) {
1173+ for (uint b = 0 , step = m_has_etc_color_blocks ? 2 : 1 ; b < m_num_blocks; b += step)
1174+ selectors[i++] = m_block_selectors[c][b];
1175+ }
1176+
1177+ crnlib::vector<SelectorNode> nodes;
1178+ SelectorNode node (0 , selectors.get_ptr ());
1179+ for (uint i = 0 ; i < num_tasks; i++) {
1180+ node.p = node.pEnd ;
1181+ node.pEnd = selectors.get_ptr () + selectors.size () * (i + 1 ) / num_tasks;
1182+ if (node.p != node.pEnd )
1183+ nodes.push_back (node);
1184+ }
1185+
1186+ for (uint i = 0 ; i < nodes.size (); i++)
1187+ m_pTask_pool->queue_task (&SelectorNode::sort_task, i, &nodes[i]);
1188+ m_pTask_pool->join ();
1189+
1190+ std::priority_queue<SelectorNode> queue;
1191+ for (uint i = 0 ; i < nodes.size (); i++)
1192+ queue.push (nodes[i]);
1193+
1194+ float v[8 ];
1195+ for (uint s = 0 ; s < 8 ; s++)
1196+ v[s] = (s + 0 .5f ) * 0 .125f ;
1197+
1198+ crnlib::vector<vec16F> vectors;
1199+ crnlib::vector<uint> weights;
1200+ vectors.reserve (selectors.size ());
1201+ weights.reserve (selectors.size ());
1202+ for (uint64 prev_selector = 0 ; queue.size ();) {
1203+ SelectorNode node = queue.top ();
1204+ uint64 selector = *node.p ++;
1205+ queue.pop ();
1206+ if (node.p != node.pEnd )
1207+ queue.push (node);
1208+ uint weight = (uint16)selector;
1209+ selector >>= 16 ;
1210+ if (!vectors.size () || selector != prev_selector) {
1211+ prev_selector = selector;
1212+ vec16F vector;
1213+ for (uint p = 0 ; p < 16 ; p++, selector >>= 3 )
1214+ vector[15 - p] = v[selector & 7 ];
1215+ vectors.push_back (vector);
1216+ weights.push_back (weight);
1217+ } else if (weights.back () > UINT_MAX - weight) {
1218+ weights.back () = UINT_MAX;
1219+ } else {
1220+ weights.back () += weight;
10351221 }
10361222 }
1037- selector_vq.generate_codebook (m_params.m_alpha_selector_codebook_size , false , m_pTask_pool);
1223+
1224+ tree_clusterizer<vec16F> selector_vq;
1225+ selector_vq.generate_codebook (vectors.get_ptr (), weights.get_ptr (), vectors.size (), m_params.m_alpha_selector_codebook_size , false , m_pTask_pool);
10381226 m_alpha_selectors.resize (selector_vq.get_codebook_size ());
10391227 m_alpha_selectors_used.resize (selector_vq.get_codebook_size ());
10401228 for (uint i = 0 ; i < selector_vq.get_codebook_size (); i++) {
@@ -1044,7 +1232,6 @@ void dxt_hc::create_alpha_selector_codebook() {
10441232 m_alpha_selectors[i] |= (uint64)(v[j] * 8 .0f ) << sh;
10451233 }
10461234
1047- uint num_tasks = m_pTask_pool->get_num_threads () + 1 ;
10481235 crnlib::vector<crnlib::vector<alpha_selector_details> > selector_details (num_tasks);
10491236 for (uint t = 0 ; t < num_tasks; t++) {
10501237 selector_details[t].resize (m_alpha_selectors.size ());
0 commit comments