Skip to content

Commit 21eb70b

Browse files
Optimize tile computation step
This change improves the compression speed for both DXT and ETC encodings. Explanation: In the tile computation step, pixels within the tiling area are palettized using a general purpose tree clusterization algorithm. At the same time, clusterization of the tile pixels is always performed with the following restrictions: the maximum number of palettized pixels is 64, the maximum number of clusters is 2. The performance can therefore be improved by solving the palettizing task with a specialized version of the tree clusterizer, which does not maintain the tree structure and uses constant memory. DXT Testing: The modified algorithm has been tested on the Kodak test set using a 64-bit build with default settings (running on Windows 10, i7-4790, 3.6GHz). All the decompressed test images are identical to the images compressed and decompressed using the original version of Crunch (revision ea9b8d8). [Compressing Kodak set without mipmaps using DXT1 encoding] Original: 1582222 bytes / 28.863 sec Modified: 1468204 bytes / 5.726 sec Improvement: 7.21% (compression ratio) / 80.16% (compression time) [Compressing Kodak set with mipmaps using DXT1 encoding] Original: 2065243 bytes / 36.950 sec Modified: 1914805 bytes / 7.683 sec Improvement: 7.28% (compression ratio) / 79.21% (compression time) ETC Testing: The modified algorithm has been tested on the Kodak test set using a 64-bit build with default settings (running on Windows 10, i7-4790, 3.6GHz). The ETC1 quantization parameters have been selected so that ETC1 compression gives approximately the same average Luma PSNR as the corresponding DXT1 compression (which is equal to 34.044 dB for the Kodak test set compressed without mipmaps using DXT1 encoding and default quality settings). [Compressing Kodak set without mipmaps using ETC1 encoding] Total size: 1607858 bytes Total time: 13.071 sec Average bitrate: 1.363 bpp Average Luma PSNR: 34.050 dB
1 parent b8b456d commit 21eb70b

File tree

4 files changed

+203
-65
lines changed

4 files changed

+203
-65
lines changed

bin/crunch_x64.exe

-16.5 KB
Binary file not shown.

crnlib/crn_dxt_hc.cpp

Lines changed: 56 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,56 @@ bool dxt_hc::compress(
231231
return true;
232232
}
233233

234+
// Reduces the colors of up to 64 pixels to two representative color endpoints.
//
// pixels       - pixels to palettize; components 0..2 of each pixel are read.
// pixels_count - number of entries in `pixels`. The scratch buffers hold 64
//                entries, so anything beyond that is ignored instead of
//                overrunning the stack.
//
// Returns six floats: the three components of the endpoint with the smaller
// vector length, followed by the three components of the other endpoint.
// When m_params.m_perceptual is set, component 0 is scaled by 0.5 and
// component 2 by 0.25 before clustering (component 1 is left unscaled).
vec6F dxt_hc::palettize_color(color_quad_u8* pixels, uint pixels_count) {
  const uint cMaxPixels = 64;
  if (pixels_count > cMaxPixels)
    pixels_count = cMaxPixels;

  // Pack each pixel into a single integer so that sorting groups identical colors together.
  uint color[cMaxPixels];
  for (uint i = 0; i < pixels_count; i++)
    color[i] = (pixels[i][0] << 16) | (pixels[i][1] << 8) | pixels[i][2];
  std::sort(color, color + pixels_count);

  // Collapse runs of equal colors into one vector whose weight is the run length.
  const bool perceptual = m_params.m_perceptual;
  vec3F vectors[cMaxPixels];
  uint weights[cMaxPixels];
  uint size = 0;
  for (uint i = 0; i < pixels_count; i++) {
    if (!i || color[i] != color[i - 1]) {
      const float c0 = m_uint8_to_float[color[i] >> 16];
      const float c1 = m_uint8_to_float[color[i] >> 8 & 0xFF];
      const float c2 = m_uint8_to_float[color[i] & 0xFF];
      vectors[size][0] = perceptual ? c0 * 0.5f : c0;
      vectors[size][1] = c1;
      vectors[size][2] = perceptual ? c2 * 0.25f : c2;
      weights[size] = 1;
      size++;
    } else {
      weights[size - 1]++;
    }
  }

  vec3F result[2];
  split_vectors<vec3F>(vectors, weights, size, result);
  if (result[0].length() > result[1].length())
    utils::swap(result[0], result[1]);

  // Copy component by component: reinterpreting vec3F[2] as a vec6F would
  // access the array through an unrelated class type.
  vec6F endpoints;
  for (uint c = 0; c < 3; c++) {
    endpoints[c] = result[0][c];
    endpoints[c + 3] = result[1][c];
  }
  return endpoints;
}
259+
260+
// Reduces one 8-bit component of up to 64 pixels to two representative values.
//
// pixels       - pixels to palettize.
// pixels_count - number of entries in `pixels`. The scratch buffers hold 64
//                entries, so anything beyond that is ignored instead of
//                overrunning the stack.
// comp_index   - index of the pixel component to read.
//
// Returns the two cluster values ordered so that element 0 does not compare
// greater than element 1.
vec2F dxt_hc::palettize_alpha(color_quad_u8* pixels, uint pixels_count, uint comp_index) {
  const uint cMaxPixels = 64;
  if (pixels_count > cMaxPixels)
    pixels_count = cMaxPixels;

  // Sort the component values so that equal values become adjacent.
  uint8 alpha[cMaxPixels];
  for (uint p = 0; p < pixels_count; p++)
    alpha[p] = pixels[p][comp_index];
  std::sort(alpha, alpha + pixels_count);

  // Collapse runs of equal values into one vector whose weight is the run length.
  vec1F vectors[cMaxPixels];
  uint weights[cMaxPixels];
  uint size = 0;
  for (uint i = 0; i < pixels_count; i++) {
    if (!i || alpha[i] != alpha[i - 1]) {
      vectors[size][0] = m_uint8_to_float[alpha[i]];
      weights[size] = 1;
      size++;
    } else {
      weights[size - 1]++;
    }
  }

  vec1F result[2];
  split_vectors<vec1F>(vectors, weights, size, result);
  if (result[0] > result[1])
    utils::swap(result[0], result[1]);

  // Copy the two scalars explicitly: reinterpreting vec1F[2] as a vec2F would
  // access the array through an unrelated class type.
  vec2F endpoints;
  endpoints[0] = result[0][0];
  endpoints[1] = result[1][0];
  return endpoints;
}
283+
234284
void dxt_hc::determine_tiles_task(uint64 data, void*) {
235285
uint num_tasks = m_pTask_pool->get_num_threads() + 1;
236286
uint offsets[9] = {0, 16, 32, 48, 0, 32, 64, 96, 64};
@@ -239,8 +289,6 @@ void dxt_hc::determine_tiles_task(uint64 data, void*) {
239289
uint8 selectors[64];
240290
uint tile_error[3][9];
241291
uint total_error[3][8];
242-
tree_clusterizer<vec3F> color_palettizer;
243-
tree_clusterizer<vec1F> alpha_palettizer;
244292

245293
for (uint level = 0; level < m_params.m_num_levels; level++) {
246294
float weight = m_params.m_levels[level].m_weight;
@@ -335,33 +383,10 @@ void dxt_hc::determine_tiles_task(uint64 data, void*) {
335383
uint t = tiles[best_encoding][tile_index];
336384
tile.pixels.append(tilePixels + offsets[t], 16 << (t >> 2));
337385
tile.weight = weight;
338-
339-
if (m_has_color_blocks) {
340-
color_palettizer.clear();
341-
for (uint p = 0; p < tile.pixels.size(); p++) {
342-
const color_quad_u8& pixel = tile.pixels[p];
343-
vec3F v(m_uint8_to_float[pixel[0]], m_uint8_to_float[pixel[1]], m_uint8_to_float[pixel[2]]);
344-
color_palettizer.add_training_vec(m_params.m_perceptual ? vec3F(v[0] * 0.5f, v[1], v[2] * 0.25f): v, 1);
345-
}
346-
color_palettizer.generate_codebook(2);
347-
bool single = color_palettizer.get_codebook_size() == 1;
348-
bool reorder = !single && color_palettizer.get_codebook_entry(0).length() > color_palettizer.get_codebook_entry(1).length();
349-
for (uint t = 0, i = 0; i < 2; i++) {
350-
vec3F v = color_palettizer.get_codebook_entry(single ? 0 : reorder ? 1 - i : i);
351-
for (uint c = 0; c < 3; c++, t++)
352-
tile.color_endpoint[t] = v[c];
353-
}
354-
}
355-
356-
for (uint a = 0; a < m_num_alpha_blocks; a++) {
357-
alpha_palettizer.clear();
358-
for (uint c = m_params.m_alpha_component_indices[a], p = 0; p < tile.pixels.size(); p++)
359-
alpha_palettizer.add_training_vec(vec1F(m_uint8_to_float[tile.pixels[p][c]]), 1);
360-
alpha_palettizer.generate_codebook(2);
361-
float v[2] = {alpha_palettizer.get_codebook_entry(0)[0], alpha_palettizer.get_codebook_entry(alpha_palettizer.get_codebook_size() - 1)[0]};
362-
tile.alpha_endpoints[a][0] = math::minimum(v[0], v[1]);
363-
tile.alpha_endpoints[a][1] = math::maximum(v[0], v[1]);
364-
}
386+
if (m_has_color_blocks)
387+
tile.color_endpoint = palettize_color(tile.pixels.get_ptr(), tile.pixels.size());
388+
for (uint a = 0; a < m_num_alpha_blocks; a++)
389+
tile.alpha_endpoints[a] = palettize_alpha(tile.pixels.get_ptr(), tile.pixels.size(), m_params.m_alpha_component_indices[a]);
365390
}
366391

367392
for (uint by = 0; by < 2; by++) {
@@ -385,8 +410,6 @@ void dxt_hc::determine_tiles_task_etc(uint64 data, void*) {
385410
uint8 selectors[32];
386411
uint tile_error[5];
387412
uint total_error[3];
388-
tree_clusterizer<vec3F> color_palettizer;
389-
tree_clusterizer<vec1F> alpha_palettizer;
390413

391414
etc1_optimizer optimizer;
392415
etc1_optimizer::params params;
@@ -438,36 +461,13 @@ void dxt_hc::determine_tiles_task_etc(uint64 data, void*) {
438461
}
439462
}
440463

441-
vec2F alpha_endpoints;
442-
if (m_num_alpha_blocks) {
443-
alpha_palettizer.clear();
444-
for (uint p = 0; p < 16; p++)
445-
alpha_palettizer.add_training_vec(vec1F(m_uint8_to_float[tilePixels[p].a]), 1);
446-
alpha_palettizer.generate_codebook(2);
447-
float v[2] = {alpha_palettizer.get_codebook_entry(0)[0], alpha_palettizer.get_codebook_entry(alpha_palettizer.get_codebook_size() - 1)[0]};
448-
alpha_endpoints[0] = math::minimum(v[0], v[1]);
449-
alpha_endpoints[1] = math::maximum(v[0], v[1]);
450-
}
451-
464+
vec2F alpha_endpoints = m_num_alpha_blocks ? palettize_alpha(tilePixels, 16, 3) : vec2F(cClear);
452465
for (uint tile_index = 0, s = best_encoding + 1; s; s >>= 1, tile_index++) {
453466
tile_details& tile = m_tiles[b | tile_index];
454467
uint t = tiles[best_encoding][tile_index];
455468
tile.pixels.append(tilePixels + offsets[t], 8 << (t >> 2));
456469
tile.weight = weight;
457-
color_palettizer.clear();
458-
for (uint p = 0; p < tile.pixels.size(); p++) {
459-
const color_quad_u8& pixel = tile.pixels[p];
460-
vec3F v(m_uint8_to_float[pixel[0]], m_uint8_to_float[pixel[1]], m_uint8_to_float[pixel[2]]);
461-
color_palettizer.add_training_vec(m_params.m_perceptual ? vec3F(v[0] * 0.5f, v[1], v[2] * 0.25f) : v, 1);
462-
}
463-
color_palettizer.generate_codebook(2);
464-
bool single = color_palettizer.get_codebook_size() == 1;
465-
bool reorder = !single && color_palettizer.get_codebook_entry(0).length() > color_palettizer.get_codebook_entry(1).length();
466-
for (uint t = 0, i = 0; i < 2; i++) {
467-
vec3F v = color_palettizer.get_codebook_entry(single ? 0 : reorder ? 1 - i : i);
468-
for (uint c = 0; c < 3; c++, t++)
469-
tile.color_endpoint[t] = v[c];
470-
}
470+
tile.color_endpoint = palettize_color(tile.pixels.get_ptr(), tile.pixels.size());
471471
if (m_num_alpha_blocks)
472472
tile.alpha_endpoints[0] = alpha_endpoints;
473473
}

crnlib/crn_dxt_hc.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,8 @@ class dxt_hc {
186186
int m_prev_phase_index;
187187
int m_prev_percentage_complete;
188188

189+
vec<6, float> palettize_color(color_quad_u8* pixels, uint pixels_count);
190+
vec<2, float> palettize_alpha(color_quad_u8* pixels, uint pixels_count, uint comp_index);
189191
void determine_tiles_task(uint64 data, void* pData_ptr);
190192
void determine_tiles_task_etc(uint64 data, void* pData_ptr);
191193

crnlib/crn_tree_clusterizer.h

Lines changed: 145 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,6 @@ class tree_clusterizer {
2525
}
2626
};
2727

28-
void clear() {
29-
m_hist.clear();
30-
m_vectors.clear();
31-
m_vectorsInfo.clear();
32-
m_codebook.clear();
33-
m_nodes.clear();
34-
m_node_index_map.clear();
35-
}
36-
3728
void add_training_vec(const VectorType& v, uint weight) {
3829
m_hist.push_back(std::make_pair(v, weight));
3930
}
@@ -498,4 +489,149 @@ class tree_clusterizer {
498489
}
499490
};
500491

492+
template<typename VectorType>
493+
void split_vectors(VectorType (&vectors)[64], uint (&weights)[64], uint size, VectorType (&result)[2]) {
494+
VectorType weightedVectors[64];
495+
double weightedDotProducts[64];
496+
VectorType centroid(cClear);
497+
uint64 total_weight = 0;
498+
double ttsum = 0.0f;
499+
for (uint i = 0; i < size; i++) {
500+
const VectorType& v = vectors[i];
501+
const uint weight = weights[i];
502+
weightedVectors[i] = v * (float)weight;
503+
centroid += weightedVectors[i];
504+
total_weight += weight;
505+
weightedDotProducts[i] = v.dot(v) * weight;
506+
ttsum += weightedDotProducts[i];
507+
}
508+
float variance = (float)(ttsum - (centroid.dot(centroid) / total_weight));
509+
centroid *= (1.0f / total_weight);
510+
result[0] = result[1] = centroid;
511+
if (variance <= 0.0f || size == 1)
512+
return;
513+
VectorType furthest;
514+
double furthest_dist = -1.0f;
515+
for (uint i = 0; i < size; i++) {
516+
const VectorType& v = vectors[i];
517+
double dist = v.squared_distance(centroid);
518+
if (dist > furthest_dist) {
519+
furthest_dist = dist;
520+
furthest = v;
521+
}
522+
}
523+
VectorType opposite;
524+
double opposite_dist = -1.0f;
525+
for (uint i = 0; i < size; i++) {
526+
const VectorType& v = vectors[i];
527+
double dist = v.squared_distance(furthest);
528+
if (dist > opposite_dist) {
529+
opposite_dist = dist;
530+
opposite = v;
531+
}
532+
}
533+
VectorType left_child((furthest + centroid) * .5f);
534+
VectorType right_child((opposite + centroid) * .5f);
535+
if (size > 2) {
536+
const uint N = VectorType::num_elements;
537+
matrix<N, N, float> covar;
538+
covar.clear();
539+
for (uint i = 0; i < size; i++) {
540+
const VectorType& v = vectors[i] - centroid;
541+
const VectorType w = v * (float)weights[i];
542+
for (uint x = 0; x < N; x++) {
543+
for (uint y = x; y < N; y++)
544+
covar[x][y] = covar[x][y] + v[x] * w[y];
545+
}
546+
}
547+
float divider = (float)total_weight;
548+
for (uint x = 0; x < N; x++) {
549+
for (uint y = x; y < N; y++) {
550+
covar[x][y] /= divider;
551+
covar[y][x] = covar[x][y];
552+
}
553+
}
554+
VectorType axis(1.0f);
555+
for (uint iter = 0; iter < 10; iter++) {
556+
VectorType x;
557+
double max_sum = 0;
558+
for (uint i = 0; i < N; i++) {
559+
double sum = 0;
560+
for (uint j = 0; j < N; j++)
561+
sum += axis[j] * covar[i][j];
562+
x[i] = (float)sum;
563+
max_sum = i ? math::maximum(max_sum, sum) : sum;
564+
}
565+
if (max_sum != 0.0f)
566+
x *= (float)(1.0f / max_sum);
567+
axis = x;
568+
}
569+
axis.normalize();
570+
VectorType new_left_child(0.0f);
571+
VectorType new_right_child(0.0f);
572+
double left_weight = 0.0f;
573+
double right_weight = 0.0f;
574+
for (uint i = 0; i < size; i++) {
575+
const VectorType& v = vectors[i];
576+
const float weight = (float)weights[i];
577+
double t = (v - centroid) * axis;
578+
if (t < 0.0f) {
579+
new_left_child += weightedVectors[i];
580+
left_weight += weight;
581+
} else {
582+
new_right_child += weightedVectors[i];
583+
right_weight += weight;
584+
}
585+
}
586+
if ((left_weight > 0.0f) && (right_weight > 0.0f)) {
587+
left_child = new_left_child * (float)(1.0f / left_weight);
588+
right_child = new_right_child * (float)(1.0f / right_weight);
589+
}
590+
}
591+
uint64 left_weight = 0;
592+
uint64 right_weight = 0;
593+
float prev_total_variance = 1e+10f;
594+
float left_variance = 0.0f;
595+
float right_variance = 0.0f;
596+
const uint cMaxLoops = 1024;
597+
for (uint total_loops = 0; total_loops < cMaxLoops; total_loops++) {
598+
VectorType new_left_child(cClear);
599+
VectorType new_right_child(cClear);
600+
double left_ttsum = 0.0f;
601+
double right_ttsum = 0.0f;
602+
left_weight = 0;
603+
right_weight = 0;
604+
for (uint i = 0; i < size; i++) {
605+
const VectorType& v = vectors[i];
606+
double left_dist2 = left_child.squared_distance(v);
607+
double right_dist2 = right_child.squared_distance(v);
608+
if (left_dist2 < right_dist2) {
609+
new_left_child += weightedVectors[i];
610+
left_ttsum += weightedDotProducts[i];
611+
left_weight += weights[i];
612+
} else {
613+
new_right_child += weightedVectors[i];
614+
right_ttsum += weightedDotProducts[i];
615+
right_weight += weights[i];
616+
}
617+
}
618+
if ((!left_weight) || (!right_weight))
619+
return;
620+
left_variance = (float)(left_ttsum - (new_left_child.dot(new_left_child) / left_weight));
621+
right_variance = (float)(right_ttsum - (new_right_child.dot(new_right_child) / right_weight));
622+
new_left_child *= (1.0f / left_weight);
623+
new_right_child *= (1.0f / right_weight);
624+
left_child = new_left_child;
625+
right_child = new_right_child;
626+
float total_variance = left_variance + right_variance;
627+
if (total_variance < .00001f)
628+
break;
629+
if (((prev_total_variance - total_variance) / total_variance) < .00001f)
630+
break;
631+
prev_total_variance = total_variance;
632+
}
633+
result[0] = left_child;
634+
result[1] = right_child;
635+
}
636+
501637
} // namespace crnlib

0 commit comments

Comments
 (0)