Optimize tile computation step

alexander-suvorov · alexander-suvorov · commit 21eb70bc1056 · 2017-10-25T19:15:36.000+02:00
This change improves the compression speed for both DXT and ETC encodings. Explanation: In the tile computation step, pixels within the tiling area are palettized using a general-purpose tree clusterization algorithm. At the same time, clusterization of the tile pixels is always performed with the following restrictions: the maximum number of palettized pixels is 64, the maximum number of clusters is 2. The performance can therefore be improved by solving the palettizing task with a specialized version of the tree clusterizer, which does not maintain the tree structure and uses constant memory. DXT Testing: The modified algorithm has been tested on the Kodak test set using a 64-bit build with default settings (running on Windows 10, i7-4790, 3.6GHz). All the decompressed test images are identical to the images compressed and decompressed using the original version of Crunch (revision ea9b8d8). [Compressing Kodak set without mipmaps using DXT1 encoding] Original: 1582222 bytes / 28.863 sec Modified: 1468204 bytes / 5.726 sec Improvement: 7.21% (compression ratio) / 80.16% (compression time) [Compressing Kodak set with mipmaps using DXT1 encoding] Original: 2065243 bytes / 36.950 sec Modified: 1914805 bytes / 7.683 sec Improvement: 7.28% (compression ratio) / 79.21% (compression time) ETC Testing: The modified algorithm has been tested on the Kodak test set using a 64-bit build with default settings (running on Windows 10, i7-4790, 3.6GHz). The ETC1 quantization parameters have been selected so that ETC1 compression gives approximately the same average Luma PSNR as the corresponding DXT1 compression (which is equal to 34.044 dB for the Kodak test set compressed without mipmaps using DXT1 encoding and default quality settings). [Compressing Kodak set without mipmaps using ETC1 encoding] Total size: 1607858 bytes Total time: 13.071 sec Average bitrate: 1.363 bpp Average Luma PSNR: 34.050 dB
diff --git a/bin/crunch_x64.exe b/bin/crunch_x64.exe
diff --git a/crnlib/crn_dxt_hc.cpp b/crnlib/crn_dxt_hc.cpp
@@ -231,6 +231,56 @@ bool dxt_hc::compress(
   return true;
 }
 
+// Reduces the pixels of one tile to two color endpoints.
+//
+// Components 0..2 of every pixel are packed into a single integer key, the keys
+// are sorted so that identical colors become adjacent, and every distinct color
+// is passed to split_vectors() once, weighted by its number of occurrences.
+//
+//   pixels        - tile pixels; only components 0, 1 and 2 are read.
+//   pixels_count  - number of entries in pixels. Must not exceed 64, the size of
+//                   the fixed scratch buffers below (not checked here).
+//
+// Returns the two endpoints produced by split_vectors(), concatenated as
+// [e0[0], e0[1], e0[2], e1[0], e1[1], e1[2]] and ordered so that e0 is not
+// longer than e1. In perceptual mode the endpoints stay in the scaled space
+// (component 0 multiplied by 0.5, component 2 multiplied by 0.25).
+vec6F dxt_hc::palettize_color(color_quad_u8* pixels, uint pixels_count) {
+  // Pack each pixel into a 24-bit key: component 0 in the high byte.
+  uint color[64];
+  for (uint i = 0; i < pixels_count; i++)
+    color[i] = pixels[i][0] << 16 | pixels[i][1] << 8 | pixels[i][2];
+  // Sorting groups equal colors so duplicates can be merged in a single pass.
+  std::sort(color, color + pixels_count);
+
+  const bool perceptual = m_params.m_perceptual;
+  vec3F vectors[64];
+  uint weights[64];
+  uint size = 0;
+  for (uint i = 0; i < pixels_count; i++) {
+    if (!i || color[i] != color[i - 1]) {
+      // First occurrence of this color: unpack it into a new weighted vector.
+      vectors[size][0] = perceptual ? m_uint8_to_float[color[i] >> 16] * 0.5f : m_uint8_to_float[color[i] >> 16];
+      vectors[size][1] = m_uint8_to_float[color[i] >> 8 & 0xFF];
+      vectors[size][2] = perceptual ? m_uint8_to_float[color[i] & 0xFF] * 0.25f : m_uint8_to_float[color[i] & 0xFF];
+      weights[size] = 1;
+      size++;
+    } else {
+      // Repeated color: only bump the weight of the entry created last.
+      weights[size - 1]++;
+    }
+  }
+
+  vec3F result[2];
+  split_vectors<vec3F>(vectors, weights, size, result);
+  if (result[0].length() > result[1].length())
+    utils::swap(result[0], result[1]);
+
+  // Copy component by component instead of reinterpreting vec3F[2] as a vec6F:
+  // the cast accessed the array through an unrelated type and depended on the
+  // two types sharing the same memory layout.
+  vec6F endpoints;
+  for (uint e = 0, t = 0; e < 2; e++) {
+    for (uint c = 0; c < 3; c++, t++)
+      endpoints[t] = result[e][c];
+  }
+  return endpoints;
+}
+
+// Reduces one component of the pixels of a tile to two scalar endpoints.
+//
+// The selected component values are sorted so that equal values become
+// adjacent, and every distinct value is passed to split_vectors() once,
+// weighted by its number of occurrences.
+//
+//   pixels        - tile pixels.
+//   pixels_count  - number of entries in pixels. Must not exceed 64, the size of
+//                   the fixed scratch buffers below (not checked here).
+//   comp_index    - index of the pixel component to palettize.
+//
+// Returns the two endpoints produced by split_vectors() with the smaller value
+// in element 0 and the larger value in element 1.
+vec2F dxt_hc::palettize_alpha(color_quad_u8* pixels, uint pixels_count, uint comp_index) {
+  uint8 alpha[64];
+  for (uint p = 0; p < pixels_count; p++)
+    alpha[p] = pixels[p][comp_index];
+  // Sorting groups equal values so duplicates can be merged in a single pass.
+  std::sort(alpha, alpha + pixels_count);
+
+  vec1F vectors[64];
+  uint weights[64];
+  uint size = 0;
+  for (uint i = 0; i < pixels_count; i++) {
+    if (!i || alpha[i] != alpha[i - 1]) {
+      // First occurrence of this value: start a new weighted entry.
+      vectors[size][0] = m_uint8_to_float[alpha[i]];
+      weights[size] = 1;
+      size++;
+    } else {
+      // Repeated value: only bump the weight of the entry created last.
+      weights[size - 1]++;
+    }
+  }
+
+  vec1F result[2];
+  split_vectors<vec1F>(vectors, weights, size, result);
+
+  // Build the return value explicitly instead of reinterpreting vec1F[2] as a
+  // vec2F: the cast accessed the array through an unrelated type and depended
+  // on the two types sharing the same memory layout.
+  const float first = result[0][0];
+  const float second = result[1][0];
+  const bool swapped = first > second;
+  vec2F endpoints;
+  endpoints[0] = swapped ? second : first;
+  endpoints[1] = swapped ? first : second;
+  return endpoints;
+}
+
 void dxt_hc::determine_tiles_task(uint64 data, void*) {
   uint num_tasks = m_pTask_pool->get_num_threads() + 1;
   uint offsets[9] = {0, 16, 32, 48, 0, 32, 64, 96, 64};
@@ -239,8 +289,6 @@ void dxt_hc::determine_tiles_task(uint64 data, void*) {
   uint8 selectors[64];
   uint tile_error[3][9];
   uint total_error[3][8];
-  tree_clusterizer<vec3F> color_palettizer;
-  tree_clusterizer<vec1F> alpha_palettizer;
 
   for (uint level = 0; level < m_params.m_num_levels; level++) {
     float weight = m_params.m_levels[level].m_weight;
@@ -335,33 +383,10 @@ void dxt_hc::determine_tiles_task(uint64 data, void*) {
           uint t = tiles[best_encoding][tile_index];
           tile.pixels.append(tilePixels + offsets[t], 16 << (t >> 2));
           tile.weight = weight;
-
-          if (m_has_color_blocks) {
-            color_palettizer.clear();
-            for (uint p = 0; p < tile.pixels.size(); p++) {
-              const color_quad_u8& pixel = tile.pixels[p];
-              vec3F v(m_uint8_to_float[pixel[0]], m_uint8_to_float[pixel[1]], m_uint8_to_float[pixel[2]]);
-              color_palettizer.add_training_vec(m_params.m_perceptual ? vec3F(v[0] * 0.5f, v[1], v[2] * 0.25f): v, 1);
-            }
-            color_palettizer.generate_codebook(2);
-            bool single = color_palettizer.get_codebook_size() == 1;
-            bool reorder = !single && color_palettizer.get_codebook_entry(0).length() > color_palettizer.get_codebook_entry(1).length();
-            for (uint t = 0, i = 0; i < 2; i++) {
-              vec3F v = color_palettizer.get_codebook_entry(single ? 0 : reorder ? 1 - i : i);
-              for (uint c = 0; c < 3; c++, t++)
-                tile.color_endpoint[t] = v[c];
-            }
-          }
-
-          for (uint a = 0; a < m_num_alpha_blocks; a++) {
-            alpha_palettizer.clear();
-            for (uint c = m_params.m_alpha_component_indices[a], p = 0; p < tile.pixels.size(); p++)
-              alpha_palettizer.add_training_vec(vec1F(m_uint8_to_float[tile.pixels[p][c]]), 1);
-            alpha_palettizer.generate_codebook(2);
-            float v[2] = {alpha_palettizer.get_codebook_entry(0)[0], alpha_palettizer.get_codebook_entry(alpha_palettizer.get_codebook_size() - 1)[0]};
-            tile.alpha_endpoints[a][0] = math::minimum(v[0], v[1]);
-            tile.alpha_endpoints[a][1] = math::maximum(v[0], v[1]);
-          }
+          if (m_has_color_blocks)
+            tile.color_endpoint = palettize_color(tile.pixels.get_ptr(), tile.pixels.size());
+          for (uint a = 0; a < m_num_alpha_blocks; a++)
+            tile.alpha_endpoints[a] = palettize_alpha(tile.pixels.get_ptr(), tile.pixels.size(), m_params.m_alpha_component_indices[a]);
         }
 
         for (uint by = 0; by < 2; by++) {
@@ -385,8 +410,6 @@ void dxt_hc::determine_tiles_task_etc(uint64 data, void*) {
   uint8 selectors[32];
   uint tile_error[5];
   uint total_error[3];
-  tree_clusterizer<vec3F> color_palettizer;
-  tree_clusterizer<vec1F> alpha_palettizer;
 
   etc1_optimizer optimizer;
   etc1_optimizer::params params;
@@ -438,36 +461,13 @@ void dxt_hc::determine_tiles_task_etc(uint64 data, void*) {
         }
       }
 
-      vec2F alpha_endpoints;
-      if (m_num_alpha_blocks) {
-        alpha_palettizer.clear();
-        for (uint p = 0; p < 16; p++)
-          alpha_palettizer.add_training_vec(vec1F(m_uint8_to_float[tilePixels[p].a]), 1);
-        alpha_palettizer.generate_codebook(2);
-        float v[2] = {alpha_palettizer.get_codebook_entry(0)[0], alpha_palettizer.get_codebook_entry(alpha_palettizer.get_codebook_size() - 1)[0]};
-        alpha_endpoints[0] = math::minimum(v[0], v[1]);
-        alpha_endpoints[1] = math::maximum(v[0], v[1]);
-      }
-
+      vec2F alpha_endpoints = m_num_alpha_blocks ? palettize_alpha(tilePixels, 16, 3) : vec2F(cClear);
       for (uint tile_index = 0, s = best_encoding + 1; s; s >>= 1, tile_index++) {
         tile_details& tile = m_tiles[b | tile_index];
         uint t = tiles[best_encoding][tile_index];
         tile.pixels.append(tilePixels + offsets[t], 8 << (t >> 2));
         tile.weight = weight;
-        color_palettizer.clear();
-        for (uint p = 0; p < tile.pixels.size(); p++) {
-          const color_quad_u8& pixel = tile.pixels[p];
-          vec3F v(m_uint8_to_float[pixel[0]], m_uint8_to_float[pixel[1]], m_uint8_to_float[pixel[2]]);
-          color_palettizer.add_training_vec(m_params.m_perceptual ? vec3F(v[0] * 0.5f, v[1], v[2] * 0.25f) : v, 1);
-        }
-        color_palettizer.generate_codebook(2);
-        bool single = color_palettizer.get_codebook_size() == 1;
-        bool reorder = !single && color_palettizer.get_codebook_entry(0).length() > color_palettizer.get_codebook_entry(1).length();
-        for (uint t = 0, i = 0; i < 2; i++) {
-          vec3F v = color_palettizer.get_codebook_entry(single ? 0 : reorder ? 1 - i : i);
-          for (uint c = 0; c < 3; c++, t++)
-            tile.color_endpoint[t] = v[c];
-        }
+        tile.color_endpoint = palettize_color(tile.pixels.get_ptr(), tile.pixels.size());
         if (m_num_alpha_blocks)
           tile.alpha_endpoints[0] = alpha_endpoints;
       }
diff --git a/crnlib/crn_dxt_hc.h b/crnlib/crn_dxt_hc.h
@@ -186,6 +186,8 @@ class dxt_hc {
   int m_prev_phase_index;
   int m_prev_percentage_complete;
 
+  vec<6, float> palettize_color(color_quad_u8* pixels, uint pixels_count);
+  vec<2, float> palettize_alpha(color_quad_u8* pixels, uint pixels_count, uint comp_index);
   void determine_tiles_task(uint64 data, void* pData_ptr);
   void determine_tiles_task_etc(uint64 data, void* pData_ptr);
 
diff --git a/crnlib/crn_tree_clusterizer.h b/crnlib/crn_tree_clusterizer.h
@@ -25,15 +25,6 @@ class tree_clusterizer {
     }
   };
 
-  void clear() {
-    m_hist.clear();
-    m_vectors.clear();
-    m_vectorsInfo.clear();
-    m_codebook.clear();
-    m_nodes.clear();
-    m_node_index_map.clear();
-  }
-
   void add_training_vec(const VectorType& v, uint weight) {
     m_hist.push_back(std::make_pair(v, weight));
   }
@@ -498,4 +489,149 @@ class tree_clusterizer {
   }
 };
 
+// Splits a small set of weighted vectors into two clusters and returns the two
+// cluster centers. This is a fixed-capacity routine: it keeps no tree and uses
+// only stack buffers of 64 entries.
+//
+//   vectors  - input vectors; only the first `size` entries are read.
+//   weights  - weight of each input vector; only the first `size` entries are read.
+//   size     - number of valid entries (at most 64, the capacity of the arrays).
+//   result   - receives the two cluster centers.
+//
+// Outcomes:
+//   * size == 0: both results are cleared vectors.
+//   * a single vector, or non-positive total variance: both results are the
+//     weighted centroid of the input.
+//   * one cluster becomes empty during refinement: both results are the
+//     weighted centroid of the input.
+//   * otherwise: result[0] / result[1] are the weighted means of the two
+//     clusters after refinement.
+template<typename VectorType>
+void split_vectors(VectorType (&vectors)[64], uint (&weights)[64], uint size, VectorType (&result)[2]) {
+  // Nothing to cluster. Without this guard the code below divides by a zero
+  // total weight (producing NaN centroids) and reads vectors that were never
+  // assigned.
+  if (!size) {
+    result[0] = result[1] = VectorType(cClear);
+    return;
+  }
+  VectorType weightedVectors[64];
+  double weightedDotProducts[64];
+  VectorType centroid(cClear);
+  uint64 total_weight = 0;
+  double ttsum = 0.0f;
+  // Accumulate the weighted sum of the vectors and of their squared lengths;
+  // the per-entry products are cached for reuse in the refinement loop.
+  for (uint i = 0; i < size; i++) {
+    const VectorType& v = vectors[i];
+    const uint weight = weights[i];
+    weightedVectors[i] = v * (float)weight;
+    centroid += weightedVectors[i];
+    total_weight += weight;
+    weightedDotProducts[i] = v.dot(v) * weight;
+    ttsum += weightedDotProducts[i];
+  }
+  // Weighted sum of squared deviations from the mean. `centroid` still holds
+  // the un-normalized weighted sum at this point.
+  float variance = (float)(ttsum - (centroid.dot(centroid) / total_weight));
+  centroid *= (1.0f / total_weight);
+  // Default answer, kept for every early return below.
+  result[0] = result[1] = centroid;
+  if (variance <= 0.0f || size == 1)
+    return;
+  // Initial guess: take the vector furthest from the centroid, then the vector
+  // furthest from that one, and start each child halfway between the
+  // respective extreme and the centroid.
+  VectorType furthest;
+  double furthest_dist = -1.0f;
+  for (uint i = 0; i < size; i++) {
+    const VectorType& v = vectors[i];
+    double dist = v.squared_distance(centroid);
+    if (dist > furthest_dist) {
+      furthest_dist = dist;
+      furthest = v;
+    }
+  }
+  VectorType opposite;
+  double opposite_dist = -1.0f;
+  for (uint i = 0; i < size; i++) {
+    const VectorType& v = vectors[i];
+    double dist = v.squared_distance(furthest);
+    if (dist > opposite_dist) {
+      opposite_dist = dist;
+      opposite = v;
+    }
+  }
+  VectorType left_child((furthest + centroid) * .5f);
+  VectorType right_child((opposite + centroid) * .5f);
+  if (size > 2) {
+    // Better initial guess for more than two vectors: split along an estimate
+    // of the dominant axis of the weighted covariance matrix.
+    const uint N = VectorType::num_elements;
+    matrix<N, N, float> covar;
+    covar.clear();
+    // Only the upper triangle is accumulated; it is mirrored after scaling.
+    for (uint i = 0; i < size; i++) {
+      const VectorType& v = vectors[i] - centroid;
+      const VectorType w = v * (float)weights[i];
+      for (uint x = 0; x < N; x++) {
+        for (uint y = x; y < N; y++)
+          covar[x][y] = covar[x][y] + v[x] * w[y];
+      }
+    }
+    float divider = (float)total_weight;
+    for (uint x = 0; x < N; x++) {
+      for (uint y = x; y < N; y++) {
+        covar[x][y] /= divider;
+        covar[y][x] = covar[x][y];
+      }
+    }
+    // Ten rounds of repeated multiplication by the covariance matrix, each
+    // rescaled by the largest resulting component to keep values bounded.
+    VectorType axis(1.0f);
+    for (uint iter = 0; iter < 10; iter++) {
+      VectorType x;
+      double max_sum = 0;
+      for (uint i = 0; i < N; i++) {
+        double sum = 0;
+        for (uint j = 0; j < N; j++)
+          sum += axis[j] * covar[i][j];
+        x[i] = (float)sum;
+        max_sum = i ? math::maximum(max_sum, sum) : sum;
+      }
+      if (max_sum != 0.0f)
+        x *= (float)(1.0f / max_sum);
+      axis = x;
+    }
+    axis.normalize();
+    // Partition the vectors by the sign of their offset from the centroid
+    // combined with the axis, and average each side.
+    VectorType new_left_child(0.0f);
+    VectorType new_right_child(0.0f);
+    double left_weight = 0.0f;
+    double right_weight = 0.0f;
+    for (uint i = 0; i < size; i++) {
+      const VectorType& v = vectors[i];
+      const float weight = (float)weights[i];
+      double t = (v - centroid) * axis;
+      if (t < 0.0f) {
+        new_left_child += weightedVectors[i];
+        left_weight += weight;
+      } else {
+        new_right_child += weightedVectors[i];
+        right_weight += weight;
+      }
+    }
+    // Keep the extreme-based guess when the axis split leaves one side empty.
+    if ((left_weight > 0.0f) && (right_weight > 0.0f)) {
+      left_child = new_left_child * (float)(1.0f / left_weight);
+      right_child = new_right_child * (float)(1.0f / right_weight);
+    }
+  }
+  // Refinement: repeatedly assign every vector to its nearest child and move
+  // each child to the weighted mean of its members, until the summed
+  // within-cluster variance stops improving or the iteration cap is reached.
+  uint64 left_weight = 0;
+  uint64 right_weight = 0;
+  float prev_total_variance = 1e+10f;
+  float left_variance = 0.0f;
+  float right_variance = 0.0f;
+  const uint cMaxLoops = 1024;
+  for (uint total_loops = 0; total_loops < cMaxLoops; total_loops++) {
+    VectorType new_left_child(cClear);
+    VectorType new_right_child(cClear);
+    double left_ttsum = 0.0f;
+    double right_ttsum = 0.0f;
+    left_weight = 0;
+    right_weight = 0;
+    for (uint i = 0; i < size; i++) {
+      const VectorType& v = vectors[i];
+      double left_dist2 = left_child.squared_distance(v);
+      double right_dist2 = right_child.squared_distance(v);
+      // Ties go to the right child.
+      if (left_dist2 < right_dist2) {
+        new_left_child += weightedVectors[i];
+        left_ttsum += weightedDotProducts[i];
+        left_weight += weights[i];
+      } else {
+        new_right_child += weightedVectors[i];
+        right_ttsum += weightedDotProducts[i];
+        right_weight += weights[i];
+      }
+    }
+    // A cluster collapsed: give up and keep the centroid stored in `result`.
+    if ((!left_weight) || (!right_weight))
+      return;
+    // The variances must be computed before the sums are turned into means.
+    left_variance = (float)(left_ttsum - (new_left_child.dot(new_left_child) / left_weight));
+    right_variance = (float)(right_ttsum - (new_right_child.dot(new_right_child) / right_weight));
+    new_left_child *= (1.0f / left_weight);
+    new_right_child *= (1.0f / right_weight);
+    left_child = new_left_child;
+    right_child = new_right_child;
+    float total_variance = left_variance + right_variance;
+    // Stop on a (near) perfect fit ...
+    if (total_variance < .00001f)
+      break;
+    // ... or when the relative improvement over the previous round is negligible.
+    if (((prev_total_variance - total_variance) / total_variance) < .00001f)
+      break;
+    prev_total_variance = total_variance;
+  }
+  result[0] = left_child;
+  result[1] = right_child;
+}
+
 }  // namespace crnlib