// ---------------------------------------------------------------------------- // This confidential and proprietary software may be used only as authorised // by a licensing agreement from Arm Limited. // (C) COPYRIGHT 2011-2019 Arm Limited, ALL RIGHTS RESERVED // The entire notice above must be reproduced on all authorised copies and // copies may only be made to the extent permitted by a licensing agreement // from Arm Limited. // ---------------------------------------------------------------------------- /** * @brief Functions for approximate partitioning by kmeans clustering. * * Do this in 2 stages: * 1: basic clustering, a couple of passes just to get a few clusters * 2: clustering based on line, a few passes until it seems to stabilize. * * After clustering is done, we use the clustering result to construct one * bitmap for each partition. We then scan though the partition table, counting * how well the bitmaps matched. */ #include "astc_codec_internals.h" // for k++ means, we need pseudo-random numbers, however using random numbers // directly results in unreproducible encoding results. As such, we will // instead just supply a handful of numbers from random.org, and apply an // algorithm similar to XKCD #221. (http://xkcd.com/221/) // cluster the texels using the k++ means clustering initialization algorithm. void kpp_initialize(int xdim, int ydim, int zdim, int partition_count, const imageblock * blk, float4 * cluster_centers) { int i; int texels_per_block = xdim * ydim * zdim; int cluster_center_samples[4]; // pick a random sample as first center-point. cluster_center_samples[0] = 145897 /* number from random.org */ % texels_per_block; int samples_selected = 1; float distances[MAX_TEXELS_PER_BLOCK]; // compute the distance to the first point. int sample = cluster_center_samples[0]; float4 center_color = float4(blk->work_data[4 * sample], blk->work_data[4 * sample + 1], blk->work_data[4 * sample + 2], blk->work_data[4 * sample + 3]); float distance_sum = 0.0f; for (i = 0; i < texels_per_block; i++) { float4 color = float4(blk->work_data[4 * i], blk->work_data[4 * i + 1], blk->work_data[4 * i + 2], blk->work_data[4 * i + 3]); float4 diff = color - center_color; float distance = dot(diff, diff); distance_sum += distance; distances[i] = distance; } // more numbers from random.org float cluster_cutoffs[25] = { 0.952312f, 0.206893f, 0.835984f, 0.507813f, 0.466170f, 0.872331f, 0.488028f, 0.866394f, 0.363093f, 0.467905f, 0.812967f, 0.626220f, 0.932770f, 0.275454f, 0.832020f, 0.362217f, 0.318558f, 0.240113f, 0.009190f, 0.983995f, 0.566812f, 0.347661f, 0.731960f, 0.156391f, 0.297786f }; while (1) { // pick a point in a weighted-random fashion. float summa = 0.0f; float distance_cutoff = distance_sum * cluster_cutoffs[samples_selected + 5 * partition_count]; for (i = 0; i < texels_per_block; i++) { summa += distances[i]; if (summa >= distance_cutoff) break; } sample = i; if (sample >= texels_per_block) sample = texels_per_block - 1; cluster_center_samples[samples_selected] = sample; samples_selected++; if (samples_selected >= partition_count) break; // update the distances with the new point. center_color = float4(blk->work_data[4 * sample], blk->work_data[4 * sample + 1], blk->work_data[4 * sample + 2], blk->work_data[4 * sample + 3]); distance_sum = 0.0f; for (i = 0; i < texels_per_block; i++) { float4 color = float4(blk->work_data[4 * i], blk->work_data[4 * i + 1], blk->work_data[4 * i + 2], blk->work_data[4 * i + 3]); float4 diff = color - center_color; float distance = dot(diff, diff); distance = MIN(distance, distances[i]); distance_sum += distance; distances[i] = distance; } } // finally, gather up the results. for (i = 0; i < partition_count; i++) { int center_sample = cluster_center_samples[i]; float4 color = float4(blk->work_data[4 * center_sample], blk->work_data[4 * center_sample + 1], blk->work_data[4 * center_sample + 2], blk->work_data[4 * center_sample + 3]); cluster_centers[i] = color; } } // basic K-means clustering: given a set of cluster centers, // assign each texel to a partition void basic_kmeans_assign_pass(int xdim, int ydim, int zdim, int partition_count, const imageblock * blk, const float4 * cluster_centers, int *partition_of_texel) { int i, j; int texels_per_block = xdim * ydim * zdim; float distances[MAX_TEXELS_PER_BLOCK]; int texels_per_partition[4]; texels_per_partition[0] = texels_per_block; for (i = 1; i < partition_count; i++) texels_per_partition[i] = 0; for (i = 0; i < texels_per_block; i++) { float4 color = float4(blk->work_data[4 * i], blk->work_data[4 * i + 1], blk->work_data[4 * i + 2], blk->work_data[4 * i + 3]); float4 diff = color - cluster_centers[0]; float distance = dot(diff, diff); distances[i] = distance; partition_of_texel[i] = 0; } for (j = 1; j < partition_count; j++) { float4 center_color = cluster_centers[j]; for (i = 0; i < texels_per_block; i++) { float4 color = float4(blk->work_data[4 * i], blk->work_data[4 * i + 1], blk->work_data[4 * i + 2], blk->work_data[4 * i + 3]); float4 diff = color - center_color; float distance = dot(diff, diff); if (distance < distances[i]) { distances[i] = distance; texels_per_partition[partition_of_texel[i]]--; texels_per_partition[j]++; partition_of_texel[i] = j; } } } // it is possible to get a situation where one of the partitions ends up // without any texels. In this case, we assign texel N to partition N; // this is silly, but ensures that every partition retains at least one texel. // Reassigning a texel in this manner may cause another partition to go empty, // so if we actually did a reassignment, we run the whole loop over again. int problem_case; do { problem_case = 0; for (i = 0; i < partition_count; i++) { if (texels_per_partition[i] == 0) { texels_per_partition[partition_of_texel[i]]--; texels_per_partition[i]++; partition_of_texel[i] = i; problem_case = 1; } } } while (problem_case != 0); } // basic k-means clustering: given a set of cluster assignments // for the texels, find the center position of each cluster. void basic_kmeans_update(int xdim, int ydim, int zdim, int partition_count, const imageblock * blk, const int *partition_of_texel, float4 * cluster_centers) { int i; int texels_per_block = xdim * ydim * zdim; float4 color_sum[4]; int weight_sum[4]; for (i = 0; i < partition_count; i++) { color_sum[i] = float4(0, 0, 0, 0); weight_sum[i] = 0; } // first, find the center-of-gravity in each cluster for (i = 0; i < texels_per_block; i++) { float4 color = float4(blk->work_data[4 * i], blk->work_data[4 * i + 1], blk->work_data[4 * i + 2], blk->work_data[4 * i + 3]); int part = partition_of_texel[i]; color_sum[part] = color_sum[part] + color; weight_sum[part]++; } for (i = 0; i < partition_count; i++) { cluster_centers[i] = color_sum[i] * (1.0f / weight_sum[i]); } } // after a few rounds of k-means-clustering, we should have a set of 2, 3 or 4 partitions; // we then turn this set into 2, 3 or 4 bitmaps. Then, for each of the 1024 partitions, // we try to match the bitmaps as well as possible. static inline int bitcount(uint64_t p) { if (sizeof(void *) > 4) { uint64_t mask1 = 0x5555555555555555ULL; uint64_t mask2 = 0x3333333333333333ULL; uint64_t mask3 = 0x0F0F0F0F0F0F0F0FULL; // best-known algorithm for 64-bit bitcount, assuming 64-bit processor // should probably be adapted for use with 32-bit processors and/or processors // with a POPCNT instruction, but leave that for later. p -= (p >> 1) & mask1; p = (p & mask2) + ((p >> 2) & mask2); p += p >> 4; p &= mask3; p *= 0x0101010101010101ULL; p >>= 56; return (int)p; } else { // on 32-bit processor, split the 64-bit input argument in two, // and bitcount each half separately. uint32_t p1 = (uint32_t) p; uint32_t p2 = (uint32_t) (p >> 32); uint32_t mask1 = 0x55555555U; uint32_t mask2 = 0x33333333U; uint32_t mask3 = 0x0F0F0F0FU; p1 = p1 - ((p1 >> 1) & mask1); p2 = p2 - ((p2 >> 1) & mask1); p1 = (p1 & mask2) + ((p1 >> 2) & mask2); p2 = (p2 & mask2) + ((p2 >> 2) & mask2); p1 += p1 >> 4; p2 += p2 >> 4; p1 &= mask3; p2 &= mask3; p1 += p2; p1 *= 0x01010101U; p1 >>= 24; return (int)p1; } } // compute the bit-mismatch for a partitioning in 2-partition mode static inline int partition_mismatch2(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1) { int v1 = bitcount(a0 ^ b0) + bitcount(a1 ^ b1); int v2 = bitcount(a0 ^ b1) + bitcount(a1 ^ b0); return MIN(v1, v2); } // compute the bit-mismatch for a partitioning in 3-partition mode static inline int partition_mismatch3(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t b0, uint64_t b1, uint64_t b2) { int p00 = bitcount(a0 ^ b0); int p01 = bitcount(a0 ^ b1); int p02 = bitcount(a0 ^ b2); int p10 = bitcount(a1 ^ b0); int p11 = bitcount(a1 ^ b1); int p12 = bitcount(a1 ^ b2); int p20 = bitcount(a2 ^ b0); int p21 = bitcount(a2 ^ b1); int p22 = bitcount(a2 ^ b2); int s0 = p11 + p22; int s1 = p12 + p21; int v0 = MIN(s0, s1) + p00; int s2 = p10 + p22; int s3 = p12 + p20; int v1 = MIN(s2, s3) + p01; int s4 = p10 + p21; int s5 = p11 + p20; int v2 = MIN(s4, s5) + p02; if (v1 < v0) v0 = v1; if (v2 < v0) v0 = v2; // 9 add, 5 MIN return v0; } static inline int MIN3(int a, int b, int c) { int d = MIN(a, b); return MIN(c, d); } // compute the bit-mismatch for a partitioning in 4-partition mode static inline int partition_mismatch4(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3, uint64_t b0, uint64_t b1, uint64_t b2, uint64_t b3) { int p00 = bitcount(a0 ^ b0); int p01 = bitcount(a0 ^ b1); int p02 = bitcount(a0 ^ b2); int p03 = bitcount(a0 ^ b3); int p10 = bitcount(a1 ^ b0); int p11 = bitcount(a1 ^ b1); int p12 = bitcount(a1 ^ b2); int p13 = bitcount(a1 ^ b3); int p20 = bitcount(a2 ^ b0); int p21 = bitcount(a2 ^ b1); int p22 = bitcount(a2 ^ b2); int p23 = bitcount(a2 ^ b3); int p30 = bitcount(a3 ^ b0); int p31 = bitcount(a3 ^ b1); int p32 = bitcount(a3 ^ b2); int p33 = bitcount(a3 ^ b3); int mx23 = MIN(p22 + p33, p23 + p32); int mx13 = MIN(p21 + p33, p23 + p31); int mx12 = MIN(p21 + p32, p22 + p31); int mx03 = MIN(p20 + p33, p23 + p30); int mx02 = MIN(p20 + p32, p22 + p30); int mx01 = MIN(p21 + p30, p20 + p31); int v0 = p00 + MIN3(p11 + mx23, p12 + mx13, p13 + mx12); int v1 = p01 + MIN3(p10 + mx23, p12 + mx03, p13 + mx02); int v2 = p02 + MIN3(p11 + mx03, p10 + mx13, p13 + mx01); int v3 = p03 + MIN3(p11 + mx02, p12 + mx01, p10 + mx12); int x0 = MIN(v0, v1); int x1 = MIN(v2, v3); return MIN(x0, x1); // 16 bitcount, 17 MIN, 28 ADD } void count_partition_mismatch_bits(int xdim, int ydim, int zdim, int partition_count, const uint64_t bitmaps[4], int bitcounts[PARTITION_COUNT]) { int i; const partition_info *pi = get_partition_table(xdim, ydim, zdim, partition_count); if (partition_count == 2) { uint64_t bm0 = bitmaps[0]; uint64_t bm1 = bitmaps[1]; for (i = 0; i < PARTITION_COUNT; i++) { if (pi->partition_count == 2) { bitcounts[i] = partition_mismatch2(bm0, bm1, pi->coverage_bitmaps[0], pi->coverage_bitmaps[1]); } else bitcounts[i] = 255; pi++; } } else if (partition_count == 3) { uint64_t bm0 = bitmaps[0]; uint64_t bm1 = bitmaps[1]; uint64_t bm2 = bitmaps[2]; for (i = 0; i < PARTITION_COUNT; i++) { if (pi->partition_count == 3) { bitcounts[i] = partition_mismatch3(bm0, bm1, bm2, pi->coverage_bitmaps[0], pi->coverage_bitmaps[1], pi->coverage_bitmaps[2]); } else bitcounts[i] = 255; pi++; } } else if (partition_count == 4) { uint64_t bm0 = bitmaps[0]; uint64_t bm1 = bitmaps[1]; uint64_t bm2 = bitmaps[2]; uint64_t bm3 = bitmaps[3]; for (i = 0; i < PARTITION_COUNT; i++) { if (pi->partition_count == 4) { bitcounts[i] = partition_mismatch4(bm0, bm1, bm2, bm3, pi->coverage_bitmaps[0], pi->coverage_bitmaps[1], pi->coverage_bitmaps[2], pi->coverage_bitmaps[3]); } else bitcounts[i] = 255; pi++; } } } // counting-sort on the mismatch-bits, thereby // sorting the partitions into an ordering. void get_partition_ordering_by_mismatch_bits(const int mismatch_bits[PARTITION_COUNT], int partition_ordering[PARTITION_COUNT]) { int i; int mscount[256]; for (i = 0; i < 256; i++) mscount[i] = 0; for (i = 0; i < PARTITION_COUNT; i++) mscount[mismatch_bits[i]]++; int summa = 0; for (i = 0; i < 256; i++) { int cnt = mscount[i]; mscount[i] = summa; summa += cnt; } for (i = 0; i < PARTITION_COUNT; i++) { int idx = mscount[mismatch_bits[i]]++; partition_ordering[idx] = i; } } void kmeans_compute_partition_ordering(int xdim, int ydim, int zdim, int partition_count, const imageblock * blk, int *ordering) { int i; const block_size_descriptor *bsd = get_block_size_descriptor(xdim, ydim, zdim); float4 cluster_centers[4]; int partition_of_texel[MAX_TEXELS_PER_BLOCK]; // 3 passes of plain k-means partitioning for (i = 0; i < 3; i++) { if (i == 0) kpp_initialize(xdim, ydim, zdim, partition_count, blk, cluster_centers); else basic_kmeans_update(xdim, ydim, zdim, partition_count, blk, partition_of_texel, cluster_centers); basic_kmeans_assign_pass(xdim, ydim, zdim, partition_count, blk, cluster_centers, partition_of_texel); } // at this point, we have a near-ideal partitioning. // construct bitmaps uint64_t bitmaps[4]; for (i = 0; i < 4; i++) bitmaps[i] = 0ULL; int texels_to_process = bsd->texelcount_for_bitmap_partitioning; for (i = 0; i < texels_to_process; i++) { int idx = bsd->texels_for_bitmap_partitioning[i]; bitmaps[partition_of_texel[idx]] |= 1ULL << i; } int bitcounts[PARTITION_COUNT]; // for each entry in the partition table, count bits of partition-mismatch. count_partition_mismatch_bits(xdim, ydim, zdim, partition_count, bitmaps, bitcounts); // finally, sort the partitions by bits-of-partition-mismatch get_partition_ordering_by_mismatch_bits(bitcounts, ordering); }