diff options
Diffstat (limited to '')
-rw-r--r-- | plug-ins/file-dds/dxt.c | 1522 |
1 files changed, 1522 insertions, 0 deletions
diff --git a/plug-ins/file-dds/dxt.c b/plug-ins/file-dds/dxt.c new file mode 100644 index 0000000..440875b --- /dev/null +++ b/plug-ins/file-dds/dxt.c @@ -0,0 +1,1522 @@ +/* + * DDS GIMP plugin + * + * Copyright (C) 2004-2012 Shawn Kirst <skirst@gmail.com>, + * with parts (C) 2003 Arne Reuter <homepage@arnereuter.de> where specified. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 51 Franklin Street, Fifth Floor + * Boston, MA 02110-1301, USA. + */ + +/* + * Parts of this code have been generously released in the public domain + * by Fabian 'ryg' Giesen. The original code can be found (at the time + * of writing) here: http://mollyrocket.com/forums/viewtopic.php?t=392 + * + * For more information about this code, see the README.dxt file that + * came with the source. + */ + +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include <glib.h> + +#include "dds.h" +#include "dxt.h" +#include "endian_rw.h" +#include "mipmap.h" +#include "imath.h" +#include "vec.h" + +#include "dxt_tables.h" + +#define SWAP(a, b) do { typeof(a) t; t = a; a = b; b = t; } while(0) + +/* SIMD constants */ +static const vec4_t V4ZERO = VEC4_CONST1(0.0f); +static const vec4_t V4ONE = VEC4_CONST1(1.0f); +static const vec4_t V4HALF = VEC4_CONST1(0.5f); +static const vec4_t V4ONETHIRD = VEC4_CONST3(1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f); +static const vec4_t V4TWOTHIRDS = VEC4_CONST3(2.0f / 3.0f, 2.0f / 3.0f, 2.0f / 3.0f); +static const vec4_t V4GRID = VEC4_CONST3(31.0f, 63.0f, 31.0f); +static const vec4_t V4GRIDRCP = VEC4_CONST3(1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f); +static const vec4_t V4EPSILON = VEC4_CONST1(1e-04f); + +typedef struct +{ + unsigned int single; + unsigned int alphamask; + vec4_t points[16]; + vec4_t palette[4]; + vec4_t max; + vec4_t min; + vec4_t metric; +} dxtblock_t; + +/* extract 4x4 BGRA block */ +static void +extract_block (const unsigned char *src, + int x, + int y, + int w, + int h, + unsigned char *block) +{ + int i, j; + int bw = MIN(w - x, 4); + int bh = MIN(h - y, 4); + int bx, by; + const int rem[] = + { + 0, 0, 0, 0, + 0, 1, 0, 1, + 0, 1, 2, 0, + 0, 1, 2, 3 + }; + + for (i = 0; i < 4; ++i) + { + by = rem[(bh - 1) * 4 + i] + y; + for (j = 0; j < 4; ++j) + { + bx = rem[(bw - 1) * 4 + j] + x; + block[(i * 4 * 4) + (j * 4) + 0] = + src[(by * (w * 4)) + (bx * 4) + 0]; + block[(i * 4 * 4) + (j * 4) + 1] = + src[(by * (w * 4)) + (bx * 4) + 1]; + block[(i * 4 * 4) + (j * 4) + 2] = + src[(by * (w * 4)) + (bx * 4) + 2]; + block[(i * 4 * 4) + (j * 4) + 3] = + src[(by * (w * 4)) + (bx * 4) + 3]; + } + } +} + +/* pack BGR8 to RGB565 */ +static inline unsigned short +pack_rgb565 (const unsigned char *c) +{ + return (mul8bit(c[2], 31) << 11) | + (mul8bit(c[1], 63) << 5) | + (mul8bit(c[0], 31) ); +} + +/* unpack RGB565 to BGR */ +static void +unpack_rgb565 (unsigned char *dst, + unsigned short v) +{ + int r = (v >> 11) & 0x1f; + int g = (v >> 5) & 0x3f; + int b = (v ) & 0x1f; + + dst[0] = (b << 3) | (b >> 2); + dst[1] = (g << 2) | (g >> 4); + dst[2] = (r << 3) | (r >> 2); +} + +/* linear interpolation at 1/3 point between a and b */ +static void +lerp_rgb13 (unsigned char *dst, + unsigned char *a, + unsigned char *b) +{ +#if 0 + dst[0] = blerp(a[0], b[0], 0x55); + dst[1] = blerp(a[1], b[1], 0x55); + dst[2] = blerp(a[2], b[2], 0x55); +#else + /* + * according to the S3TC/DX10 specs, this is the correct way to do the + * interpolation (with no rounding bias) + * + * dst = (2 * a + b) / 3; + */ + dst[0] = (2 * a[0] + b[0]) / 3; + dst[1] = (2 * a[1] + b[1]) / 3; + dst[2] = (2 * a[2] + b[2]) / 3; +#endif +} + +static void +vec4_endpoints_to_565 (int *start, + int *end, + const vec4_t a, + const vec4_t b) +{ + int c[8] __attribute__((aligned(16))); + vec4_t ta = a * V4GRID + V4HALF; + vec4_t tb = b * V4GRID + V4HALF; + +#ifdef USE_SSE +# ifdef __SSE2__ + const __m128i C565 = _mm_setr_epi16(31, 63, 31, 0, 31, 63, 31, 0); + __m128i ia = _mm_cvttps_epi32(ta); + __m128i ib = _mm_cvttps_epi32(tb); + __m128i zero = _mm_setzero_si128(); + __m128i words = _mm_packs_epi32(ia, ib); + words = _mm_min_epi16(C565, _mm_max_epi16(zero, words)); + *((__m128i *)&c[0]) = _mm_unpacklo_epi16(words, zero); + *((__m128i *)&c[4]) = _mm_unpackhi_epi16(words, zero); +# else + const __m64 C565 = _mm_setr_pi16(31, 63, 31, 0); + __m64 lo, hi, c0, c1; + __m64 zero = _mm_setzero_si64(); + lo = _mm_cvttps_pi32(ta); + hi = _mm_cvttps_pi32(_mm_movehl_ps(ta, ta)); + c0 = _mm_packs_pi32(lo, hi); + lo = _mm_cvttps_pi32(tb); + hi = _mm_cvttps_pi32(_mm_movehl_ps(tb, tb)); + c1 = _mm_packs_pi32(lo, hi); + c0 = _mm_min_pi16(C565, _mm_max_pi16(zero, c0)); + c1 = _mm_min_pi16(C565, _mm_max_pi16(zero, c1)); + *((__m64 *)&c[0]) = _mm_unpacklo_pi16(c0, zero); + *((__m64 *)&c[2]) = _mm_unpackhi_pi16(c0, zero); + *((__m64 *)&c[4]) = _mm_unpacklo_pi16(c1, zero); + *((__m64 *)&c[6]) = _mm_unpackhi_pi16(c1, zero); + _mm_empty(); +# endif +#else + c[0] = (int)ta[0]; c[4] = (int)tb[0]; + c[1] = (int)ta[1]; c[5] = (int)tb[1]; + c[2] = (int)ta[2]; c[6] = (int)tb[2]; + c[0] = MIN(31, MAX(0, c[0])); + c[1] = MIN(63, MAX(0, c[1])); + c[2] = MIN(31, MAX(0, c[2])); + c[4] = MIN(31, MAX(0, c[4])); + c[5] = MIN(63, MAX(0, c[5])); + c[6] = MIN(31, MAX(0, c[6])); +#endif + + *start = ((c[2] << 11) | (c[1] << 5) | c[0]); + *end = ((c[6] << 11) | (c[5] << 5) | c[4]); +} + +static void +dxtblock_init (dxtblock_t *dxtb, + const unsigned char *block, + int flags) +{ + int i, c0, c; + int bc1 = (flags & DXT_BC1); + float x, y, z; + vec4_t min, max, center, t, cov, inset; + + dxtb->single = 1; + dxtb->alphamask = 0; + + if(flags & DXT_PERCEPTUAL) + /* ITU-R BT.709 luma coefficients */ + dxtb->metric = vec4_set(0.2126f, 0.7152f, 0.0722f, 0.0f); + else + dxtb->metric = vec4_set(1.0f, 1.0f, 1.0f, 0.0f); + + c0 = GETL24(block); + + for (i = 0; i < 16; ++i) + { + if (bc1 && (block[4 * i + 3] < 128)) + dxtb->alphamask |= (3 << (2 * i)); + + x = (float)block[4 * i + 0] / 255.0f; + y = (float)block[4 * i + 1] / 255.0f; + z = (float)block[4 * i + 2] / 255.0f; + + dxtb->points[i] = vec4_set(x, y, z, 0); + + c = GETL24(&block[4 * i]); + dxtb->single = dxtb->single && (c == c0); + } + + // no need to continue if this is a single color block + if (dxtb->single) + return; + + min = vec4_set1(1.0f); + max = vec4_zero(); + + // get bounding box extents + for (i = 0; i < 16; ++i) + { + min = vec4_min(min, dxtb->points[i]); + max = vec4_max(max, dxtb->points[i]); + } + + // select diagonal + center = (max + min) * V4HALF; + cov = vec4_zero(); + for (i = 0; i < 16; ++i) + { + t = dxtb->points[i] - center; + cov += t * vec4_splatz(t); + } + +#ifdef USE_SSE + { + __m128 mask, tmp; + // get mask + mask = _mm_cmplt_ps(cov, _mm_setzero_ps()); + // clear high bits (z, w) + mask = _mm_movelh_ps(mask, _mm_setzero_ps()); + // mask and combine + tmp = _mm_or_ps(_mm_and_ps(mask, min), _mm_andnot_ps(mask, max)); + min = _mm_or_ps(_mm_and_ps(mask, max), _mm_andnot_ps(mask, min)); + max = tmp; + } +#else + { + float x0, x1, y0, y1; + x0 = max[0]; + y0 = max[1]; + x1 = min[0]; + y1 = min[1]; + + if (cov[0] < 0) SWAP(x0, x1); + if (cov[1] < 0) SWAP(y0, y1); + + max[0] = x0; + max[1] = y0; + min[0] = x1; + min[1] = y1; + } +#endif + + // inset bounding box and clamp to [0,1] + inset = (max - min) * vec4_set1(1.0f / 16.0f) - vec4_set1((8.0f / 255.0f) / 16.0f); + max = vec4_min(V4ONE, vec4_max(V4ZERO, max - inset)); + min = vec4_min(V4ONE, vec4_max(V4ZERO, min + inset)); + + // clamp to color space and save + dxtb->max = vec4_trunc(V4GRID * max + V4HALF) * V4GRIDRCP; + dxtb->min = vec4_trunc(V4GRID * min + V4HALF) * V4GRIDRCP; +} + +static void +construct_palette3 (dxtblock_t *dxtb) +{ + dxtb->palette[0] = dxtb->max; + dxtb->palette[1] = dxtb->min; + dxtb->palette[2] = (dxtb->max * V4HALF) + (dxtb->min * V4HALF); + dxtb->palette[3] = vec4_zero(); +} + +static void +construct_palette4 (dxtblock_t *dxtb) +{ + dxtb->palette[0] = dxtb->max; + dxtb->palette[1] = dxtb->min; + dxtb->palette[2] = (dxtb->max * V4TWOTHIRDS) + (dxtb->min * V4ONETHIRD ); + dxtb->palette[3] = (dxtb->max * V4ONETHIRD ) + (dxtb->min * V4TWOTHIRDS); +} + +/* + * from nvidia-texture-tools; see LICENSE.nvtt for copyright information + */ +static void +optimize_endpoints3 (dxtblock_t *dxtb, + unsigned int indices, + vec4_t *max, + vec4_t *min) +{ + float alpha, beta; + vec4_t alpha2_sum, alphax_sum; + vec4_t beta2_sum, betax_sum; + vec4_t alphabeta_sum, a, b, factor; + int i, bits; + + alpha2_sum = beta2_sum = alphabeta_sum = vec4_zero(); + alphax_sum = vec4_zero(); + betax_sum = vec4_zero(); + + for (i = 0; i < 16; ++i) + { + bits = indices >> (2 * i); + + // skip alpha pixels + if ((bits & 3) == 3) + continue; + + beta = (float)(bits & 1); + if (bits & 2) + beta = 0.5f; + alpha = 1.0f - beta; + + a = vec4_set1(alpha); + b = vec4_set1(beta); + alpha2_sum += a * a; + beta2_sum += b * b; + alphabeta_sum += a * b; + alphax_sum += dxtb->points[i] * a; + betax_sum += dxtb->points[i] * b; + } + + factor = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum; + if (vec4_cmplt(factor, V4EPSILON)) + return; + factor = vec4_rcp(factor); + + a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; + b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; + + // clamp to the color space + a = vec4_min(V4ONE, vec4_max(V4ZERO, a)); + b = vec4_min(V4ONE, vec4_max(V4ZERO, b)); + a = vec4_trunc(V4GRID * a + V4HALF) * V4GRIDRCP; + b = vec4_trunc(V4GRID * b + V4HALF) * V4GRIDRCP; + + *max = a; + *min = b; +} + +/* + * from nvidia-texture-tools; see LICENSE.nvtt for copyright information + */ +static void +optimize_endpoints4 (dxtblock_t *dxtb, + unsigned int indices, + vec4_t *max, + vec4_t *min) +{ + float alpha, beta; + vec4_t alpha2_sum, alphax_sum; + vec4_t beta2_sum, betax_sum; + vec4_t alphabeta_sum, a, b, factor; + int i, bits; + + alpha2_sum = beta2_sum = alphabeta_sum = vec4_zero(); + alphax_sum = vec4_zero(); + betax_sum = vec4_zero(); + + for (i = 0; i < 16; ++i) + { + bits = indices >> (2 * i); + + beta = (float)(bits & 1); + if (bits & 2) + beta = (1.0f + beta) / 3.0f; + alpha = 1.0f - beta; + + a = vec4_set1(alpha); + b = vec4_set1(beta); + alpha2_sum += a * a; + beta2_sum += b * b; + alphabeta_sum += a * b; + alphax_sum += dxtb->points[i] * a; + betax_sum += dxtb->points[i] * b; + } + + factor = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum; + if (vec4_cmplt(factor, V4EPSILON)) + return; + factor = vec4_rcp(factor); + + a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; + b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; + + // clamp to the color space + a = vec4_min(V4ONE, vec4_max(V4ZERO, a)); + b = vec4_min(V4ONE, vec4_max(V4ZERO, b)); + a = vec4_trunc(V4GRID * a + V4HALF) * V4GRIDRCP; + b = vec4_trunc(V4GRID * b + V4HALF) * V4GRIDRCP; + + *max = a; + *min = b; +} + +static unsigned int +match_colors3 (dxtblock_t *dxtb) +{ + int i, idx; + unsigned int indices = 0; + vec4_t t0, t1, t2; +#ifdef USE_SSE + vec4_t d, bits, zero = _mm_setzero_ps(); + int mask; +#else + float d0, d1, d2; +#endif + + // match each point to the closest color + for (i = 0; i < 16; ++i) + { + // skip alpha pixels + if (((dxtb->alphamask >> (2 * i)) & 3) == 3) + { + indices |= (3 << (2 * i)); + continue; + } + + t0 = (dxtb->points[i] - dxtb->palette[0]) * dxtb->metric; + t1 = (dxtb->points[i] - dxtb->palette[1]) * dxtb->metric; + t2 = (dxtb->points[i] - dxtb->palette[2]) * dxtb->metric; + +#ifdef USE_SSE + _MM_TRANSPOSE4_PS(t0, t1, t2, zero); + d = t0 * t0 + t1 * t1 + t2 * t2; + bits = _mm_cmplt_ps(_mm_shuffle_ps(d, d, _MM_SHUFFLE(3, 1, 0, 0)), + _mm_shuffle_ps(d, d, _MM_SHUFFLE(3, 2, 2, 1))); + mask = _mm_movemask_ps(bits); + if((mask & 3) == 3) idx = 0; + else if(mask & 4) idx = 1; + else idx = 2; +#else + d0 = vec4_dot(t0, t0); + d1 = vec4_dot(t1, t1); + d2 = vec4_dot(t2, t2); + + if ((d0 < d1) && (d0 < d2)) + idx = 0; + else if (d1 < d2) + idx = 1; + else + idx = 2; +#endif + + indices |= (idx << (2 * i)); + } + + return indices; +} + +static unsigned int +match_colors4 (dxtblock_t *dxtb) +{ + int i; + unsigned int idx, indices = 0; + unsigned int b0, b1, b2, b3, b4; + unsigned int x0, x1, x2; + vec4_t t0, t1, t2, t3; +#ifdef USE_SSE + vec4_t d; +#else + float d[4]; +#endif + + // match each point to the closest color + for (i = 0; i < 16; ++i) + { + t0 = (dxtb->points[i] - dxtb->palette[0]) * dxtb->metric; + t1 = (dxtb->points[i] - dxtb->palette[1]) * dxtb->metric; + t2 = (dxtb->points[i] - dxtb->palette[2]) * dxtb->metric; + t3 = (dxtb->points[i] - dxtb->palette[3]) * dxtb->metric; + +#ifdef USE_SSE + _MM_TRANSPOSE4_PS(t0, t1, t2, t3); + d = t0 * t0 + t1 * t1 + t2 * t2; +#else + d[0] = vec4_dot(t0, t0); + d[1] = vec4_dot(t1, t1); + d[2] = vec4_dot(t2, t2); + d[3] = vec4_dot(t3, t3); +#endif + + b0 = d[0] > d[3]; + b1 = d[1] > d[2]; + b2 = d[0] > d[2]; + b3 = d[1] > d[3]; + b4 = d[2] > d[3]; + + x0 = b1 & b2; + x1 = b0 & b3; + x2 = b0 & b4; + + idx = x2 | ((x0 | x1) << 1); + + indices |= (idx << (2 * i)); + } + + return indices; +} + +static float +compute_error3 (dxtblock_t *dxtb, + unsigned int indices) +{ + int i, idx; + float error = 0; + vec4_t t; + + // compute error + for (i = 0; i < 16; ++i) + { + idx = (indices >> (2 * i)) & 3; + // skip alpha pixels + if(idx == 3) + continue; + t = (dxtb->points[i] - dxtb->palette[idx]) * dxtb->metric; + error += vec4_dot(t, t); + } + + return error; +} + +static float +compute_error4 (dxtblock_t *dxtb, + unsigned int indices) +{ + int i, idx; + float error = 0; + +#ifdef USE_SSE + vec4_t a0, a1, a2, a3; + vec4_t b0, b1, b2, b3; + vec4_t d; + + for (i = 0; i < 4; ++i) + { + idx = indices >> (8 * i); + a0 = dxtb->points[4 * i + 0]; + a1 = dxtb->points[4 * i + 1]; + a2 = dxtb->points[4 * i + 2]; + a3 = dxtb->points[4 * i + 3]; + b0 = dxtb->palette[(idx ) & 3]; + b1 = dxtb->palette[(idx >> 2) & 3]; + b2 = dxtb->palette[(idx >> 4) & 3]; + b3 = dxtb->palette[(idx >> 6) & 3]; + a0 = (a0 - b0) * dxtb->metric; + a1 = (a1 - b1) * dxtb->metric; + a2 = (a2 - b2) * dxtb->metric; + a3 = (a3 - b3) * dxtb->metric; + _MM_TRANSPOSE4_PS(a0, a1, a2, a3); + d = a0 * a0 + a1 * a1 + a2 * a2; + error += vec4_accum(d); + } +#else + vec4_t t; + + // compute error + for (i = 0; i < 16; ++i) + { + idx = (indices >> (2 * i)) & 3; + t = (dxtb->points[i] - dxtb->palette[idx]) * dxtb->metric; + error += vec4_dot(t, t); + } +#endif + + return error; +} + +static unsigned int +compress3 (dxtblock_t *dxtb) +{ + const int MAX_ITERATIONS = 8; + int i; + unsigned int indices, bestindices; + float error, besterror = FLT_MAX; + vec4_t oldmax, oldmin; + + construct_palette3(dxtb); + + indices = match_colors3(dxtb); + bestindices = indices; + + for (i = 0; i < MAX_ITERATIONS; ++i) + { + oldmax = dxtb->max; + oldmin = dxtb->min; + + optimize_endpoints3(dxtb, indices, &dxtb->max, &dxtb->min); + construct_palette3(dxtb); + indices = match_colors3(dxtb); + error = compute_error3(dxtb, indices); + + if (error < besterror) + { + besterror = error; + bestindices = indices; + } + else + { + dxtb->max = oldmax; + dxtb->min = oldmin; + break; + } + } + + return bestindices; +} + +static unsigned int +compress4 (dxtblock_t *dxtb) +{ + const int MAX_ITERATIONS = 8; + int i; + unsigned int indices, bestindices; + float error, besterror = FLT_MAX; + vec4_t oldmax, oldmin; + + construct_palette4(dxtb); + + indices = match_colors4(dxtb); + bestindices = indices; + + for (i = 0; i < MAX_ITERATIONS; ++i) + { + oldmax = dxtb->max; + oldmin = dxtb->min; + + optimize_endpoints4(dxtb, indices, &dxtb->max, &dxtb->min); + construct_palette4(dxtb); + indices = match_colors4(dxtb); + error = compute_error4(dxtb, indices); + + if (error < besterror) + { + besterror = error; + bestindices = indices; + } + else + { + dxtb->max = oldmax; + dxtb->min = oldmin; + break; + } + } + + return bestindices; +} + +static void +encode_color_block (unsigned char *dst, + unsigned char *block, + int flags) +{ + dxtblock_t dxtb; + int max16, min16; + unsigned int indices, mask; + + dxtblock_init(&dxtb, block, flags); + + if (dxtb.single) // single color block + { + max16 = (omatch5[block[2]][0] << 11) | + (omatch6[block[1]][0] << 5) | + (omatch5[block[0]][0] ); + min16 = (omatch5[block[2]][1] << 11) | + (omatch6[block[1]][1] << 5) | + (omatch5[block[0]][1] ); + + indices = 0xaaaaaaaa; // 101010... + + if ((flags & DXT_BC1) && dxtb.alphamask) + { + // DXT1 compression, non-opaque block. Add alpha indices. + indices |= dxtb.alphamask; + if (max16 > min16) + SWAP(max16, min16); + } + else if (max16 < min16) + { + SWAP(max16, min16); + indices ^= 0x55555555; // 010101... + } + } + else if ((flags & DXT_BC1) && dxtb.alphamask) // DXT1 compression, non-opaque block + { + indices = compress3(&dxtb); + + vec4_endpoints_to_565(&max16, &min16, dxtb.max, dxtb.min); + + if (max16 > min16) + { + SWAP(max16, min16); + // remap indices 0 -> 1, 1 -> 0 + mask = indices & 0xaaaaaaaa; + mask = mask | (mask >> 1); + indices = (indices & mask) | ((indices ^ 0x55555555) & ~mask); + } + } + else + { + indices = compress4(&dxtb); + + vec4_endpoints_to_565(&max16, &min16, dxtb.max, dxtb.min); + + if (max16 < min16) + { + SWAP(max16, min16); + indices ^= 0x55555555; // 010101... + } + } + + PUTL16(dst + 0, max16); + PUTL16(dst + 2, min16); + PUTL32(dst + 4, indices); +} + +static void +get_min_max_YCoCg (const unsigned char *block, + unsigned char *mincolor, + unsigned char *maxcolor) +{ + int i; + + mincolor[2] = mincolor[1] = 255; + maxcolor[2] = maxcolor[1] = 0; + + for (i = 0; i < 16; ++i) + { + if (block[4 * i + 2] < mincolor[2]) mincolor[2] = block[4 * i + 2]; + if (block[4 * i + 1] < mincolor[1]) mincolor[1] = block[4 * i + 1]; + if (block[4 * i + 2] > maxcolor[2]) maxcolor[2] = block[4 * i + 2]; + if (block[4 * i + 1] > maxcolor[1]) maxcolor[1] = block[4 * i + 1]; + } +} + +static void +scale_YCoCg (unsigned char *block, + unsigned char *mincolor, + unsigned char *maxcolor) +{ + const int s0 = 128 / 2 - 1; + const int s1 = 128 / 4 - 1; + int m0, m1, m2, m3; + int mask0, mask1, scale; + int i; + + m0 = abs(mincolor[2] - 128); + m1 = abs(mincolor[1] - 128); + m2 = abs(maxcolor[2] - 128); + m3 = abs(maxcolor[1] - 128); + + if (m1 > m0) m0 = m1; + if (m3 > m2) m2 = m3; + if (m2 > m0) m0 = m2; + + mask0 = -(m0 <= s0); + mask1 = -(m0 <= s1); + scale = 1 + (1 & mask0) + (2 & mask1); + + mincolor[2] = (mincolor[2] - 128) * scale + 128; + mincolor[1] = (mincolor[1] - 128) * scale + 128; + mincolor[0] = (scale - 1) << 3; + + maxcolor[2] = (maxcolor[2] - 128) * scale + 128; + maxcolor[1] = (maxcolor[1] - 128) * scale + 128; + maxcolor[0] = (scale - 1) << 3; + + for (i = 0; i < 16; ++i) + { + block[i * 4 + 2] = (block[i * 4 + 2] - 128) * scale + 128; + block[i * 4 + 1] = (block[i * 4 + 1] - 128) * scale + 128; + } +} + +#define INSET_SHIFT 4 + +static void +inset_bbox_YCoCg (unsigned char *mincolor, + unsigned char *maxcolor) +{ + int inset[4], mini[4], maxi[4]; + + inset[2] = (maxcolor[2] - mincolor[2]) - ((1 << (INSET_SHIFT - 1)) - 1); + inset[1] = (maxcolor[1] - mincolor[1]) - ((1 << (INSET_SHIFT - 1)) - 1); + + mini[2] = ((mincolor[2] << INSET_SHIFT) + inset[2]) >> INSET_SHIFT; + mini[1] = ((mincolor[1] << INSET_SHIFT) + inset[1]) >> INSET_SHIFT; + + maxi[2] = ((maxcolor[2] << INSET_SHIFT) - inset[2]) >> INSET_SHIFT; + maxi[1] = ((maxcolor[1] << INSET_SHIFT) - inset[1]) >> INSET_SHIFT; + + mini[2] = (mini[2] >= 0) ? mini[2] : 0; + mini[1] = (mini[1] >= 0) ? mini[1] : 0; + + maxi[2] = (maxi[2] <= 255) ? maxi[2] : 255; + maxi[1] = (maxi[1] <= 255) ? maxi[1] : 255; + + mincolor[2] = (mini[2] & 0xf8) | (mini[2] >> 5); + mincolor[1] = (mini[1] & 0xfc) | (mini[1] >> 6); + + maxcolor[2] = (maxi[2] & 0xf8) | (maxi[2] >> 5); + maxcolor[1] = (maxi[1] & 0xfc) | (maxi[1] >> 6); +} + +static void +select_diagonal_YCoCg (const unsigned char *block, + unsigned char *mincolor, + unsigned char *maxcolor) +{ + unsigned char mid0, mid1, side, mask, b0, b1, c0, c1; + int i; + + mid0 = ((int)mincolor[2] + maxcolor[2] + 1) >> 1; + mid1 = ((int)mincolor[1] + maxcolor[1] + 1) >> 1; + + side = 0; + for (i = 0; i < 16; ++i) + { + b0 = block[i * 4 + 2] >= mid0; + b1 = block[i * 4 + 1] >= mid1; + side += (b0 ^ b1); + } + + mask = -(side > 8); + mask &= -(mincolor[2] != maxcolor[2]); + + c0 = mincolor[1]; + c1 = maxcolor[1]; + + c0 ^= c1; + c1 ^= c0 & mask; + c0 ^= c1; + + mincolor[1] = c0; + maxcolor[1] = c1; +} + +static void +encode_YCoCg_block (unsigned char *dst, + unsigned char *block) +{ + unsigned char colors[4][3], *maxcolor, *mincolor; + unsigned int mask; + int c0, c1, d0, d1, d2, d3; + int b0, b1, b2, b3, b4; + int x0, x1, x2; + int i, idx; + + maxcolor = &colors[0][0]; + mincolor = &colors[1][0]; + + get_min_max_YCoCg(block, mincolor, maxcolor); + scale_YCoCg(block, mincolor, maxcolor); + inset_bbox_YCoCg(mincolor, maxcolor); + select_diagonal_YCoCg(block, mincolor, maxcolor); + + lerp_rgb13(&colors[2][0], maxcolor, mincolor); + lerp_rgb13(&colors[3][0], mincolor, maxcolor); + + mask = 0; + + for (i = 0; i < 16; ++i) + { + c0 = block[4 * i + 2]; + c1 = block[4 * i + 1]; + + d0 = abs(colors[0][2] - c0) + abs(colors[0][1] - c1); + d1 = abs(colors[1][2] - c0) + abs(colors[1][1] - c1); + d2 = abs(colors[2][2] - c0) + abs(colors[2][1] - c1); + d3 = abs(colors[3][2] - c0) + abs(colors[3][1] - c1); + + b0 = d0 > d3; + b1 = d1 > d2; + b2 = d0 > d2; + b3 = d1 > d3; + b4 = d2 > d3; + + x0 = b1 & b2; + x1 = b0 & b3; + x2 = b0 & b4; + + idx = (x2 | ((x0 | x1) << 1)); + + mask |= idx << (2 * i); + } + + PUTL16(dst + 0, pack_rgb565(maxcolor)); + PUTL16(dst + 2, pack_rgb565(mincolor)); + PUTL32(dst + 4, mask); +} + +/* write DXT3 alpha block */ +static void +encode_alpha_block_BC2 (unsigned char *dst, + const unsigned char *block) +{ + int i, a1, a2; + + block += 3; + + for (i = 0; i < 8; ++i) + { + a1 = mul8bit(block[8 * i + 0], 0x0f); + a2 = mul8bit(block[8 * i + 4], 0x0f); + *dst++ = (a2 << 4) | a1; + } +} + +/* Write DXT5 alpha block */ +static void +encode_alpha_block_BC3 (unsigned char *dst, + const unsigned char *block, + const int offset) +{ + int i, v, mn, mx; + int dist, bias, dist2, dist4, bits, mask; + int a, idx, t; + + block += offset; + block += 3; + + /* find min/max alpha pair */ + mn = mx = block[0]; + for (i = 0; i < 16; ++i) + { + v = block[4 * i]; + if(v > mx) mx = v; + if(v < mn) mn = v; + } + + /* encode them */ + *dst++ = mx; + *dst++ = mn; + + /* + * determine bias and emit indices + * given the choice of mx/mn, these indices are optimal: + * http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/ + */ + dist = mx - mn; + dist4 = dist * 4; + dist2 = dist * 2; + bias = (dist < 8) ? (dist - 1) : (dist / 2 + 2); + bias -= mn * 7; + bits = 0; + mask = 0; + + for (i = 0; i < 16; ++i) + { + a = block[4 * i] * 7 + bias; + + /* select index. this is a "linear scale" lerp factor between 0 + (val=min) and 7 (val=max). */ + t = (a >= dist4) ? -1 : 0; idx = t & 4; a -= dist4 & t; + t = (a >= dist2) ? -1 : 0; idx += t & 2; a -= dist2 & t; + idx += (a >= dist); + + /* turn linear scale into DXT index (0/1 are extremal pts) */ + idx = -idx & 7; + idx ^= (2 > idx); + + /* write index */ + mask |= idx << bits; + if ((bits += 3) >= 8) + { + *dst++ = mask; + mask >>= 8; + bits -= 8; + } + } +} + +#define BLOCK_COUNT(w, h) ((((h) + 3) >> 2) * (((w) + 3) >> 2)) +#define BLOCK_OFFSET(x, y, w, bs) (((y) >> 2) * ((bs) * (((w) + 3) >> 2)) + ((bs) * ((x) >> 2))) + +static void +compress_BC1 (unsigned char *dst, + const unsigned char *src, + int w, + int h, + int flags) +{ + const unsigned int block_count = BLOCK_COUNT(w, h); + unsigned int i; + unsigned char block[64], *p; + int x, y; + +#ifdef _OPENMP +#pragma omp parallel for schedule(dynamic, 256) private(block, p, x, y) +#endif + for (i = 0; i < block_count; ++i) + { + x = (i % ((w + 3) >> 2)) << 2; + y = (i / ((w + 3) >> 2)) << 2; + p = dst + BLOCK_OFFSET(x, y, w, 8); + extract_block(src, x, y, w, h, block); + encode_color_block(p, block, DXT_BC1 | flags); + } +} + +static void +compress_BC2 (unsigned char *dst, + const unsigned char *src, + int w, + int h, + int flags) +{ + const unsigned int block_count = BLOCK_COUNT(w, h); + unsigned int i; + unsigned char block[64], *p; + int x, y; + +#ifdef _OPENMP +#pragma omp parallel for schedule(dynamic, 256) private(block, p, x, y) +#endif + for (i = 0; i < block_count; ++i) + { + x = (i % ((w + 3) >> 2)) << 2; + y = (i / ((w + 3) >> 2)) << 2; + p = dst + BLOCK_OFFSET(x, y, w, 16); + extract_block(src, x, y, w, h, block); + encode_alpha_block_BC2(p, block); + encode_color_block(p + 8, block, DXT_BC2 | flags); + } +} + +static void +compress_BC3 (unsigned char *dst, + const unsigned char *src, + int w, + int h, + int flags) +{ + const unsigned int block_count = BLOCK_COUNT(w, h); + unsigned int i; + unsigned char block[64], *p; + int x, y; + +#ifdef _OPENMP +#pragma omp parallel for schedule(dynamic, 256) private(block, p, x, y) +#endif + for (i = 0; i < block_count; ++i) + { + x = (i % ((w + 3) >> 2)) << 2; + y = (i / ((w + 3) >> 2)) << 2; + p = dst + BLOCK_OFFSET(x, y, w, 16); + extract_block(src, x, y, w, h, block); + encode_alpha_block_BC3(p, block, 0); + encode_color_block(p + 8, block, DXT_BC3 | flags); + } +} + +static void +compress_BC4 (unsigned char *dst, + const unsigned char *src, + int w, + int h) +{ + const unsigned int block_count = BLOCK_COUNT(w, h); + unsigned int i; + unsigned char block[64], *p; + int x, y; + +#ifdef _OPENMP +#pragma omp parallel for schedule(dynamic, 256) private(block, p, x, y) +#endif + for (i = 0; i < block_count; ++i) + { + x = (i % ((w + 3) >> 2)) << 2; + y = (i / ((w + 3) >> 2)) << 2; + p = dst + BLOCK_OFFSET(x, y, w, 8); + extract_block(src, x, y, w, h, block); + encode_alpha_block_BC3(p, block, -1); + } +} + +static void +compress_BC5 (unsigned char *dst, + const unsigned char *src, + int w, + int h) +{ + const unsigned int block_count = BLOCK_COUNT(w, h); + unsigned int i; + unsigned char block[64], *p; + int x, y; + +#ifdef _OPENMP +#pragma omp parallel for schedule(dynamic, 256) private(block, p, x, y) +#endif + for (i = 0; i < block_count; ++i) + { + x = (i % ((w + 3) >> 2)) << 2; + y = (i / ((w + 3) >> 2)) << 2; + p = dst + BLOCK_OFFSET(x, y, w, 16); + extract_block(src, x, y, w, h, block); + encode_alpha_block_BC3(p, block, -2); + encode_alpha_block_BC3(p + 8, block, -1); + } +} + +static void +compress_YCoCg (unsigned char *dst, + const unsigned char *src, + int w, + int h) +{ + const unsigned int block_count = BLOCK_COUNT(w, h); + unsigned int i; + unsigned char block[64], *p; + int x, y; + +#ifdef _OPENMP +#pragma omp parallel for schedule(dynamic, 256) private(block, p, x, y) +#endif + for (i = 0; i < block_count; ++i) + { + x = (i % ((w + 3) >> 2)) << 2; + y = (i / ((w + 3) >> 2)) << 2; + p = dst + BLOCK_OFFSET(x, y, w, 16); + extract_block(src, x, y, w, h, block); + encode_alpha_block_BC3(p, block, 0); + encode_YCoCg_block(p + 8, block); + } +} + +int +dxt_compress (unsigned char *dst, + unsigned char *src, + int format, + unsigned int width, + unsigned int height, + int bpp, + int mipmaps, + int flags) +{ + int i, size, w, h; + unsigned int offset; + unsigned char *tmp = NULL; + int j; + unsigned char *s; + + if (bpp == 1) + { + /* grayscale promoted to BGRA */ + + size = get_mipmapped_size(width, height, 4, 0, mipmaps, + DDS_COMPRESS_NONE); + tmp = g_malloc(size); + + for (i = j = 0; j < size; ++i, j += 4) + { + tmp[j + 0] = src[i]; + tmp[j + 1] = src[i]; + tmp[j + 2] = src[i]; + tmp[j + 3] = 255; + } + + bpp = 4; + } + else if (bpp == 2) + { + /* gray-alpha promoted to BGRA */ + + size = get_mipmapped_size(width, height, 4, 0, mipmaps, + DDS_COMPRESS_NONE); + tmp = g_malloc(size); + + for (i = j = 0; j < size; i += 2, j += 4) + { + tmp[j + 0] = src[i]; + tmp[j + 1] = src[i]; + tmp[j + 2] = src[i]; + tmp[j + 3] = src[i + 1]; + } + + bpp = 4; + } + else if (bpp == 3) + { + size = get_mipmapped_size(width, height, 4, 0, mipmaps, + DDS_COMPRESS_NONE); + tmp = g_malloc(size); + + for (i = j = 0; j < size; i += 3, j += 4) + { + tmp[j + 0] = src[i + 0]; + tmp[j + 1] = src[i + 1]; + tmp[j + 2] = src[i + 2]; + tmp[j + 3] = 255; + } + + bpp = 4; + } + + offset = 0; + w = width; + h = height; + s = tmp ? tmp : src; + + for (i = 0; i < mipmaps; ++i) + { + switch (format) + { + case DDS_COMPRESS_BC1: + compress_BC1(dst + offset, s, w, h, flags); + break; + case DDS_COMPRESS_BC2: + compress_BC2(dst + offset, s, w, h, flags); + break; + case DDS_COMPRESS_BC3: + case DDS_COMPRESS_BC3N: + case DDS_COMPRESS_RXGB: + case DDS_COMPRESS_AEXP: + case DDS_COMPRESS_YCOCG: + compress_BC3(dst + offset, s, w, h, flags); + break; + case DDS_COMPRESS_BC4: + compress_BC4(dst + offset, s, w, h); + break; + case DDS_COMPRESS_BC5: + compress_BC5(dst + offset, s, w, h); + break; + case DDS_COMPRESS_YCOCGS: + compress_YCoCg(dst + offset, s, w, h); + break; + default: + compress_BC3(dst + offset, s, w, h, flags); + break; + } + s += (w * h * bpp); + offset += get_mipmapped_size(w, h, 0, 0, 1, format); + w = MAX(1, w >> 1); + h = MAX(1, h >> 1); + } + + if (tmp) + g_free(tmp); + + return 1; +} + +static void +decode_color_block (unsigned char *block, + unsigned char *src, + int format) +{ + int i, x, y; + unsigned char *d = block; + unsigned int indices, idx; + unsigned char colors[4][3]; + unsigned short c0, c1; + + c0 = GETL16(&src[0]); + c1 = GETL16(&src[2]); + + unpack_rgb565(colors[0], c0); + unpack_rgb565(colors[1], c1); + + if ((c0 > c1) || (format == DDS_COMPRESS_BC3)) + { + lerp_rgb13(colors[2], colors[0], colors[1]); + lerp_rgb13(colors[3], colors[1], colors[0]); + } + else + { + for (i = 0; i < 3; ++i) + { + colors[2][i] = (colors[0][i] + colors[1][i] + 1) >> 1; + colors[3][i] = 255; + } + } + + src += 4; + for (y = 0; y < 4; ++y) + { + indices = src[y]; + for (x = 0; x < 4; ++x) + { + idx = indices & 0x03; + d[0] = colors[idx][2]; + d[1] = colors[idx][1]; + d[2] = colors[idx][0]; + if (format == DDS_COMPRESS_BC1) + d[3] = ((c0 <= c1) && idx == 3) ? 0 : 255; + indices >>= 2; + d += 4; + } + } +} + +static void +decode_alpha_block_BC2 (unsigned char *block, + unsigned char *src) +{ + int x, y; + unsigned char *d = block; + unsigned int bits; + + for (y = 0; y < 4; ++y) + { + bits = GETL16(&src[2 * y]); + for (x = 0; x < 4; ++x) + { + d[0] = (bits & 0x0f) * 17; + bits >>= 4; + d += 4; + } + } +} + +static void +decode_alpha_block_BC3 (unsigned char *block, + unsigned char *src, + int w) +{ + int x, y, code; + unsigned char *d = block; + unsigned char a0 = src[0]; + unsigned char a1 = src[1]; + unsigned long long bits = GETL64(src) >> 16; + + for (y = 0; y < 4; ++y) + { + for (x = 0; x < 4; ++x) + { + code = ((unsigned int)bits) & 0x07; + if (code == 0) + d[0] = a0; + else if (code == 1) + d[0] = a1; + else if (a0 > a1) + d[0] = ((8 - code) * a0 + (code - 1) * a1) / 7; + else if (code >= 6) + d[0] = (code == 6) ? 0 : 255; + else + d[0] = ((6 - code) * a0 + (code - 1) * a1) / 5; + bits >>= 3; + d += 4; + } + + if (w < 4) + bits >>= (3 * (4 - w)); + } +} + +static void +make_normal (unsigned char *dst, + unsigned char x, + unsigned char y) +{ + float nx = 2.0f * ((float)x / 255.0f) - 1.0f; + float ny = 2.0f * ((float)y / 255.0f) - 1.0f; + float nz = 0.0f; + float d = 1.0f - nx * nx + ny * ny; + int z; + + if (d > 0) + nz = sqrtf(d); + + z = (int)(255.0f * (nz + 1) / 2.0f); + z = MAX(0, MIN(255, z)); + + dst[0] = x; + dst[1] = y; + dst[2] = z; +} + +static void +normalize_block (unsigned char *block, + int format) +{ + int x, y, tmp; + + for (y = 0; y < 4; ++y) + { + for (x = 0; x < 4; ++x) + { + if (format == DDS_COMPRESS_BC3) + { + tmp = block[y * 16 + (x * 4)]; + make_normal(&block[y * 16 + (x * 4)], + block[y * 16 + (x * 4) + 3], + block[y * 16 + (x * 4) + 1]); + block[y * 16 + (x * 4) + 3] = tmp; + } + else if (format == DDS_COMPRESS_BC5) + { + make_normal(&block[y * 16 + (x * 4)], + block[y * 16 + (x * 4)], + block[y * 16 + (x * 4) + 1]); + } + } + } +} + +static void +put_block (unsigned char *dst, + unsigned char *block, + unsigned int bx, + unsigned int by, + unsigned int width, + unsigned height, + int bpp) +{ + int x, y, i; + unsigned char *d; + + for (y = 0; y < 4 && ((by + y) < height); ++y) + { + d = dst + ((y + by) * width + bx) * bpp; + for (x = 0; x < 4 && ((bx + x) < width); ++x) + { + for (i = 0; i < bpp; ++ i) + *d++ = block[y * 16 + (x * 4) + i]; + } + } +} + +int +dxt_decompress (unsigned char *dst, + unsigned char *src, + int format, + unsigned int size, + unsigned int width, + unsigned int height, + int bpp, + int normals) +{ + unsigned char *s; + unsigned int x, y; + unsigned char block[16 * 4]; + + s = src; + + for (y = 0; y < height; y += 4) + { + for (x = 0; x < width; x += 4) + { + memset(block, 255, 16 * 4); + + if (format == DDS_COMPRESS_BC1) + { + decode_color_block(block, s, format); + s += 8; + } + else if (format == DDS_COMPRESS_BC2) + { + decode_alpha_block_BC2(block + 3, s); + decode_color_block(block, s + 8, format); + s += 16; + } + else if (format == DDS_COMPRESS_BC3) + { + decode_alpha_block_BC3(block + 3, s, width); + decode_color_block(block, s + 8, format); + s += 16; + } + else if (format == DDS_COMPRESS_BC4) + { + decode_alpha_block_BC3(block, s, width); + s += 8; + } + else if (format == DDS_COMPRESS_BC5) + { + decode_alpha_block_BC3(block, s + 8, width); + decode_alpha_block_BC3(block + 1, s, width); + s += 16; + } + + if (normals) + normalize_block(block, format); + + put_block(dst, block, x, y, width, height, bpp); + } + } + + return 1; +} |