1 files changed, 527 insertions, 0 deletions
diff --git a/gfx/2d/ssse3-scaler.c b/gfx/2d/ssse3-scaler.c
new file mode 100644
index 0000000000..4f7419cc80
--- /dev/null
+++ b/gfx/2d/ssse3-scaler.c
@@ -0,0 +1,527 @@
+/*
+ * Copyright © 2013 Soren Sandmann Pedersen
+ * Copyright © 2013 Red Hat, Inc.
+ * Copyright © 2016 Mozilla Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Soren Sandmann (soren.sandmann@gmail.com)
+ *         Jeff Muizelaar (jmuizelaar@mozilla.com)
+ */
+
+/* This has been adapted from the ssse3 code from pixman. It's currently
+ * a mess as I want to try it out in practice before finalizing the details.
+ */
+
+#include <stdlib.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <stdint.h>
+#include <assert.h>
+#include "ssse3-scaler.h"
+
+typedef int32_t pixman_fixed_16_16_t;
+typedef pixman_fixed_16_16_t pixman_fixed_t;
+#define pixman_fixed_1 (pixman_int_to_fixed(1))
+#define pixman_fixed_to_int(f) ((int)((f) >> 16))
+#define pixman_int_to_fixed(i) ((pixman_fixed_t)((i) << 16))
+#define pixman_double_to_fixed(d) ((pixman_fixed_t)((d)*65536.0))
+#define PIXMAN_FIXED_INT_MAX 32767
+#define PIXMAN_FIXED_INT_MIN -32768
+typedef struct pixman_vector pixman_vector_t;
+
+typedef int pixman_bool_t;
+typedef int64_t pixman_fixed_32_32_t;
+typedef pixman_fixed_32_32_t pixman_fixed_48_16_t;
+typedef struct {
+  pixman_fixed_48_16_t v[3];
+} pixman_vector_48_16_t;
+
+struct pixman_vector {
+  pixman_fixed_t vector[3];
+};
+typedef struct pixman_transform pixman_transform_t;
+
+struct pixman_transform {
+  pixman_fixed_t matrix[3][3];
+};
+
+#ifdef _MSC_VER
+#  define force_inline __forceinline
+#else
+#  define force_inline __inline__ __attribute__((always_inline))
+#endif
+
+#define BILINEAR_INTERPOLATION_BITS 6
+
+static force_inline int pixman_fixed_to_bilinear_weight(pixman_fixed_t x) {
+  return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) &
+         ((1 << BILINEAR_INTERPOLATION_BITS) - 1);
+}
+
+static void pixman_transform_point_31_16_3d(const pixman_transform_t* t,
+                                            const pixman_vector_48_16_t* v,
+                                            pixman_vector_48_16_t* result) {
+  int i;
+  int64_t tmp[3][2];
+
+  /* input vector values must have no more than 31 bits (including sign)
+   * in the integer part */
+  assert(v->v[0] < ((pixman_fixed_48_16_t)1 << (30 + 16)));
+  assert(v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+  assert(v->v[1] < ((pixman_fixed_48_16_t)1 << (30 + 16)));
+  assert(v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+  assert(v->v[2] < ((pixman_fixed_48_16_t)1 << (30 + 16)));
+  assert(v->v[2] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+
+  for (i = 0; i < 3; i++) {
+    tmp[i][0] = (int64_t)t->matrix[i][0] * (v->v[0] >> 16);
+    tmp[i][1] = (int64_t)t->matrix[i][0] * (v->v[0] & 0xFFFF);
+    tmp[i][0] += (int64_t)t->matrix[i][1] * (v->v[1] >> 16);
+    tmp[i][1] += (int64_t)t->matrix[i][1] * (v->v[1] & 0xFFFF);
+    tmp[i][0] += (int64_t)t->matrix[i][2] * (v->v[2] >> 16);
+    tmp[i][1] += (int64_t)t->matrix[i][2] * (v->v[2] & 0xFFFF);
+  }
+
+  result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16);
+  result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16);
+  result->v[2] = tmp[2][0] + ((tmp[2][1] + 0x8000) >> 16);
+}
+
+static pixman_bool_t pixman_transform_point_3d(
+    const struct pixman_transform* transform, struct pixman_vector* vector) {
+  pixman_vector_48_16_t tmp;
+  tmp.v[0] = vector->vector[0];
+  tmp.v[1] = vector->vector[1];
+  tmp.v[2] = vector->vector[2];
+
+  pixman_transform_point_31_16_3d(transform, &tmp, &tmp);
+
+  vector->vector[0] = tmp.v[0];
+  vector->vector[1] = tmp.v[1];
+  vector->vector[2] = tmp.v[2];
+
+  return vector->vector[0] == tmp.v[0] && vector->vector[1] == tmp.v[1] &&
+         vector->vector[2] == tmp.v[2];
+}
+
+struct bits_image_t {
+  uint32_t* bits;
+  int rowstride;
+  pixman_transform_t* transform;
+};
+
+typedef struct bits_image_t bits_image_t;
+typedef struct {
+  int unused;
+} pixman_iter_info_t;
+
+typedef struct pixman_iter_t pixman_iter_t;
+typedef void (*pixman_iter_fini_t)(pixman_iter_t* iter);
+
+struct pixman_iter_t {
+  int x, y;
+  pixman_iter_fini_t fini;
+  bits_image_t* image;
+  uint32_t* buffer;
+  int width;
+  int height;
+  void* data;
+};
+
+typedef struct {
+  int y;
+  uint64_t* buffer;
+} line_t;
+
+typedef struct {
+  line_t lines[2];
+  pixman_fixed_t y;
+  pixman_fixed_t x;
+  uint64_t data[1];
+} bilinear_info_t;
+
+static void ssse3_fetch_horizontal(bits_image_t* image, line_t* line, int y,
+                                   pixman_fixed_t x, pixman_fixed_t ux, int n) {
+  uint32_t* bits = image->bits + y * image->rowstride;
+  __m128i vx = _mm_set_epi16(-(x + 1), x, -(x + 1), x, -(x + ux + 1), x + ux,
+                             -(x + ux + 1), x + ux);
+  __m128i vux = _mm_set_epi16(-2 * ux, 2 * ux, -2 * ux, 2 * ux, -2 * ux, 2 * ux,
+                              -2 * ux, 2 * ux);
+  __m128i vaddc = _mm_set_epi16(1, 0, 1, 0, 1, 0, 1, 0);
+  __m128i* b = (__m128i*)line->buffer;
+  __m128i vrl0, vrl1;
+
+  while ((n -= 2) >= 0) {
+    __m128i vw, vr, s;
+#ifdef HACKY_PADDING
+    if (pixman_fixed_to_int(x + ux) >= image->rowstride) {
+      vrl1 = _mm_setzero_si128();
+      printf("overread 2loop\n");
+    } else {
+      if (pixman_fixed_to_int(x + ux) < 0) printf("underflow\n");
+      vrl1 = _mm_loadl_epi64(
+          (__m128i*)(bits + (pixman_fixed_to_int(x + ux) < 0
+                                 ? 0
+                                 : pixman_fixed_to_int(x + ux))));
+    }
+#else
+    vrl1 = _mm_loadl_epi64((__m128i*)(bits + pixman_fixed_to_int(x + ux)));
+#endif
+    /* vrl1: R1, L1 */
+
+  final_pixel:
+#ifdef HACKY_PADDING
+    vrl0 = _mm_loadl_epi64(
+        (__m128i*)(bits +
+                   (pixman_fixed_to_int(x) < 0 ? 0 : pixman_fixed_to_int(x))));
+#else
+    vrl0 = _mm_loadl_epi64((__m128i*)(bits + pixman_fixed_to_int(x)));
+#endif
+    /* vrl0: R0, L0 */
+
+    /* The weights are based on vx which is a vector of
+     *
+     *    - (x + 1), x, - (x + 1), x,
+     *          - (x + ux + 1), x + ux, - (x + ux + 1), x + ux
+     *
+     * so the 16 bit weights end up like this:
+     *
+     *    iw0, w0, iw0, w0, iw1, w1, iw1, w1
+     *
+     * and after shifting and packing, we get these bytes:
+     *
+     *    iw0, w0, iw0, w0, iw1, w1, iw1, w1,
+     *        iw0, w0, iw0, w0, iw1, w1, iw1, w1,
+     *
+     * which means the first and the second input pixel
+     * have to be interleaved like this:
+     *
+     *    la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
+     *        lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
+     *
+     * before maddubsw can be used.
+     */
+
+    vw = _mm_add_epi16(vaddc,
+                       _mm_srli_epi16(vx, 16 - BILINEAR_INTERPOLATION_BITS));
+    /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1
+     */
+
+    vw = _mm_packus_epi16(vw, vw);
+    /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1,
+     *         iw0, w0, iw0, w0, iw1, w1, iw1, w1
+     */
+    vx = _mm_add_epi16(vx, vux);
+
+    x += 2 * ux;
+
+    vr = _mm_unpacklo_epi16(vrl1, vrl0);
+    /* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */
+
+    s = _mm_shuffle_epi32(vr, _MM_SHUFFLE(1, 0, 3, 2));
+    /* s:  lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */
+
+    vr = _mm_unpackhi_epi8(vr, s);
+    /* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
+     *         lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
+     */
+
+    vr = _mm_maddubs_epi16(vr, vw);
+
+    /* When the weight is 0, the inverse weight is
+     * 128 which can't be represented in a signed byte.
+     * As a result maddubsw computes the following:
+     *
+     *     r = l * -128 + r * 0
+     *
+     * rather than the desired
+     *
+     *     r = l * 128 + r * 0
+     *
+     * We fix this by taking the absolute value of the
+     * result.
+     */
+    // we can drop this if we use lower precision
+
+    vr = _mm_shuffle_epi32(vr, _MM_SHUFFLE(2, 0, 3, 1));
+    /* vr: A0, R0, A1, R1, G0, B0, G1, B1 */
+    _mm_store_si128(b++, vr);
+  }
+
+  if (n == -1) {
+    vrl1 = _mm_setzero_si128();
+    goto final_pixel;
+  }
+
+  line->y = y;
+}
+
+// scale a line of destination pixels
+static uint32_t* ssse3_fetch_bilinear_cover(pixman_iter_t* iter,
+                                            const uint32_t* mask) {
+  pixman_fixed_t fx, ux;
+  bilinear_info_t* info = iter->data;
+  line_t *line0, *line1;
+  int y0, y1;
+  int32_t dist_y;
+  __m128i vw, uvw;
+  int i;
+
+  fx = info->x;
+  ux = iter->image->transform->matrix[0][0];
+
+  y0 = pixman_fixed_to_int(info->y);
+  if (y0 < 0) *(volatile char*)0 = 9;
+  y1 = y0 + 1;
+
+  // clamping in y direction
+  if (y1 >= iter->height) {
+    y1 = iter->height - 1;
+  }
+
+  line0 = &info->lines[y0 & 0x01];
+  line1 = &info->lines[y1 & 0x01];
+
+  if (line0->y != y0) {
+    ssse3_fetch_horizontal(iter->image, line0, y0, fx, ux, iter->width);
+  }
+
+  if (line1->y != y1) {
+    ssse3_fetch_horizontal(iter->image, line1, y1, fx, ux, iter->width);
+  }
+
+#ifdef PIXMAN_STYLE_INTERPOLATION
+  dist_y = pixman_fixed_to_bilinear_weight(info->y);
+  dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS);
+
+  vw = _mm_set_epi16(dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y,
+                     dist_y);
+
+#else
+  // setup the weights for the top (vw) and bottom (uvw) lines
+  dist_y = pixman_fixed_to_bilinear_weight(info->y);
+  // we use 15 instead of 16 because we need an extra bit to handle when the
+  // weights are 0 and 1
+  dist_y <<= (15 - BILINEAR_INTERPOLATION_BITS);
+
+  vw = _mm_set_epi16(dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y,
+                     dist_y);
+
+  dist_y = (1 << BILINEAR_INTERPOLATION_BITS) -
+           pixman_fixed_to_bilinear_weight(info->y);
+  dist_y <<= (15 - BILINEAR_INTERPOLATION_BITS);
+  uvw = _mm_set_epi16(dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y,
+                      dist_y);
+#endif
+
+  for (i = 0; i + 3 < iter->width; i += 4) {
+    __m128i top0 = _mm_load_si128((__m128i*)(line0->buffer + i));
+    __m128i bot0 = _mm_load_si128((__m128i*)(line1->buffer + i));
+    __m128i top1 = _mm_load_si128((__m128i*)(line0->buffer + i + 2));
+    __m128i bot1 = _mm_load_si128((__m128i*)(line1->buffer + i + 2));
+#ifdef PIXMAN_STYLE_INTERPOLATION
+    __m128i r0, r1, tmp, p;
+
+    r0 = _mm_mulhi_epu16(_mm_sub_epi16(bot0, top0), vw);
+    tmp = _mm_cmplt_epi16(bot0, top0);
+    tmp = _mm_and_si128(tmp, vw);
+    r0 = _mm_sub_epi16(r0, tmp);
+    r0 = _mm_add_epi16(r0, top0);
+    r0 = _mm_srli_epi16(r0, BILINEAR_INTERPOLATION_BITS);
+    /* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
+    // r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
+    /* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */
+
+    // tmp = bot1 < top1 ? vw : 0;
+    // r1 = (bot1 - top1)*vw + top1 - tmp
+    // r1 = bot1*vw - vw*top1 + top1 - tmp
+    // r1 = bot1*vw + top1 - vw*top1 - tmp
+    // r1 = bot1*vw + top1*(1 - vw) - tmp
+    r1 = _mm_mulhi_epu16(_mm_sub_epi16(bot1, top1), vw);
+    tmp = _mm_cmplt_epi16(bot1, top1);
+    tmp = _mm_and_si128(tmp, vw);
+    r1 = _mm_sub_epi16(r1, tmp);
+    r1 = _mm_add_epi16(r1, top1);
+    r1 = _mm_srli_epi16(r1, BILINEAR_INTERPOLATION_BITS);
+    // r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1));
+    /* r1: A3 R3 G3 B3 A2 R2 G2 B2 */
+#else
+    __m128i r0, r1, p;
+    top0 = _mm_mulhi_epu16(top0, uvw);
+    bot0 = _mm_mulhi_epu16(bot0, vw);
+    r0 = _mm_add_epi16(top0, bot0);
+    r0 = _mm_srli_epi16(r0, BILINEAR_INTERPOLATION_BITS - 1);
+
+    top1 = _mm_mulhi_epu16(top1, uvw);
+    bot1 = _mm_mulhi_epu16(bot1, vw);
+    r1 = _mm_add_epi16(top1, bot1);
+    r1 = _mm_srli_epi16(r1, BILINEAR_INTERPOLATION_BITS - 1);
+#endif
+
+    p = _mm_packus_epi16(r0, r1);
+    _mm_storeu_si128((__m128i*)(iter->buffer + i), p);
+  }
+
+  while (i < iter->width) {
+    __m128i top0 = _mm_load_si128((__m128i*)(line0->buffer + i));
+    __m128i bot0 = _mm_load_si128((__m128i*)(line1->buffer + i));
+
+#ifdef PIXMAN_STYLE_INTERPOLATION
+    __m128i r0, tmp, p;
+    r0 = _mm_mulhi_epu16(_mm_sub_epi16(bot0, top0), vw);
+    tmp = _mm_cmplt_epi16(bot0, top0);
+    tmp = _mm_and_si128(tmp, vw);
+    r0 = _mm_sub_epi16(r0, tmp);
+    r0 = _mm_add_epi16(r0, top0);
+    r0 = _mm_srli_epi16(r0, BILINEAR_INTERPOLATION_BITS);
+    /* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
+    r0 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(2, 0, 3, 1));
+    /* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */
+#else
+    __m128i r0, p;
+    top0 = _mm_mulhi_epu16(top0, uvw);
+    bot0 = _mm_mulhi_epu16(bot0, vw);
+    r0 = _mm_add_epi16(top0, bot0);
+    r0 = _mm_srli_epi16(r0, BILINEAR_INTERPOLATION_BITS - 1);
+#endif
+
+    p = _mm_packus_epi16(r0, r0);
+
+    if (iter->width - i == 1) {
+      *(uint32_t*)(iter->buffer + i) = _mm_cvtsi128_si32(p);
+      i++;
+    } else {
+      _mm_storel_epi64((__m128i*)(iter->buffer + i), p);
+      i += 2;
+    }
+  }
+
+  info->y += iter->image->transform->matrix[1][1];
+
+  return iter->buffer;
+}
+
+static void ssse3_bilinear_cover_iter_fini(pixman_iter_t* iter) {
+  free(iter->data);
+}
+
+static void ssse3_bilinear_cover_iter_init(pixman_iter_t* iter) {
+  int width = iter->width;
+  bilinear_info_t* info;
+  pixman_vector_t v;
+
+  if (iter->x > PIXMAN_FIXED_INT_MAX || iter->x < PIXMAN_FIXED_INT_MIN ||
+      iter->y > PIXMAN_FIXED_INT_MAX || iter->y < PIXMAN_FIXED_INT_MIN)
+    goto fail;
+
+  /* Reference point is the center of the pixel */
+  v.vector[0] = pixman_int_to_fixed(iter->x) + pixman_fixed_1 / 2;
+  v.vector[1] = pixman_int_to_fixed(iter->y) + pixman_fixed_1 / 2;
+  v.vector[2] = pixman_fixed_1;
+
+  if (!pixman_transform_point_3d(iter->image->transform, &v)) goto fail;
+
+  info = malloc(sizeof(*info) + (2 * width - 1) * sizeof(uint64_t) + 64);
+  if (!info) goto fail;
+
+  info->x = v.vector[0] - pixman_fixed_1 / 2;
+  info->y = v.vector[1] - pixman_fixed_1 / 2;
+
+#define ALIGN(addr) ((void*)((((uintptr_t)(addr)) + 15) & (~15)))
+
+  /* It is safe to set the y coordinates to -1 initially
+   * because COVER_CLIP_BILINEAR ensures that we will only
+   * be asked to fetch lines in the [0, height) interval
+   */
+  info->lines[0].y = -1;
+  info->lines[0].buffer = ALIGN(&(info->data[0]));
+  info->lines[1].y = -1;
+  info->lines[1].buffer = ALIGN(info->lines[0].buffer + width);
+
+  iter->fini = ssse3_bilinear_cover_iter_fini;
+
+  iter->data = info;
+  return;
+
+fail:
+  /* Something went wrong, either a bad matrix or OOM; in such cases,
+   * we don't guarantee any particular rendering.
+   */
+  iter->fini = NULL;
+}
+
+/* scale the src from src_width/height to dest_width/height drawn
+ * into the rectangle x,y width,height
+ * src_stride and dst_stride are 4 byte units */
+bool ssse3_scale_data(uint32_t* src, int src_width, int src_height,
+                      int src_stride, uint32_t* dest, int dest_width,
+                      int dest_height, int dest_stride, int x, int y, int width,
+                      int height) {
+  // XXX: assert(src_width > 1)
+  pixman_transform_t transform = {
+      {{pixman_fixed_1, 0, 0}, {0, pixman_fixed_1, 0}, {0, 0, pixman_fixed_1}}};
+  double width_scale = ((double)src_width) / dest_width;
+  double height_scale = ((double)src_height) / dest_height;
+#define AVOID_PADDING
+#ifdef AVOID_PADDING
+  // scale up by enough that we don't read outside of the bounds of the source
+  // surface currently this is required to avoid reading out of bounds.
+  if (width_scale < 1) {
+    width_scale = (double)(src_width - 1) / dest_width;
+    transform.matrix[0][2] = pixman_fixed_1 / 2;
+  }
+  if (height_scale < 1) {
+    height_scale = (double)(src_height - 1) / dest_height;
+    transform.matrix[1][2] = pixman_fixed_1 / 2;
+  }
+#endif
+  transform.matrix[0][0] = pixman_double_to_fixed(width_scale);
+  transform.matrix[1][1] = pixman_double_to_fixed(height_scale);
+  transform.matrix[2][2] = pixman_fixed_1;
+
+  bits_image_t image;
+  image.bits = src;
+  image.transform = &transform;
+  image.rowstride = src_stride;
+
+  pixman_iter_t iter;
+  iter.image = &image;
+  iter.x = x;
+  iter.y = y;
+  iter.width = width;
+  iter.height = src_height;
+  iter.buffer = dest;
+  iter.data = NULL;
+
+  ssse3_bilinear_cover_iter_init(&iter);
+
+  if (!iter.fini) return false;
+
+  if (iter.data) {
+    for (int iy = 0; iy < height; iy++) {
+      ssse3_fetch_bilinear_cover(&iter, NULL);
+      iter.buffer += dest_stride;
+    }
+    ssse3_bilinear_cover_iter_fini(&iter);
+  }
+  return true;
+}