1 files changed, 904 insertions, 0 deletions
diff --git a/gfx/2d/Blur.cpp b/gfx/2d/Blur.cpp
new file mode 100644
index 0000000000..5de04f7174
--- /dev/null
+++ b/gfx/2d/Blur.cpp
@@ -0,0 +1,904 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "Blur.h"
+
+#include <algorithm>
+#include <math.h>
+#include <string.h>
+
+#include "mozilla/CheckedInt.h"
+#include "NumericTools.h"
+
+#include "2D.h"
+#include "DataSurfaceHelpers.h"
+#include "Tools.h"
+
+#ifdef USE_NEON
+#  include "mozilla/arm.h"
+#endif
+
+namespace mozilla {
+namespace gfx {
+
+/**
+ * Helper function to process each row of the box blur.
+ * It takes care of transposing the data on input or output depending
+ * on whether we intend a horizontal or vertical blur, and whether we're
+ * reading from the initial source or writing to the final destination.
+ * It allows starting or ending anywhere within the row to accomodate
+ * a skip rect.
+ */
+template <bool aTransposeInput, bool aTransposeOutput>
+static inline void BoxBlurRow(const uint8_t* aInput, uint8_t* aOutput,
+                              int32_t aLeftLobe, int32_t aRightLobe,
+                              int32_t aWidth, int32_t aStride, int32_t aStart,
+                              int32_t aEnd) {
+  // If the input or output is transposed, then we will move down a row
+  // for each step, instead of moving over a column. Since these values
+  // only depend on a template parameter, they will more easily get
+  // copy-propagated in the non-transposed case, which is why they
+  // are not passed as parameters.
+  const int32_t inputStep = aTransposeInput ? aStride : 1;
+  const int32_t outputStep = aTransposeOutput ? aStride : 1;
+
+  // We need to sample aLeftLobe pixels to the left and aRightLobe pixels
+  // to the right of the current position, then average them. So this is
+  // the size of the total width of this filter.
+  const int32_t boxSize = aLeftLobe + aRightLobe + 1;
+
+  // Instead of dividing the pixel sum by boxSize to average, we can just
+  // compute a scale that will normalize the result so that it can be quickly
+  // shifted into the desired range.
+  const uint32_t reciprocal = (1 << 24) / boxSize;
+
+  // The shift would normally truncate the result, whereas we would rather
+  // prefer to round the result to the closest increment. By adding 0.5 units
+  // to the initial sum, we bias the sum so that it will be rounded by the
+  // truncation instead.
+  uint32_t alphaSum = (boxSize + 1) / 2;
+
+  // We process the row with a moving filter, keeping a sum (alphaSum) of
+  // boxSize pixels. As we move over a pixel, we need to add on a pixel
+  // from the right extreme of the window that moved into range, and subtract
+  // off a pixel from the left extreme of window that moved out of range.
+  // But first, we need to initialization alphaSum to the contents of
+  // the window before we can get going. If the window moves out of bounds
+  // of the row, we clamp each sample to be the closest pixel from within
+  // row bounds, so the 0th and aWidth-1th pixel.
+  int32_t initLeft = aStart - aLeftLobe;
+  if (initLeft < 0) {
+    // If the left lobe samples before the row, add in clamped samples.
+    alphaSum += -initLeft * aInput[0];
+    initLeft = 0;
+  }
+  int32_t initRight = aStart + boxSize - aLeftLobe;
+  if (initRight > aWidth) {
+    // If the right lobe samples after the row, add in clamped samples.
+    alphaSum += (initRight - aWidth) * aInput[(aWidth - 1) * inputStep];
+    initRight = aWidth;
+  }
+  // Finally, add in all the valid, non-clamped samples to fill up the
+  // rest of the window.
+  const uint8_t* src = &aInput[initLeft * inputStep];
+  const uint8_t* iterEnd = &aInput[initRight * inputStep];
+
+#define INIT_ITER   \
+  alphaSum += *src; \
+  src += inputStep;
+
+  // We unroll the per-pixel loop here substantially. The amount of work
+  // done per sample is so small that the cost of a loop condition check
+  // and a branch can substantially add to or even dominate the performance
+  // of the loop.
+  while (src + 16 * inputStep <= iterEnd) {
+    INIT_ITER;
+    INIT_ITER;
+    INIT_ITER;
+    INIT_ITER;
+    INIT_ITER;
+    INIT_ITER;
+    INIT_ITER;
+    INIT_ITER;
+    INIT_ITER;
+    INIT_ITER;
+    INIT_ITER;
+    INIT_ITER;
+    INIT_ITER;
+    INIT_ITER;
+    INIT_ITER;
+    INIT_ITER;
+  }
+  while (src < iterEnd) {
+    INIT_ITER;
+  }
+
+  // Now we start moving the window over the row. We will be accessing
+  // pixels form aStart - aLeftLobe up to aEnd + aRightLobe, which may be
+  // out of bounds of the row. To avoid having to check within the inner
+  // loops if we are in bound, we instead compute the points at which
+  // we will move out of bounds of the row on the left side (splitLeft)
+  // and right side (splitRight).
+  int32_t splitLeft = std::min(std::max(aLeftLobe, aStart), aEnd);
+  int32_t splitRight =
+      std::min(std::max(aWidth - (boxSize - aLeftLobe), aStart), aEnd);
+  // If the filter window is actually large than the size of the row,
+  // there will be a middle area of overlap where the leftmost and rightmost
+  // pixel of the filter will both be outside the row. In this case, we need
+  // to invert the splits so that splitLeft <= splitRight.
+  if (boxSize > aWidth) {
+    std::swap(splitLeft, splitRight);
+  }
+
+  // Process all pixels up to splitLeft that would sample before the start of
+  // the row. Note that because inputStep and outputStep may not be a const 1
+  // value, it is more performant to increment pointers here for the source and
+  // destination rather than use a loop counter, since doing so would entail an
+  // expensive multiplication that significantly slows down the loop.
+  uint8_t* dst = &aOutput[aStart * outputStep];
+  iterEnd = &aOutput[splitLeft * outputStep];
+  src = &aInput[(aStart + boxSize - aLeftLobe) * inputStep];
+  uint8_t firstVal = aInput[0];
+
+#define LEFT_ITER                       \
+  *dst = (alphaSum * reciprocal) >> 24; \
+  alphaSum += *src - firstVal;          \
+  dst += outputStep;                    \
+  src += inputStep;
+
+  while (dst + 16 * outputStep <= iterEnd) {
+    LEFT_ITER;
+    LEFT_ITER;
+    LEFT_ITER;
+    LEFT_ITER;
+    LEFT_ITER;
+    LEFT_ITER;
+    LEFT_ITER;
+    LEFT_ITER;
+    LEFT_ITER;
+    LEFT_ITER;
+    LEFT_ITER;
+    LEFT_ITER;
+    LEFT_ITER;
+    LEFT_ITER;
+    LEFT_ITER;
+    LEFT_ITER;
+  }
+  while (dst < iterEnd) {
+    LEFT_ITER;
+  }
+
+  // Process all pixels between splitLeft and splitRight.
+  iterEnd = &aOutput[splitRight * outputStep];
+  if (boxSize <= aWidth) {
+    // The filter window is smaller than the row size, so the leftmost and
+    // rightmost samples are both within row bounds.
+    src = &aInput[(splitLeft - aLeftLobe) * inputStep];
+    int32_t boxStep = boxSize * inputStep;
+
+#define CENTER_ITER                     \
+  *dst = (alphaSum * reciprocal) >> 24; \
+  alphaSum += src[boxStep] - *src;      \
+  dst += outputStep;                    \
+  src += inputStep;
+
+    while (dst + 16 * outputStep <= iterEnd) {
+      CENTER_ITER;
+      CENTER_ITER;
+      CENTER_ITER;
+      CENTER_ITER;
+      CENTER_ITER;
+      CENTER_ITER;
+      CENTER_ITER;
+      CENTER_ITER;
+      CENTER_ITER;
+      CENTER_ITER;
+      CENTER_ITER;
+      CENTER_ITER;
+      CENTER_ITER;
+      CENTER_ITER;
+      CENTER_ITER;
+      CENTER_ITER;
+    }
+    while (dst < iterEnd) {
+      CENTER_ITER;
+    }
+  } else {
+    // The filter window is larger than the row size, and we're in the area of
+    // split overlap. So the leftmost and rightmost samples are both out of
+    // bounds and need to be clamped. We can just precompute the difference here
+    // consequently.
+    int32_t firstLastDiff = aInput[(aWidth - 1) * inputStep] - aInput[0];
+    while (dst < iterEnd) {
+      *dst = (alphaSum * reciprocal) >> 24;
+      alphaSum += firstLastDiff;
+      dst += outputStep;
+    }
+  }
+
+  // Process all remaining pixels after splitRight that would sample after the
+  // row end.
+  iterEnd = &aOutput[aEnd * outputStep];
+  src = &aInput[(splitRight - aLeftLobe) * inputStep];
+  uint8_t lastVal = aInput[(aWidth - 1) * inputStep];
+
+#define RIGHT_ITER                      \
+  *dst = (alphaSum * reciprocal) >> 24; \
+  alphaSum += lastVal - *src;           \
+  dst += outputStep;                    \
+  src += inputStep;
+
+  while (dst + 16 * outputStep <= iterEnd) {
+    RIGHT_ITER;
+    RIGHT_ITER;
+    RIGHT_ITER;
+    RIGHT_ITER;
+    RIGHT_ITER;
+    RIGHT_ITER;
+    RIGHT_ITER;
+    RIGHT_ITER;
+    RIGHT_ITER;
+    RIGHT_ITER;
+    RIGHT_ITER;
+    RIGHT_ITER;
+    RIGHT_ITER;
+    RIGHT_ITER;
+    RIGHT_ITER;
+    RIGHT_ITER;
+  }
+  while (dst < iterEnd) {
+    RIGHT_ITER;
+  }
+}
+
+/**
+ * Box blur involves looking at one pixel, and setting its value to the average
+ * of its neighbouring pixels. This is meant to provide a 3-pass approximation
+ * of a Gaussian blur.
+ * @param aTranspose Whether to transpose the buffer when reading and writing
+ *                   to it.
+ * @param aData The buffer to be blurred.
+ * @param aLobes The number of pixels to blend on the left and right for each of
+ *               3 passes.
+ * @param aWidth The number of columns in the buffers.
+ * @param aRows The number of rows in the buffers.
+ * @param aStride The stride of the buffer.
+ */
+template <bool aTranspose>
+static void BoxBlur(uint8_t* aData, const int32_t aLobes[3][2], int32_t aWidth,
+                    int32_t aRows, int32_t aStride, IntRect aSkipRect) {
+  if (aTranspose) {
+    std::swap(aWidth, aRows);
+    aSkipRect.Swap();
+  }
+
+  MOZ_ASSERT(aWidth > 0);
+
+  // All three passes of the box blur that approximate the Gaussian are done
+  // on each row in turn, so we only need two temporary row buffers to process
+  // each row, instead of a full-sized buffer. Data moves from the source to the
+  // first temporary, from the first temporary to the second, then from the
+  // second back to the destination. This way is more cache-friendly than
+  // processing whe whole buffer in each pass and thus yields a nice speedup.
+  uint8_t* tmpRow = new (std::nothrow) uint8_t[2 * aWidth];
+  if (!tmpRow) {
+    return;
+  }
+  uint8_t* tmpRow2 = tmpRow + aWidth;
+
+  const int32_t stride = aTranspose ? 1 : aStride;
+  bool skipRectCoversWholeRow =
+      0 >= aSkipRect.X() && aWidth <= aSkipRect.XMost();
+
+  for (int32_t y = 0; y < aRows; y++) {
+    // Check whether the skip rect intersects this row. If the skip
+    // rect covers the whole surface in this row, we can avoid
+    // this row entirely (and any others along the skip rect).
+    bool inSkipRectY = aSkipRect.ContainsY(y);
+    if (inSkipRectY && skipRectCoversWholeRow) {
+      aData += stride * (aSkipRect.YMost() - y);
+      y = aSkipRect.YMost() - 1;
+      continue;
+    }
+
+    // Read in data from the source transposed if necessary.
+    BoxBlurRow<aTranspose, false>(aData, tmpRow, aLobes[0][0], aLobes[0][1],
+                                  aWidth, aStride, 0, aWidth);
+
+    // For the middle pass, the data is already pre-transposed and does not need
+    // to be post-transposed yet.
+    BoxBlurRow<false, false>(tmpRow, tmpRow2, aLobes[1][0], aLobes[1][1],
+                             aWidth, aStride, 0, aWidth);
+
+    // Write back data to the destination transposed if necessary too.
+    // Make sure not to overwrite the skip rect by only outputting to the
+    // destination before and after the skip rect, if requested.
+    int32_t skipStart =
+        inSkipRectY ? std::min(std::max(aSkipRect.X(), 0), aWidth) : aWidth;
+    int32_t skipEnd = std::max(skipStart, aSkipRect.XMost());
+    if (skipStart > 0) {
+      BoxBlurRow<false, aTranspose>(tmpRow2, aData, aLobes[2][0], aLobes[2][1],
+                                    aWidth, aStride, 0, skipStart);
+    }
+    if (skipEnd < aWidth) {
+      BoxBlurRow<false, aTranspose>(tmpRow2, aData, aLobes[2][0], aLobes[2][1],
+                                    aWidth, aStride, skipEnd, aWidth);
+    }
+
+    aData += stride;
+  }
+
+  delete[] tmpRow;
+}
+
+static void ComputeLobes(int32_t aRadius, int32_t aLobes[3][2]) {
+  int32_t major, minor, final;
+
+  /* See http://www.w3.org/TR/SVG/filters.html#feGaussianBlur for
+   * some notes about approximating the Gaussian blur with box-blurs.
+   * The comments below are in the terminology of that page.
+   */
+  int32_t z = aRadius / 3;
+  switch (aRadius % 3) {
+    case 0:
+      // aRadius = z*3; choose d = 2*z + 1
+      major = minor = final = z;
+      break;
+    case 1:
+      // aRadius = z*3 + 1
+      // This is a tricky case since there is no value of d which will
+      // yield a radius of exactly aRadius. If d is odd, i.e. d=2*k + 1
+      // for some integer k, then the radius will be 3*k. If d is even,
+      // i.e. d=2*k, then the radius will be 3*k - 1.
+      // So we have to choose values that don't match the standard
+      // algorithm.
+      major = z + 1;
+      minor = final = z;
+      break;
+    case 2:
+      // aRadius = z*3 + 2; choose d = 2*z + 2
+      major = final = z + 1;
+      minor = z;
+      break;
+    default:
+      // Mathematical impossibility!
+      MOZ_ASSERT(false);
+      major = minor = final = 0;
+  }
+  MOZ_ASSERT(major + minor + final == aRadius);
+
+  aLobes[0][0] = major;
+  aLobes[0][1] = minor;
+  aLobes[1][0] = minor;
+  aLobes[1][1] = major;
+  aLobes[2][0] = final;
+  aLobes[2][1] = final;
+}
+
+static void SpreadHorizontal(uint8_t* aInput, uint8_t* aOutput, int32_t aRadius,
+                             int32_t aWidth, int32_t aRows, int32_t aStride,
+                             const IntRect& aSkipRect) {
+  if (aRadius == 0) {
+    memcpy(aOutput, aInput, aStride * aRows);
+    return;
+  }
+
+  bool skipRectCoversWholeRow =
+      0 >= aSkipRect.X() && aWidth <= aSkipRect.XMost();
+  for (int32_t y = 0; y < aRows; y++) {
+    // Check whether the skip rect intersects this row. If the skip
+    // rect covers the whole surface in this row, we can avoid
+    // this row entirely (and any others along the skip rect).
+    bool inSkipRectY = aSkipRect.ContainsY(y);
+    if (inSkipRectY && skipRectCoversWholeRow) {
+      y = aSkipRect.YMost() - 1;
+      continue;
+    }
+
+    for (int32_t x = 0; x < aWidth; x++) {
+      // Check whether we are within the skip rect. If so, go
+      // to the next point outside the skip rect.
+      if (inSkipRectY && aSkipRect.ContainsX(x)) {
+        x = aSkipRect.XMost();
+        if (x >= aWidth) break;
+      }
+
+      int32_t sMin = std::max(x - aRadius, 0);
+      int32_t sMax = std::min(x + aRadius, aWidth - 1);
+      int32_t v = 0;
+      for (int32_t s = sMin; s <= sMax; ++s) {
+        v = std::max<int32_t>(v, aInput[aStride * y + s]);
+      }
+      aOutput[aStride * y + x] = v;
+    }
+  }
+}
+
+static void SpreadVertical(uint8_t* aInput, uint8_t* aOutput, int32_t aRadius,
+                           int32_t aWidth, int32_t aRows, int32_t aStride,
+                           const IntRect& aSkipRect) {
+  if (aRadius == 0) {
+    memcpy(aOutput, aInput, aStride * aRows);
+    return;
+  }
+
+  bool skipRectCoversWholeColumn =
+      0 >= aSkipRect.Y() && aRows <= aSkipRect.YMost();
+  for (int32_t x = 0; x < aWidth; x++) {
+    bool inSkipRectX = aSkipRect.ContainsX(x);
+    if (inSkipRectX && skipRectCoversWholeColumn) {
+      x = aSkipRect.XMost() - 1;
+      continue;
+    }
+
+    for (int32_t y = 0; y < aRows; y++) {
+      // Check whether we are within the skip rect. If so, go
+      // to the next point outside the skip rect.
+      if (inSkipRectX && aSkipRect.ContainsY(y)) {
+        y = aSkipRect.YMost();
+        if (y >= aRows) break;
+      }
+
+      int32_t sMin = std::max(y - aRadius, 0);
+      int32_t sMax = std::min(y + aRadius, aRows - 1);
+      int32_t v = 0;
+      for (int32_t s = sMin; s <= sMax; ++s) {
+        v = std::max<int32_t>(v, aInput[aStride * s + x]);
+      }
+      aOutput[aStride * y + x] = v;
+    }
+  }
+}
+
+CheckedInt<int32_t> AlphaBoxBlur::RoundUpToMultipleOf4(int32_t aVal) {
+  CheckedInt<int32_t> val(aVal);
+
+  val += 3;
+  val /= 4;
+  val *= 4;
+
+  return val;
+}
+
+AlphaBoxBlur::AlphaBoxBlur(const Rect& aRect, const IntSize& aSpreadRadius,
+                           const IntSize& aBlurRadius, const Rect* aDirtyRect,
+                           const Rect* aSkipRect)
+    : mStride(0), mSurfaceAllocationSize(0) {
+  Init(aRect, aSpreadRadius, aBlurRadius, aDirtyRect, aSkipRect);
+}
+
+AlphaBoxBlur::AlphaBoxBlur()
+    : mStride(0), mSurfaceAllocationSize(0), mHasDirtyRect(false) {}
+
+void AlphaBoxBlur::Init(const Rect& aRect, const IntSize& aSpreadRadius,
+                        const IntSize& aBlurRadius, const Rect* aDirtyRect,
+                        const Rect* aSkipRect) {
+  mSpreadRadius = aSpreadRadius;
+  mBlurRadius = aBlurRadius;
+
+  Rect rect(aRect);
+  rect.Inflate(Size(aBlurRadius + aSpreadRadius));
+  rect.RoundOut();
+
+  if (aDirtyRect) {
+    // If we get passed a dirty rect from layout, we can minimize the
+    // shadow size and make painting faster.
+    mHasDirtyRect = true;
+    mDirtyRect = *aDirtyRect;
+    Rect requiredBlurArea = mDirtyRect.Intersect(rect);
+    requiredBlurArea.Inflate(Size(aBlurRadius + aSpreadRadius));
+    rect = requiredBlurArea.Intersect(rect);
+  } else {
+    mHasDirtyRect = false;
+  }
+
+  mRect = TruncatedToInt(rect);
+  if (mRect.IsEmpty()) {
+    return;
+  }
+
+  if (aSkipRect) {
+    // If we get passed a skip rect, we can lower the amount of
+    // blurring/spreading we need to do. We convert it to IntRect to avoid
+    // expensive int<->float conversions if we were to use Rect instead.
+    Rect skipRect = *aSkipRect;
+    skipRect.Deflate(Size(aBlurRadius + aSpreadRadius));
+    mSkipRect = RoundedIn(skipRect);
+    mSkipRect = mSkipRect.Intersect(mRect);
+    if (mSkipRect.IsEqualInterior(mRect)) {
+      return;
+    }
+
+    mSkipRect -= mRect.TopLeft();
+    // Ensure the skip rect is 4-pixel-aligned in the x axis, so that all our
+    // accesses later are aligned as well, see bug 1622113.
+    mSkipRect.SetLeftEdge(RoundUpToMultiple(mSkipRect.X(), 4));
+    mSkipRect.SetRightEdge(RoundDownToMultiple(mSkipRect.XMost(), 4));
+    if (mSkipRect.IsEmpty()) {
+      mSkipRect = IntRect();
+    }
+  } else {
+    mSkipRect = IntRect();
+  }
+
+  CheckedInt<int32_t> stride = RoundUpToMultipleOf4(mRect.Width());
+  if (stride.isValid()) {
+    mStride = stride.value();
+
+    // We need to leave room for an additional 3 bytes for a potential overrun
+    // in our blurring code.
+    size_t size = BufferSizeFromStrideAndHeight(mStride, mRect.Height(), 3);
+    if (size != 0) {
+      mSurfaceAllocationSize = size;
+    }
+  }
+}
+
+AlphaBoxBlur::AlphaBoxBlur(const Rect& aRect, int32_t aStride, float aSigmaX,
+                           float aSigmaY)
+    : mRect(TruncatedToInt(aRect)),
+      mSpreadRadius(),
+      mBlurRadius(CalculateBlurRadius(Point(aSigmaX, aSigmaY))),
+      mStride(aStride),
+      mSurfaceAllocationSize(0),
+      mHasDirtyRect(false) {
+  IntRect intRect;
+  if (aRect.ToIntRect(&intRect)) {
+    size_t minDataSize =
+        BufferSizeFromStrideAndHeight(intRect.Width(), intRect.Height());
+    if (minDataSize != 0) {
+      mSurfaceAllocationSize = minDataSize;
+    }
+  }
+}
+
+AlphaBoxBlur::~AlphaBoxBlur() = default;
+
+IntSize AlphaBoxBlur::GetSize() const {
+  IntSize size(mRect.Width(), mRect.Height());
+  return size;
+}
+
+int32_t AlphaBoxBlur::GetStride() const { return mStride; }
+
+IntRect AlphaBoxBlur::GetRect() const { return mRect; }
+
+Rect* AlphaBoxBlur::GetDirtyRect() {
+  if (mHasDirtyRect) {
+    return &mDirtyRect;
+  }
+
+  return nullptr;
+}
+
+size_t AlphaBoxBlur::GetSurfaceAllocationSize() const {
+  return mSurfaceAllocationSize;
+}
+
+void AlphaBoxBlur::Blur(uint8_t* aData) const {
+  if (!aData) {
+    return;
+  }
+
+  // no need to do all this if not blurring or spreading
+  if (mBlurRadius != IntSize(0, 0) || mSpreadRadius != IntSize(0, 0)) {
+    int32_t stride = GetStride();
+
+    IntSize size = GetSize();
+
+    if (mSpreadRadius.width > 0 || mSpreadRadius.height > 0) {
+      // No need to use CheckedInt here - we have validated it in the
+      // constructor.
+      size_t szB = stride * size.height;
+      uint8_t* tmpData = new (std::nothrow) uint8_t[szB];
+
+      if (!tmpData) {
+        return;
+      }
+
+      memset(tmpData, 0, szB);
+
+      SpreadHorizontal(aData, tmpData, mSpreadRadius.width, size.width,
+                       size.height, stride, mSkipRect);
+      SpreadVertical(tmpData, aData, mSpreadRadius.height, size.width,
+                     size.height, stride, mSkipRect);
+
+      delete[] tmpData;
+    }
+
+    int32_t horizontalLobes[3][2];
+    ComputeLobes(mBlurRadius.width, horizontalLobes);
+    int32_t verticalLobes[3][2];
+    ComputeLobes(mBlurRadius.height, verticalLobes);
+
+    // We want to allow for some extra space on the left for alignment reasons.
+    int32_t maxLeftLobe =
+        RoundUpToMultipleOf4(horizontalLobes[0][0] + 1).value();
+
+    IntSize integralImageSize(
+        size.width + maxLeftLobe + horizontalLobes[1][1],
+        size.height + verticalLobes[0][0] + verticalLobes[1][1] + 1);
+
+    if ((integralImageSize.width * integralImageSize.height) > (1 << 24)) {
+      // Fallback to old blurring code when the surface is so large it may
+      // overflow our integral image!
+      if (mBlurRadius.width > 0) {
+        BoxBlur<false>(aData, horizontalLobes, size.width, size.height, stride,
+                       mSkipRect);
+      }
+      if (mBlurRadius.height > 0) {
+        BoxBlur<true>(aData, verticalLobes, size.width, size.height, stride,
+                      mSkipRect);
+      }
+    } else {
+      size_t integralImageStride =
+          GetAlignedStride<16>(integralImageSize.width, 4);
+      if (integralImageStride == 0) {
+        return;
+      }
+
+      // We need to leave room for an additional 12 bytes for a maximum overrun
+      // of 3 pixels in the blurring code.
+      size_t bufLen = BufferSizeFromStrideAndHeight(
+          integralImageStride, integralImageSize.height, 12);
+      if (bufLen == 0) {
+        return;
+      }
+      // bufLen is a byte count, but here we want a multiple of 32-bit ints, so
+      // we divide by 4.
+      AlignedArray<uint32_t> integralImage((bufLen / 4) +
+                                           ((bufLen % 4) ? 1 : 0));
+
+      if (!integralImage) {
+        return;
+      }
+
+#ifdef USE_SSE2
+      if (Factory::HasSSE2()) {
+        BoxBlur_SSE2(aData, horizontalLobes[0][0], horizontalLobes[0][1],
+                     verticalLobes[0][0], verticalLobes[0][1], integralImage,
+                     integralImageStride);
+        BoxBlur_SSE2(aData, horizontalLobes[1][0], horizontalLobes[1][1],
+                     verticalLobes[1][0], verticalLobes[1][1], integralImage,
+                     integralImageStride);
+        BoxBlur_SSE2(aData, horizontalLobes[2][0], horizontalLobes[2][1],
+                     verticalLobes[2][0], verticalLobes[2][1], integralImage,
+                     integralImageStride);
+      } else
+#endif
+#ifdef USE_NEON
+          if (mozilla::supports_neon()) {
+        BoxBlur_NEON(aData, horizontalLobes[0][0], horizontalLobes[0][1],
+                     verticalLobes[0][0], verticalLobes[0][1], integralImage,
+                     integralImageStride);
+        BoxBlur_NEON(aData, horizontalLobes[1][0], horizontalLobes[1][1],
+                     verticalLobes[1][0], verticalLobes[1][1], integralImage,
+                     integralImageStride);
+        BoxBlur_NEON(aData, horizontalLobes[2][0], horizontalLobes[2][1],
+                     verticalLobes[2][0], verticalLobes[2][1], integralImage,
+                     integralImageStride);
+      } else
+#endif
+      {
+#ifdef _MIPS_ARCH_LOONGSON3A
+        BoxBlur_LS3(aData, horizontalLobes[0][0], horizontalLobes[0][1],
+                    verticalLobes[0][0], verticalLobes[0][1], integralImage,
+                    integralImageStride);
+        BoxBlur_LS3(aData, horizontalLobes[1][0], horizontalLobes[1][1],
+                    verticalLobes[1][0], verticalLobes[1][1], integralImage,
+                    integralImageStride);
+        BoxBlur_LS3(aData, horizontalLobes[2][0], horizontalLobes[2][1],
+                    verticalLobes[2][0], verticalLobes[2][1], integralImage,
+                    integralImageStride);
+#else
+        BoxBlur_C(aData, horizontalLobes[0][0], horizontalLobes[0][1],
+                  verticalLobes[0][0], verticalLobes[0][1], integralImage,
+                  integralImageStride);
+        BoxBlur_C(aData, horizontalLobes[1][0], horizontalLobes[1][1],
+                  verticalLobes[1][0], verticalLobes[1][1], integralImage,
+                  integralImageStride);
+        BoxBlur_C(aData, horizontalLobes[2][0], horizontalLobes[2][1],
+                  verticalLobes[2][0], verticalLobes[2][1], integralImage,
+                  integralImageStride);
+#endif
+      }
+    }
+  }
+}
+
+MOZ_ALWAYS_INLINE void GenerateIntegralRow(uint32_t* aDest,
+                                           const uint8_t* aSource,
+                                           uint32_t* aPreviousRow,
+                                           const uint32_t& aSourceWidth,
+                                           const uint32_t& aLeftInflation,
+                                           const uint32_t& aRightInflation) {
+  uint32_t currentRowSum = 0;
+  uint32_t pixel = aSource[0];
+  for (uint32_t x = 0; x < aLeftInflation; x++) {
+    currentRowSum += pixel;
+    *aDest++ = currentRowSum + *aPreviousRow++;
+  }
+  for (uint32_t x = aLeftInflation; x < (aSourceWidth + aLeftInflation);
+       x += 4) {
+    uint32_t alphaValues = *(uint32_t*)(aSource + (x - aLeftInflation));
+#if defined WORDS_BIGENDIAN || defined IS_BIG_ENDIAN || defined __BIG_ENDIAN__
+    currentRowSum += (alphaValues >> 24) & 0xff;
+    *aDest++ = *aPreviousRow++ + currentRowSum;
+    currentRowSum += (alphaValues >> 16) & 0xff;
+    *aDest++ = *aPreviousRow++ + currentRowSum;
+    currentRowSum += (alphaValues >> 8) & 0xff;
+    *aDest++ = *aPreviousRow++ + currentRowSum;
+    currentRowSum += alphaValues & 0xff;
+    *aDest++ = *aPreviousRow++ + currentRowSum;
+#else
+    currentRowSum += alphaValues & 0xff;
+    *aDest++ = *aPreviousRow++ + currentRowSum;
+    alphaValues >>= 8;
+    currentRowSum += alphaValues & 0xff;
+    *aDest++ = *aPreviousRow++ + currentRowSum;
+    alphaValues >>= 8;
+    currentRowSum += alphaValues & 0xff;
+    *aDest++ = *aPreviousRow++ + currentRowSum;
+    alphaValues >>= 8;
+    currentRowSum += alphaValues & 0xff;
+    *aDest++ = *aPreviousRow++ + currentRowSum;
+#endif
+  }
+  pixel = aSource[aSourceWidth - 1];
+  for (uint32_t x = (aSourceWidth + aLeftInflation);
+       x < (aSourceWidth + aLeftInflation + aRightInflation); x++) {
+    currentRowSum += pixel;
+    *aDest++ = currentRowSum + *aPreviousRow++;
+  }
+}
+
+MOZ_ALWAYS_INLINE void GenerateIntegralImage_C(
+    int32_t aLeftInflation, int32_t aRightInflation, int32_t aTopInflation,
+    int32_t aBottomInflation, uint32_t* aIntegralImage,
+    size_t aIntegralImageStride, uint8_t* aSource, int32_t aSourceStride,
+    const IntSize& aSize) {
+  uint32_t stride32bit = aIntegralImageStride / 4;
+
+  IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
+                            aSize.height + aTopInflation + aBottomInflation);
+
+  memset(aIntegralImage, 0, aIntegralImageStride);
+
+  GenerateIntegralRow(aIntegralImage, aSource, aIntegralImage, aSize.width,
+                      aLeftInflation, aRightInflation);
+  for (int y = 1; y < aTopInflation + 1; y++) {
+    GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource,
+                        aIntegralImage + (y - 1) * stride32bit, aSize.width,
+                        aLeftInflation, aRightInflation);
+  }
+
+  for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
+    GenerateIntegralRow(aIntegralImage + (y * stride32bit),
+                        aSource + aSourceStride * (y - aTopInflation),
+                        aIntegralImage + (y - 1) * stride32bit, aSize.width,
+                        aLeftInflation, aRightInflation);
+  }
+
+  if (aBottomInflation) {
+    for (int y = (aSize.height + aTopInflation); y < integralImageSize.height;
+         y++) {
+      GenerateIntegralRow(aIntegralImage + (y * stride32bit),
+                          aSource + ((aSize.height - 1) * aSourceStride),
+                          aIntegralImage + (y - 1) * stride32bit, aSize.width,
+                          aLeftInflation, aRightInflation);
+    }
+  }
+}
+
+/**
+ * Attempt to do an in-place box blur using an integral image.
+ */
+void AlphaBoxBlur::BoxBlur_C(uint8_t* aData, int32_t aLeftLobe,
+                             int32_t aRightLobe, int32_t aTopLobe,
+                             int32_t aBottomLobe, uint32_t* aIntegralImage,
+                             size_t aIntegralImageStride) const {
+  IntSize size = GetSize();
+
+  MOZ_ASSERT(size.width > 0);
+
+  // Our 'left' or 'top' lobe will include the current pixel. i.e. when
+  // looking at an integral image the value of a pixel at 'x,y' is calculated
+  // using the value of the integral image values above/below that.
+  aLeftLobe++;
+  aTopLobe++;
+  int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);
+
+  MOZ_ASSERT(boxSize > 0);
+
+  if (boxSize == 1) {
+    return;
+  }
+
+  int32_t stride32bit = aIntegralImageStride / 4;
+
+  int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();
+
+  GenerateIntegralImage_C(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
+                          aIntegralImage, aIntegralImageStride, aData, mStride,
+                          size);
+
+  uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);
+
+  uint32_t* innerIntegral =
+      aIntegralImage + (aTopLobe * stride32bit) + leftInflation;
+
+  // Storing these locally makes this about 30% faster! Presumably the compiler
+  // can't be sure we're not altering the member variables in this loop.
+  IntRect skipRect = mSkipRect;
+  uint8_t* data = aData;
+  int32_t stride = mStride;
+  for (int32_t y = 0; y < size.height; y++) {
+    // Not using ContainsY(y) because we do not skip y == skipRect.Y()
+    // although that may not be done on purpose
+    bool inSkipRectY = y > skipRect.Y() && y < skipRect.YMost();
+
+    uint32_t* topLeftBase =
+        innerIntegral + ((y - aTopLobe) * stride32bit - aLeftLobe);
+    uint32_t* topRightBase =
+        innerIntegral + ((y - aTopLobe) * stride32bit + aRightLobe);
+    uint32_t* bottomRightBase =
+        innerIntegral + ((y + aBottomLobe) * stride32bit + aRightLobe);
+    uint32_t* bottomLeftBase =
+        innerIntegral + ((y + aBottomLobe) * stride32bit - aLeftLobe);
+
+    for (int32_t x = 0; x < size.width; x++) {
+      // Not using ContainsX(x) because we do not skip x == skipRect.X()
+      // although that may not be done on purpose
+      if (inSkipRectY && x > skipRect.X() && x < skipRect.XMost()) {
+        x = skipRect.XMost() - 1;
+        // Trigger early jump on coming loop iterations, this will be reset
+        // next line anyway.
+        inSkipRectY = false;
+        continue;
+      }
+      int32_t topLeft = topLeftBase[x];
+      int32_t topRight = topRightBase[x];
+      int32_t bottomRight = bottomRightBase[x];
+      int32_t bottomLeft = bottomLeftBase[x];
+
+      uint32_t value = bottomRight - topRight - bottomLeft;
+      value += topLeft;
+
+      data[stride * y + x] =
+          (uint64_t(reciprocal) * value + (uint64_t(1) << 31)) >> 32;
+    }
+  }
+}
+
+/**
+ * Compute the box blur size (which we're calling the blur radius) from
+ * the standard deviation.
+ *
+ * Much of this, the 3 * sqrt(2 * pi) / 4, is the known value for
+ * approximating a Gaussian using box blurs.  This yields quite a good
+ * approximation for a Gaussian.  Then we multiply this by 1.5 since our
+ * code wants the radius of the entire triple-box-blur kernel instead of
+ * the diameter of an individual box blur.  For more details, see:
+ *   http://www.w3.org/TR/SVG11/filters.html#feGaussianBlurElement
+ *   https://bugzilla.mozilla.org/show_bug.cgi?id=590039#c19
+ */
+static const Float GAUSSIAN_SCALE_FACTOR =
+    Float((3 * sqrt(2 * M_PI) / 4) * 1.5);
+
+IntSize AlphaBoxBlur::CalculateBlurRadius(const Point& aStd) {
+  IntSize size(
+      static_cast<int32_t>(floor(aStd.x * GAUSSIAN_SCALE_FACTOR + 0.5f)),
+      static_cast<int32_t>(floor(aStd.y * GAUSSIAN_SCALE_FACTOR + 0.5f)));
+
+  return size;
+}
+
+Float AlphaBoxBlur::CalculateBlurSigma(int32_t aBlurRadius) {
+  return aBlurRadius / GAUSSIAN_SCALE_FACTOR;
+}
+
+}  // namespace gfx
+}  // namespace mozilla