/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* vim: set ts=8 sts=2 et sw=2 tw=80: */ /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #include "Blur.h" #include #include #include #include "mozilla/CheckedInt.h" #include "NumericTools.h" #include "2D.h" #include "DataSurfaceHelpers.h" #include "Tools.h" #ifdef USE_NEON # include "mozilla/arm.h" #endif namespace mozilla { namespace gfx { /** * Helper function to process each row of the box blur. * It takes care of transposing the data on input or output depending * on whether we intend a horizontal or vertical blur, and whether we're * reading from the initial source or writing to the final destination. * It allows starting or ending anywhere within the row to accomodate * a skip rect. */ template static inline void BoxBlurRow(const uint8_t* aInput, uint8_t* aOutput, int32_t aLeftLobe, int32_t aRightLobe, int32_t aWidth, int32_t aStride, int32_t aStart, int32_t aEnd) { // If the input or output is transposed, then we will move down a row // for each step, instead of moving over a column. Since these values // only depend on a template parameter, they will more easily get // copy-propagated in the non-transposed case, which is why they // are not passed as parameters. const int32_t inputStep = aTransposeInput ? aStride : 1; const int32_t outputStep = aTransposeOutput ? aStride : 1; // We need to sample aLeftLobe pixels to the left and aRightLobe pixels // to the right of the current position, then average them. So this is // the size of the total width of this filter. const int32_t boxSize = aLeftLobe + aRightLobe + 1; // Instead of dividing the pixel sum by boxSize to average, we can just // compute a scale that will normalize the result so that it can be quickly // shifted into the desired range. const uint32_t reciprocal = (1 << 24) / boxSize; // The shift would normally truncate the result, whereas we would rather // prefer to round the result to the closest increment. By adding 0.5 units // to the initial sum, we bias the sum so that it will be rounded by the // truncation instead. uint32_t alphaSum = (boxSize + 1) / 2; // We process the row with a moving filter, keeping a sum (alphaSum) of // boxSize pixels. As we move over a pixel, we need to add on a pixel // from the right extreme of the window that moved into range, and subtract // off a pixel from the left extreme of window that moved out of range. // But first, we need to initialization alphaSum to the contents of // the window before we can get going. If the window moves out of bounds // of the row, we clamp each sample to be the closest pixel from within // row bounds, so the 0th and aWidth-1th pixel. int32_t initLeft = aStart - aLeftLobe; if (initLeft < 0) { // If the left lobe samples before the row, add in clamped samples. alphaSum += -initLeft * aInput[0]; initLeft = 0; } int32_t initRight = aStart + boxSize - aLeftLobe; if (initRight > aWidth) { // If the right lobe samples after the row, add in clamped samples. alphaSum += (initRight - aWidth) * aInput[(aWidth - 1) * inputStep]; initRight = aWidth; } // Finally, add in all the valid, non-clamped samples to fill up the // rest of the window. const uint8_t* src = &aInput[initLeft * inputStep]; const uint8_t* iterEnd = &aInput[initRight * inputStep]; #define INIT_ITER \ alphaSum += *src; \ src += inputStep; // We unroll the per-pixel loop here substantially. The amount of work // done per sample is so small that the cost of a loop condition check // and a branch can substantially add to or even dominate the performance // of the loop. while (src + 16 * inputStep <= iterEnd) { INIT_ITER; INIT_ITER; INIT_ITER; INIT_ITER; INIT_ITER; INIT_ITER; INIT_ITER; INIT_ITER; INIT_ITER; INIT_ITER; INIT_ITER; INIT_ITER; INIT_ITER; INIT_ITER; INIT_ITER; INIT_ITER; } while (src < iterEnd) { INIT_ITER; } // Now we start moving the window over the row. We will be accessing // pixels form aStart - aLeftLobe up to aEnd + aRightLobe, which may be // out of bounds of the row. To avoid having to check within the inner // loops if we are in bound, we instead compute the points at which // we will move out of bounds of the row on the left side (splitLeft) // and right side (splitRight). int32_t splitLeft = std::min(std::max(aLeftLobe, aStart), aEnd); int32_t splitRight = std::min(std::max(aWidth - (boxSize - aLeftLobe), aStart), aEnd); // If the filter window is actually large than the size of the row, // there will be a middle area of overlap where the leftmost and rightmost // pixel of the filter will both be outside the row. In this case, we need // to invert the splits so that splitLeft <= splitRight. if (boxSize > aWidth) { std::swap(splitLeft, splitRight); } // Process all pixels up to splitLeft that would sample before the start of // the row. Note that because inputStep and outputStep may not be a const 1 // value, it is more performant to increment pointers here for the source and // destination rather than use a loop counter, since doing so would entail an // expensive multiplication that significantly slows down the loop. uint8_t* dst = &aOutput[aStart * outputStep]; iterEnd = &aOutput[splitLeft * outputStep]; src = &aInput[(aStart + boxSize - aLeftLobe) * inputStep]; uint8_t firstVal = aInput[0]; #define LEFT_ITER \ *dst = (alphaSum * reciprocal) >> 24; \ alphaSum += *src - firstVal; \ dst += outputStep; \ src += inputStep; while (dst + 16 * outputStep <= iterEnd) { LEFT_ITER; LEFT_ITER; LEFT_ITER; LEFT_ITER; LEFT_ITER; LEFT_ITER; LEFT_ITER; LEFT_ITER; LEFT_ITER; LEFT_ITER; LEFT_ITER; LEFT_ITER; LEFT_ITER; LEFT_ITER; LEFT_ITER; LEFT_ITER; } while (dst < iterEnd) { LEFT_ITER; } // Process all pixels between splitLeft and splitRight. iterEnd = &aOutput[splitRight * outputStep]; if (boxSize <= aWidth) { // The filter window is smaller than the row size, so the leftmost and // rightmost samples are both within row bounds. src = &aInput[(splitLeft - aLeftLobe) * inputStep]; int32_t boxStep = boxSize * inputStep; #define CENTER_ITER \ *dst = (alphaSum * reciprocal) >> 24; \ alphaSum += src[boxStep] - *src; \ dst += outputStep; \ src += inputStep; while (dst + 16 * outputStep <= iterEnd) { CENTER_ITER; CENTER_ITER; CENTER_ITER; CENTER_ITER; CENTER_ITER; CENTER_ITER; CENTER_ITER; CENTER_ITER; CENTER_ITER; CENTER_ITER; CENTER_ITER; CENTER_ITER; CENTER_ITER; CENTER_ITER; CENTER_ITER; CENTER_ITER; } while (dst < iterEnd) { CENTER_ITER; } } else { // The filter window is larger than the row size, and we're in the area of // split overlap. So the leftmost and rightmost samples are both out of // bounds and need to be clamped. We can just precompute the difference here // consequently. int32_t firstLastDiff = aInput[(aWidth - 1) * inputStep] - aInput[0]; while (dst < iterEnd) { *dst = (alphaSum * reciprocal) >> 24; alphaSum += firstLastDiff; dst += outputStep; } } // Process all remaining pixels after splitRight that would sample after the // row end. iterEnd = &aOutput[aEnd * outputStep]; src = &aInput[(splitRight - aLeftLobe) * inputStep]; uint8_t lastVal = aInput[(aWidth - 1) * inputStep]; #define RIGHT_ITER \ *dst = (alphaSum * reciprocal) >> 24; \ alphaSum += lastVal - *src; \ dst += outputStep; \ src += inputStep; while (dst + 16 * outputStep <= iterEnd) { RIGHT_ITER; RIGHT_ITER; RIGHT_ITER; RIGHT_ITER; RIGHT_ITER; RIGHT_ITER; RIGHT_ITER; RIGHT_ITER; RIGHT_ITER; RIGHT_ITER; RIGHT_ITER; RIGHT_ITER; RIGHT_ITER; RIGHT_ITER; RIGHT_ITER; RIGHT_ITER; } while (dst < iterEnd) { RIGHT_ITER; } } /** * Box blur involves looking at one pixel, and setting its value to the average * of its neighbouring pixels. This is meant to provide a 3-pass approximation * of a Gaussian blur. * @param aTranspose Whether to transpose the buffer when reading and writing * to it. * @param aData The buffer to be blurred. * @param aLobes The number of pixels to blend on the left and right for each of * 3 passes. * @param aWidth The number of columns in the buffers. * @param aRows The number of rows in the buffers. * @param aStride The stride of the buffer. */ template static void BoxBlur(uint8_t* aData, const int32_t aLobes[3][2], int32_t aWidth, int32_t aRows, int32_t aStride, IntRect aSkipRect) { if (aTranspose) { std::swap(aWidth, aRows); aSkipRect.Swap(); } MOZ_ASSERT(aWidth > 0); // All three passes of the box blur that approximate the Gaussian are done // on each row in turn, so we only need two temporary row buffers to process // each row, instead of a full-sized buffer. Data moves from the source to the // first temporary, from the first temporary to the second, then from the // second back to the destination. This way is more cache-friendly than // processing whe whole buffer in each pass and thus yields a nice speedup. uint8_t* tmpRow = new (std::nothrow) uint8_t[2 * aWidth]; if (!tmpRow) { return; } uint8_t* tmpRow2 = tmpRow + aWidth; const int32_t stride = aTranspose ? 1 : aStride; bool skipRectCoversWholeRow = 0 >= aSkipRect.X() && aWidth <= aSkipRect.XMost(); for (int32_t y = 0; y < aRows; y++) { // Check whether the skip rect intersects this row. If the skip // rect covers the whole surface in this row, we can avoid // this row entirely (and any others along the skip rect). bool inSkipRectY = aSkipRect.ContainsY(y); if (inSkipRectY && skipRectCoversWholeRow) { aData += stride * (aSkipRect.YMost() - y); y = aSkipRect.YMost() - 1; continue; } // Read in data from the source transposed if necessary. BoxBlurRow(aData, tmpRow, aLobes[0][0], aLobes[0][1], aWidth, aStride, 0, aWidth); // For the middle pass, the data is already pre-transposed and does not need // to be post-transposed yet. BoxBlurRow(tmpRow, tmpRow2, aLobes[1][0], aLobes[1][1], aWidth, aStride, 0, aWidth); // Write back data to the destination transposed if necessary too. // Make sure not to overwrite the skip rect by only outputting to the // destination before and after the skip rect, if requested. int32_t skipStart = inSkipRectY ? std::min(std::max(aSkipRect.X(), 0), aWidth) : aWidth; int32_t skipEnd = std::max(skipStart, aSkipRect.XMost()); if (skipStart > 0) { BoxBlurRow(tmpRow2, aData, aLobes[2][0], aLobes[2][1], aWidth, aStride, 0, skipStart); } if (skipEnd < aWidth) { BoxBlurRow(tmpRow2, aData, aLobes[2][0], aLobes[2][1], aWidth, aStride, skipEnd, aWidth); } aData += stride; } delete[] tmpRow; } static void ComputeLobes(int32_t aRadius, int32_t aLobes[3][2]) { int32_t major, minor, final; /* See http://www.w3.org/TR/SVG/filters.html#feGaussianBlur for * some notes about approximating the Gaussian blur with box-blurs. * The comments below are in the terminology of that page. */ int32_t z = aRadius / 3; switch (aRadius % 3) { case 0: // aRadius = z*3; choose d = 2*z + 1 major = minor = final = z; break; case 1: // aRadius = z*3 + 1 // This is a tricky case since there is no value of d which will // yield a radius of exactly aRadius. If d is odd, i.e. d=2*k + 1 // for some integer k, then the radius will be 3*k. If d is even, // i.e. d=2*k, then the radius will be 3*k - 1. // So we have to choose values that don't match the standard // algorithm. major = z + 1; minor = final = z; break; case 2: // aRadius = z*3 + 2; choose d = 2*z + 2 major = final = z + 1; minor = z; break; default: // Mathematical impossibility! MOZ_ASSERT(false); major = minor = final = 0; } MOZ_ASSERT(major + minor + final == aRadius); aLobes[0][0] = major; aLobes[0][1] = minor; aLobes[1][0] = minor; aLobes[1][1] = major; aLobes[2][0] = final; aLobes[2][1] = final; } static void SpreadHorizontal(uint8_t* aInput, uint8_t* aOutput, int32_t aRadius, int32_t aWidth, int32_t aRows, int32_t aStride, const IntRect& aSkipRect) { if (aRadius == 0) { memcpy(aOutput, aInput, aStride * aRows); return; } bool skipRectCoversWholeRow = 0 >= aSkipRect.X() && aWidth <= aSkipRect.XMost(); for (int32_t y = 0; y < aRows; y++) { // Check whether the skip rect intersects this row. If the skip // rect covers the whole surface in this row, we can avoid // this row entirely (and any others along the skip rect). bool inSkipRectY = aSkipRect.ContainsY(y); if (inSkipRectY && skipRectCoversWholeRow) { y = aSkipRect.YMost() - 1; continue; } for (int32_t x = 0; x < aWidth; x++) { // Check whether we are within the skip rect. If so, go // to the next point outside the skip rect. if (inSkipRectY && aSkipRect.ContainsX(x)) { x = aSkipRect.XMost(); if (x >= aWidth) break; } int32_t sMin = std::max(x - aRadius, 0); int32_t sMax = std::min(x + aRadius, aWidth - 1); int32_t v = 0; for (int32_t s = sMin; s <= sMax; ++s) { v = std::max(v, aInput[aStride * y + s]); } aOutput[aStride * y + x] = v; } } } static void SpreadVertical(uint8_t* aInput, uint8_t* aOutput, int32_t aRadius, int32_t aWidth, int32_t aRows, int32_t aStride, const IntRect& aSkipRect) { if (aRadius == 0) { memcpy(aOutput, aInput, aStride * aRows); return; } bool skipRectCoversWholeColumn = 0 >= aSkipRect.Y() && aRows <= aSkipRect.YMost(); for (int32_t x = 0; x < aWidth; x++) { bool inSkipRectX = aSkipRect.ContainsX(x); if (inSkipRectX && skipRectCoversWholeColumn) { x = aSkipRect.XMost() - 1; continue; } for (int32_t y = 0; y < aRows; y++) { // Check whether we are within the skip rect. If so, go // to the next point outside the skip rect. if (inSkipRectX && aSkipRect.ContainsY(y)) { y = aSkipRect.YMost(); if (y >= aRows) break; } int32_t sMin = std::max(y - aRadius, 0); int32_t sMax = std::min(y + aRadius, aRows - 1); int32_t v = 0; for (int32_t s = sMin; s <= sMax; ++s) { v = std::max(v, aInput[aStride * s + x]); } aOutput[aStride * y + x] = v; } } } CheckedInt AlphaBoxBlur::RoundUpToMultipleOf4(int32_t aVal) { CheckedInt val(aVal); val += 3; val /= 4; val *= 4; return val; } AlphaBoxBlur::AlphaBoxBlur(const Rect& aRect, const IntSize& aSpreadRadius, const IntSize& aBlurRadius, const Rect* aDirtyRect, const Rect* aSkipRect) : mStride(0), mSurfaceAllocationSize(0) { Init(aRect, aSpreadRadius, aBlurRadius, aDirtyRect, aSkipRect); } AlphaBoxBlur::AlphaBoxBlur() : mStride(0), mSurfaceAllocationSize(0), mHasDirtyRect(false) {} void AlphaBoxBlur::Init(const Rect& aRect, const IntSize& aSpreadRadius, const IntSize& aBlurRadius, const Rect* aDirtyRect, const Rect* aSkipRect) { mSpreadRadius = aSpreadRadius; mBlurRadius = aBlurRadius; Rect rect(aRect); rect.Inflate(Size(aBlurRadius + aSpreadRadius)); rect.RoundOut(); if (aDirtyRect) { // If we get passed a dirty rect from layout, we can minimize the // shadow size and make painting faster. mHasDirtyRect = true; mDirtyRect = *aDirtyRect; Rect requiredBlurArea = mDirtyRect.Intersect(rect); requiredBlurArea.Inflate(Size(aBlurRadius + aSpreadRadius)); rect = requiredBlurArea.Intersect(rect); } else { mHasDirtyRect = false; } mRect = TruncatedToInt(rect); if (mRect.IsEmpty()) { return; } if (aSkipRect) { // If we get passed a skip rect, we can lower the amount of // blurring/spreading we need to do. We convert it to IntRect to avoid // expensive int<->float conversions if we were to use Rect instead. Rect skipRect = *aSkipRect; skipRect.Deflate(Size(aBlurRadius + aSpreadRadius)); mSkipRect = RoundedIn(skipRect); mSkipRect = mSkipRect.Intersect(mRect); if (mSkipRect.IsEqualInterior(mRect)) { return; } mSkipRect -= mRect.TopLeft(); // Ensure the skip rect is 4-pixel-aligned in the x axis, so that all our // accesses later are aligned as well, see bug 1622113. mSkipRect.SetLeftEdge(RoundUpToMultiple(mSkipRect.X(), 4)); mSkipRect.SetRightEdge(RoundDownToMultiple(mSkipRect.XMost(), 4)); if (mSkipRect.IsEmpty()) { mSkipRect = IntRect(); } } else { mSkipRect = IntRect(); } CheckedInt stride = RoundUpToMultipleOf4(mRect.Width()); if (stride.isValid()) { mStride = stride.value(); // We need to leave room for an additional 3 bytes for a potential overrun // in our blurring code. size_t size = BufferSizeFromStrideAndHeight(mStride, mRect.Height(), 3); if (size != 0) { mSurfaceAllocationSize = size; } } } AlphaBoxBlur::AlphaBoxBlur(const Rect& aRect, int32_t aStride, float aSigmaX, float aSigmaY) : mRect(TruncatedToInt(aRect)), mSpreadRadius(), mBlurRadius(CalculateBlurRadius(Point(aSigmaX, aSigmaY))), mStride(aStride), mSurfaceAllocationSize(0), mHasDirtyRect(false) { IntRect intRect; if (aRect.ToIntRect(&intRect)) { size_t minDataSize = BufferSizeFromStrideAndHeight(intRect.Width(), intRect.Height()); if (minDataSize != 0) { mSurfaceAllocationSize = minDataSize; } } } AlphaBoxBlur::~AlphaBoxBlur() = default; IntSize AlphaBoxBlur::GetSize() const { IntSize size(mRect.Width(), mRect.Height()); return size; } int32_t AlphaBoxBlur::GetStride() const { return mStride; } IntRect AlphaBoxBlur::GetRect() const { return mRect; } Rect* AlphaBoxBlur::GetDirtyRect() { if (mHasDirtyRect) { return &mDirtyRect; } return nullptr; } size_t AlphaBoxBlur::GetSurfaceAllocationSize() const { return mSurfaceAllocationSize; } void AlphaBoxBlur::Blur(uint8_t* aData) const { if (!aData) { return; } // no need to do all this if not blurring or spreading if (mBlurRadius != IntSize(0, 0) || mSpreadRadius != IntSize(0, 0)) { int32_t stride = GetStride(); IntSize size = GetSize(); if (mSpreadRadius.width > 0 || mSpreadRadius.height > 0) { // No need to use CheckedInt here - we have validated it in the // constructor. size_t szB = stride * size.height; uint8_t* tmpData = new (std::nothrow) uint8_t[szB]; if (!tmpData) { return; } memset(tmpData, 0, szB); SpreadHorizontal(aData, tmpData, mSpreadRadius.width, size.width, size.height, stride, mSkipRect); SpreadVertical(tmpData, aData, mSpreadRadius.height, size.width, size.height, stride, mSkipRect); delete[] tmpData; } int32_t horizontalLobes[3][2]; ComputeLobes(mBlurRadius.width, horizontalLobes); int32_t verticalLobes[3][2]; ComputeLobes(mBlurRadius.height, verticalLobes); // We want to allow for some extra space on the left for alignment reasons. int32_t maxLeftLobe = RoundUpToMultipleOf4(horizontalLobes[0][0] + 1).value(); IntSize integralImageSize( size.width + maxLeftLobe + horizontalLobes[1][1], size.height + verticalLobes[0][0] + verticalLobes[1][1] + 1); if ((integralImageSize.width * integralImageSize.height) > (1 << 24)) { // Fallback to old blurring code when the surface is so large it may // overflow our integral image! if (mBlurRadius.width > 0) { BoxBlur(aData, horizontalLobes, size.width, size.height, stride, mSkipRect); } if (mBlurRadius.height > 0) { BoxBlur(aData, verticalLobes, size.width, size.height, stride, mSkipRect); } } else { size_t integralImageStride = GetAlignedStride<16>(integralImageSize.width, 4); if (integralImageStride == 0) { return; } // We need to leave room for an additional 12 bytes for a maximum overrun // of 3 pixels in the blurring code. size_t bufLen = BufferSizeFromStrideAndHeight( integralImageStride, integralImageSize.height, 12); if (bufLen == 0) { return; } // bufLen is a byte count, but here we want a multiple of 32-bit ints, so // we divide by 4. AlignedArray integralImage((bufLen / 4) + ((bufLen % 4) ? 1 : 0)); if (!integralImage) { return; } #ifdef USE_SSE2 if (Factory::HasSSE2()) { BoxBlur_SSE2(aData, horizontalLobes[0][0], horizontalLobes[0][1], verticalLobes[0][0], verticalLobes[0][1], integralImage, integralImageStride); BoxBlur_SSE2(aData, horizontalLobes[1][0], horizontalLobes[1][1], verticalLobes[1][0], verticalLobes[1][1], integralImage, integralImageStride); BoxBlur_SSE2(aData, horizontalLobes[2][0], horizontalLobes[2][1], verticalLobes[2][0], verticalLobes[2][1], integralImage, integralImageStride); } else #endif #ifdef USE_NEON if (mozilla::supports_neon()) { BoxBlur_NEON(aData, horizontalLobes[0][0], horizontalLobes[0][1], verticalLobes[0][0], verticalLobes[0][1], integralImage, integralImageStride); BoxBlur_NEON(aData, horizontalLobes[1][0], horizontalLobes[1][1], verticalLobes[1][0], verticalLobes[1][1], integralImage, integralImageStride); BoxBlur_NEON(aData, horizontalLobes[2][0], horizontalLobes[2][1], verticalLobes[2][0], verticalLobes[2][1], integralImage, integralImageStride); } else #endif { #ifdef _MIPS_ARCH_LOONGSON3A BoxBlur_LS3(aData, horizontalLobes[0][0], horizontalLobes[0][1], verticalLobes[0][0], verticalLobes[0][1], integralImage, integralImageStride); BoxBlur_LS3(aData, horizontalLobes[1][0], horizontalLobes[1][1], verticalLobes[1][0], verticalLobes[1][1], integralImage, integralImageStride); BoxBlur_LS3(aData, horizontalLobes[2][0], horizontalLobes[2][1], verticalLobes[2][0], verticalLobes[2][1], integralImage, integralImageStride); #else BoxBlur_C(aData, horizontalLobes[0][0], horizontalLobes[0][1], verticalLobes[0][0], verticalLobes[0][1], integralImage, integralImageStride); BoxBlur_C(aData, horizontalLobes[1][0], horizontalLobes[1][1], verticalLobes[1][0], verticalLobes[1][1], integralImage, integralImageStride); BoxBlur_C(aData, horizontalLobes[2][0], horizontalLobes[2][1], verticalLobes[2][0], verticalLobes[2][1], integralImage, integralImageStride); #endif } } } } MOZ_ALWAYS_INLINE void GenerateIntegralRow(uint32_t* aDest, const uint8_t* aSource, uint32_t* aPreviousRow, const uint32_t& aSourceWidth, const uint32_t& aLeftInflation, const uint32_t& aRightInflation) { uint32_t currentRowSum = 0; uint32_t pixel = aSource[0]; for (uint32_t x = 0; x < aLeftInflation; x++) { currentRowSum += pixel; *aDest++ = currentRowSum + *aPreviousRow++; } for (uint32_t x = aLeftInflation; x < (aSourceWidth + aLeftInflation); x += 4) { uint32_t alphaValues = *(uint32_t*)(aSource + (x - aLeftInflation)); #if defined WORDS_BIGENDIAN || defined IS_BIG_ENDIAN || defined __BIG_ENDIAN__ currentRowSum += (alphaValues >> 24) & 0xff; *aDest++ = *aPreviousRow++ + currentRowSum; currentRowSum += (alphaValues >> 16) & 0xff; *aDest++ = *aPreviousRow++ + currentRowSum; currentRowSum += (alphaValues >> 8) & 0xff; *aDest++ = *aPreviousRow++ + currentRowSum; currentRowSum += alphaValues & 0xff; *aDest++ = *aPreviousRow++ + currentRowSum; #else currentRowSum += alphaValues & 0xff; *aDest++ = *aPreviousRow++ + currentRowSum; alphaValues >>= 8; currentRowSum += alphaValues & 0xff; *aDest++ = *aPreviousRow++ + currentRowSum; alphaValues >>= 8; currentRowSum += alphaValues & 0xff; *aDest++ = *aPreviousRow++ + currentRowSum; alphaValues >>= 8; currentRowSum += alphaValues & 0xff; *aDest++ = *aPreviousRow++ + currentRowSum; #endif } pixel = aSource[aSourceWidth - 1]; for (uint32_t x = (aSourceWidth + aLeftInflation); x < (aSourceWidth + aLeftInflation + aRightInflation); x++) { currentRowSum += pixel; *aDest++ = currentRowSum + *aPreviousRow++; } } MOZ_ALWAYS_INLINE void GenerateIntegralImage_C( int32_t aLeftInflation, int32_t aRightInflation, int32_t aTopInflation, int32_t aBottomInflation, uint32_t* aIntegralImage, size_t aIntegralImageStride, uint8_t* aSource, int32_t aSourceStride, const IntSize& aSize) { uint32_t stride32bit = aIntegralImageStride / 4; IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation, aSize.height + aTopInflation + aBottomInflation); memset(aIntegralImage, 0, aIntegralImageStride); GenerateIntegralRow(aIntegralImage, aSource, aIntegralImage, aSize.width, aLeftInflation, aRightInflation); for (int y = 1; y < aTopInflation + 1; y++) { GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource, aIntegralImage + (y - 1) * stride32bit, aSize.width, aLeftInflation, aRightInflation); } for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) { GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource + aSourceStride * (y - aTopInflation), aIntegralImage + (y - 1) * stride32bit, aSize.width, aLeftInflation, aRightInflation); } if (aBottomInflation) { for (int y = (aSize.height + aTopInflation); y < integralImageSize.height; y++) { GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource + ((aSize.height - 1) * aSourceStride), aIntegralImage + (y - 1) * stride32bit, aSize.width, aLeftInflation, aRightInflation); } } } /** * Attempt to do an in-place box blur using an integral image. */ void AlphaBoxBlur::BoxBlur_C(uint8_t* aData, int32_t aLeftLobe, int32_t aRightLobe, int32_t aTopLobe, int32_t aBottomLobe, uint32_t* aIntegralImage, size_t aIntegralImageStride) const { IntSize size = GetSize(); MOZ_ASSERT(size.width > 0); // Our 'left' or 'top' lobe will include the current pixel. i.e. when // looking at an integral image the value of a pixel at 'x,y' is calculated // using the value of the integral image values above/below that. aLeftLobe++; aTopLobe++; int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe); MOZ_ASSERT(boxSize > 0); if (boxSize == 1) { return; } int32_t stride32bit = aIntegralImageStride / 4; int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value(); GenerateIntegralImage_C(leftInflation, aRightLobe, aTopLobe, aBottomLobe, aIntegralImage, aIntegralImageStride, aData, mStride, size); uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize); uint32_t* innerIntegral = aIntegralImage + (aTopLobe * stride32bit) + leftInflation; // Storing these locally makes this about 30% faster! Presumably the compiler // can't be sure we're not altering the member variables in this loop. IntRect skipRect = mSkipRect; uint8_t* data = aData; int32_t stride = mStride; for (int32_t y = 0; y < size.height; y++) { // Not using ContainsY(y) because we do not skip y == skipRect.Y() // although that may not be done on purpose bool inSkipRectY = y > skipRect.Y() && y < skipRect.YMost(); uint32_t* topLeftBase = innerIntegral + ((y - aTopLobe) * stride32bit - aLeftLobe); uint32_t* topRightBase = innerIntegral + ((y - aTopLobe) * stride32bit + aRightLobe); uint32_t* bottomRightBase = innerIntegral + ((y + aBottomLobe) * stride32bit + aRightLobe); uint32_t* bottomLeftBase = innerIntegral + ((y + aBottomLobe) * stride32bit - aLeftLobe); for (int32_t x = 0; x < size.width; x++) { // Not using ContainsX(x) because we do not skip x == skipRect.X() // although that may not be done on purpose if (inSkipRectY && x > skipRect.X() && x < skipRect.XMost()) { x = skipRect.XMost() - 1; // Trigger early jump on coming loop iterations, this will be reset // next line anyway. inSkipRectY = false; continue; } int32_t topLeft = topLeftBase[x]; int32_t topRight = topRightBase[x]; int32_t bottomRight = bottomRightBase[x]; int32_t bottomLeft = bottomLeftBase[x]; uint32_t value = bottomRight - topRight - bottomLeft; value += topLeft; data[stride * y + x] = (uint64_t(reciprocal) * value + (uint64_t(1) << 31)) >> 32; } } } /** * Compute the box blur size (which we're calling the blur radius) from * the standard deviation. * * Much of this, the 3 * sqrt(2 * pi) / 4, is the known value for * approximating a Gaussian using box blurs. This yields quite a good * approximation for a Gaussian. Then we multiply this by 1.5 since our * code wants the radius of the entire triple-box-blur kernel instead of * the diameter of an individual box blur. For more details, see: * http://www.w3.org/TR/SVG11/filters.html#feGaussianBlurElement * https://bugzilla.mozilla.org/show_bug.cgi?id=590039#c19 */ static const Float GAUSSIAN_SCALE_FACTOR = Float((3 * sqrt(2 * M_PI) / 4) * 1.5); IntSize AlphaBoxBlur::CalculateBlurRadius(const Point& aStd) { IntSize size( static_cast(floor(aStd.x * GAUSSIAN_SCALE_FACTOR + 0.5f)), static_cast(floor(aStd.y * GAUSSIAN_SCALE_FACTOR + 0.5f))); return size; } Float AlphaBoxBlur::CalculateBlurSigma(int32_t aBlurRadius) { return aBlurRadius / GAUSSIAN_SCALE_FACTOR; } } // namespace gfx } // namespace mozilla