diff options
Diffstat (limited to 'gfx/ycbcr/yuv_row_win.cpp')
-rw-r--r-- | gfx/ycbcr/yuv_row_win.cpp | 506 |
1 files changed, 506 insertions, 0 deletions
diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp
new file mode 100644
index 0000000000..c496b2d935
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_win.cpp
@@ -0,0 +1,506 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "yuv_row.h"
+#include "mozilla/SSE.h"
+
+// Aliases for the U and V lookup tables. Inside the inline-assembly address
+// expressions below they expand to byte offsets from kCoefficientsRgbY.
+// Every table access in this file indexes with a zero-extended byte scaled
+// by 8, so each table spans 256 * 8 = 2048 bytes: Y first, then U, then V.
+// NOTE(review): the macro bodies are unparenthesized; that is fine for the
+// address expressions used here, but they are not safe in general C++
+// expressions.
+#define kCoefficientsRgbU kCoefficientsRgbY + 2048
+#define kCoefficientsRgbV kCoefficientsRgbY + 4096
+
+extern "C" {
+
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+#if defined(__clang__)
+// clang-cl has a bug where it doesn't mangle names in inline asm
+// so let's do the mangling in the preprocessor (ugh)
+// (but we still need to declare a dummy extern for the parser)
+extern void* _kCoefficientsRgbY;
+#define kCoefficientsRgbY _kCoefficientsRgbY
+#endif
+
+// Converts one row of Y/U/V samples to 32-bit pixels with MMX instructions
+// and the kCoefficientsRgb* lookup tables.
+//
+// Each loop iteration consumes one U byte, one V byte and two Y bytes and
+// stores two pixels (8 bytes). A trailing odd pixel is converted on its own
+// and stored as 4 bytes.
+//
+//   y_buf, u_buf, v_buf  source sample pointers
+//   rgb_buf              destination pointer, 4 bytes written per pixel
+//   width                number of pixels to produce
+//
+// The function is naked: it reads its arguments straight from the stack and
+// returns with its own `ret`.
+// NOTE(review): no `emms` is issued after the MMX work; presumably the
+// caller is responsible for that - confirm.
+__declspec(naked)
+void FastConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
+                                  const uint8_t* u_buf,
+                                  const uint8_t* v_buf,
+                                  uint8_t* rgb_buf,
+                                  int width) {
+  __asm {
+    pushad  // saves the 8 general registers (32 bytes), hence "+ 32" below
+    mov edx, [esp + 32 + 4] // Y
+    mov edi, [esp + 32 + 8] // U
+    mov esi, [esp + 32 + 12] // V
+    mov ebp, [esp + 32 + 16] // rgb
+    mov ecx, [esp + 32 + 20] // width
+    jmp convertend  // test the remaining count before the first iteration
+
+ convertloop :
+    movzx eax, byte ptr [edi]  // U sample
+    add edi, 1
+    movzx ebx, byte ptr [esi]  // V sample
+    add esi, 1
+    movq mm0, [kCoefficientsRgbU + 8 * eax]  // U table entry
+    movzx eax, byte ptr [edx]  // first Y sample
+    paddsw mm0, [kCoefficientsRgbV + 8 * ebx]  // mm0 = U entry + V entry
+    movzx ebx, byte ptr [edx + 1]  // second Y sample
+    movq mm1, [kCoefficientsRgbY + 8 * eax]
+    add edx, 2
+    movq mm2, [kCoefficientsRgbY + 8 * ebx]
+    paddsw mm1, mm0  // both pixels share the same chroma contribution
+    paddsw mm2, mm0
+    psraw mm1, 6  // arithmetic shift of each 16-bit lane by 6
+    psraw mm2, 6
+    packuswb mm1, mm2  // saturate to unsigned bytes: two 4-byte pixels
+    movntq [ebp], mm1  // non-temporal 8-byte store
+    add ebp, 8
+ convertend :
+    sub ecx, 2
+    jns convertloop  // loop while at least two pixels remained
+
+    and ecx, 1 // odd number of pixels? (ecx is -2 or -1 here)
+    jz convertdone
+
+    // Tail: convert the single remaining pixel.
+    movzx eax, byte ptr [edi]
+    movq mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx eax, byte ptr [esi]
+    paddsw mm0, [kCoefficientsRgbV + 8 * eax]
+    movzx eax, byte ptr [edx]
+    movq mm1, [kCoefficientsRgbY + 8 * eax]
+    paddsw mm1, mm0
+    psraw mm1, 6
+    packuswb mm1, mm1
+    movd [ebp], mm1  // store one 4-byte pixel
+ convertdone :
+
+    popad
+    ret
+  }
+}
+
+// Same conversion as FastConvertYUVToRGB32Row_SSE, but the source pointers
+// advance by `step` bytes instead of 1.
+//
+// Per loop iteration the U and V pointers each advance by `step` once and
+// the Y pointer advances by `step` twice (once per output pixel); two pixels
+// (8 bytes) are stored. A trailing odd pixel is stored as 4 bytes.
+//
+//   y_buf, u_buf, v_buf  source sample pointers
+//   rgb_buf              destination pointer, 4 bytes written per pixel
+//   width                number of pixels to produce
+//   step                 byte increment applied to the source pointers
+//
+// NOTE(review): no `emms` is issued here; presumably left to the caller.
+__declspec(naked)
+void ConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              uint8_t* rgb_buf,
+                              int width,
+                              int step) {
+  __asm {
+    pushad  // 32 bytes of saved registers precede the return address
+    mov edx, [esp + 32 + 4] // Y
+    mov edi, [esp + 32 + 8] // U
+    mov esi, [esp + 32 + 12] // V
+    mov ebp, [esp + 32 + 16] // rgb
+    mov ecx, [esp + 32 + 20] // width
+    mov ebx, [esp + 32 + 24] // step
+    jmp wend  // test the remaining count before the first iteration
+
+ wloop :
+    movzx eax, byte ptr [edi]  // U sample
+    add edi, ebx
+    movq mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx eax, byte ptr [esi]  // V sample
+    add esi, ebx
+    paddsw mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = U entry + V entry
+    movzx eax, byte ptr [edx]  // first Y sample
+    add edx, ebx
+    movq mm1, [kCoefficientsRgbY + 8 * eax]
+    movzx eax, byte ptr [edx]  // second Y sample
+    add edx, ebx
+    movq mm2, [kCoefficientsRgbY + 8 * eax]
+    paddsw mm1, mm0
+    paddsw mm2, mm0
+    psraw mm1, 6
+    psraw mm2, 6
+    packuswb mm1, mm2  // saturate to unsigned bytes: two 4-byte pixels
+    movntq [ebp], mm1  // non-temporal 8-byte store
+    add ebp, 8
+ wend :
+    sub ecx, 2
+    jns wloop  // loop while at least two pixels remained
+
+    and ecx, 1 // odd number of pixels? (ecx is -2 or -1 here)
+    jz wdone
+
+    // Tail: convert the single remaining pixel.
+    movzx eax, byte ptr [edi]
+    movq mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx eax, byte ptr [esi]
+    paddsw mm0, [kCoefficientsRgbV + 8 * eax]
+    movzx eax, byte ptr [edx]
+    movq mm1, [kCoefficientsRgbY + 8 * eax]
+    paddsw mm1, mm0
+    psraw mm1, 6
+    packuswb mm1, mm1
+    movd [ebp], mm1  // store one 4-byte pixel
+ wdone :
+
+    popad
+    ret
+  }
+}
+
+// Variant of ConvertYUVToRGB32Row_SSE with separate strides for luma and
+// chroma: the Y pointer advances by `ystep` per output pixel, the U and V
+// pointers advance by `uvstep` per pair of output pixels.
+//
+//   y_buf, u_buf, v_buf  source sample pointers
+//   rgb_buf              destination pointer, 4 bytes written per pixel
+//   width                number of pixels to produce
+//   ystep                byte increment for the Y pointer
+//   uvstep               byte increment for the U and V pointers
+//
+// ebx is shared between the two strides, so each one is reloaded from the
+// stack inside the loop.
+// NOTE(review): no `emms` is issued here; presumably left to the caller.
+__declspec(naked)
+void RotateConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
+                                    const uint8_t* u_buf,
+                                    const uint8_t* v_buf,
+                                    uint8_t* rgb_buf,
+                                    int width,
+                                    int ystep,
+                                    int uvstep) {
+  __asm {
+    pushad  // 32 bytes of saved registers precede the return address
+    mov edx, [esp + 32 + 4] // Y
+    mov edi, [esp + 32 + 8] // U
+    mov esi, [esp + 32 + 12] // V
+    mov ebp, [esp + 32 + 16] // rgb
+    mov ecx, [esp + 32 + 20] // width
+    jmp wend  // test the remaining count before the first iteration
+
+ wloop :
+    movzx eax, byte ptr [edi]  // U sample
+    mov ebx, [esp + 32 + 28] // uvstep
+    add edi, ebx
+    movq mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx eax, byte ptr [esi]  // V sample
+    add esi, ebx
+    paddsw mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = U entry + V entry
+    movzx eax, byte ptr [edx]  // first Y sample
+    mov ebx, [esp + 32 + 24] // ystep
+    add edx, ebx
+    movq mm1, [kCoefficientsRgbY + 8 * eax]
+    movzx eax, byte ptr [edx]  // second Y sample
+    add edx, ebx
+    movq mm2, [kCoefficientsRgbY + 8 * eax]
+    paddsw mm1, mm0
+    paddsw mm2, mm0
+    psraw mm1, 6
+    psraw mm2, 6
+    packuswb mm1, mm2  // saturate to unsigned bytes: two 4-byte pixels
+    movntq [ebp], mm1  // non-temporal 8-byte store
+    add ebp, 8
+ wend :
+    sub ecx, 2
+    jns wloop  // loop while at least two pixels remained
+
+    and ecx, 1 // odd number of pixels? (ecx is -2 or -1 here)
+    jz wdone
+
+    // Tail: convert the single remaining pixel.
+    movzx eax, byte ptr [edi]
+    movq mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx eax, byte ptr [esi]
+    paddsw mm0, [kCoefficientsRgbV + 8 * eax]
+    movzx eax, byte ptr [edx]
+    movq mm1, [kCoefficientsRgbY + 8 * eax]
+    paddsw mm1, mm0
+    psraw mm1, 6
+    packuswb mm1, mm1
+    movd [ebp], mm1  // store one 4-byte pixel
+ wdone :
+
+    popad
+    ret
+  }
+}
+
+// Converts a row while doubling it horizontally: every converted source
+// pixel is written twice.
+//
+// Each loop iteration consumes one U byte, one V byte and two Y bytes and
+// stores four output pixels (16 bytes), so `width` is counted in output
+// pixels. If 1 to 3 output pixels remain after the loop, one more pixel is
+// converted and stored that many times.
+//
+//   y_buf, u_buf, v_buf  source sample pointers
+//   rgb_buf              destination pointer, 4 bytes written per pixel
+//   width                number of output pixels to produce
+//
+// NOTE(review): no `emms` is issued here; presumably left to the caller.
+__declspec(naked)
+void DoubleYUVToRGB32Row_SSE(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* rgb_buf,
+                             int width) {
+  __asm {
+    pushad  // 32 bytes of saved registers precede the return address
+    mov edx, [esp + 32 + 4] // Y
+    mov edi, [esp + 32 + 8] // U
+    mov esi, [esp + 32 + 12] // V
+    mov ebp, [esp + 32 + 16] // rgb
+    mov ecx, [esp + 32 + 20] // width
+    jmp wend  // test the remaining count before the first iteration
+
+ wloop :
+    movzx eax, byte ptr [edi]  // U sample
+    add edi, 1
+    movzx ebx, byte ptr [esi]  // V sample
+    add esi, 1
+    movq mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx eax, byte ptr [edx]  // first Y sample
+    paddsw mm0, [kCoefficientsRgbV + 8 * ebx]  // mm0 = U entry + V entry
+    movq mm1, [kCoefficientsRgbY + 8 * eax]
+    paddsw mm1, mm0
+    psraw mm1, 6
+    packuswb mm1, mm1
+    punpckldq mm1, mm1  // duplicate the low dword: same pixel twice
+    movntq [ebp], mm1  // output pixels 0 and 1
+
+    movzx ebx, byte ptr [edx + 1]  // second Y sample
+    add edx, 2
+    paddsw mm0, [kCoefficientsRgbY + 8 * ebx]  // reuses mm0 as accumulator
+    psraw mm0, 6
+    packuswb mm0, mm0
+    punpckldq mm0, mm0  // duplicate the low dword: same pixel twice
+    movntq [ebp+8], mm0  // output pixels 2 and 3
+    add ebp, 16
+ wend :
+    sub ecx, 4
+    jns wloop  // loop while at least four output pixels remained
+
+    add ecx, 4  // ecx = leftover output pixels (0..3)
+    jz wdone
+
+    // Tail: convert one more pixel and replicate it for the leftovers.
+    movzx eax, byte ptr [edi]
+    movq mm0, [kCoefficientsRgbU + 8 * eax]
+    movzx eax, byte ptr [esi]
+    paddsw mm0, [kCoefficientsRgbV + 8 * eax]
+    movzx eax, byte ptr [edx]
+    movq mm1, [kCoefficientsRgbY + 8 * eax]
+    paddsw mm1, mm0
+    psraw mm1, 6
+    packuswb mm1, mm1
+    jmp wend1
+
+ wloop1 :
+    movd [ebp], mm1  // store one 4-byte pixel
+    add ebp, 4
+ wend1 :
+    sub ecx, 1
+    jns wloop1  // runs once per leftover output pixel
+ wdone :
+    popad
+    ret
+  }
+}
+
+// This version does general purpose scaling by any amount, up or down.
+// The only thing it cannot do is rotation by 90 or 270.
+// For performance the chroma is under-sampled, reducing cost of a 3x
+// 1080p scale from 8.4 ms to 5.4 ms.
+//
+// The source position x starts at 0 and grows by source_dx per output pixel.
+// The Y sample is read at index x >> 16 and the U/V samples at index
+// x >> 17, so x is used as a fixed-point value with 16 fractional bits.
+// Chroma is fetched once per pair of output pixels.
+//
+//   y_buf, u_buf, v_buf  source sample pointers
+//   rgb_buf              destination pointer, 4 bytes written per pixel
+//   width                number of output pixels to produce
+//   source_dx            per-pixel increment of the source position
+//
+// NOTE(review): no `emms` is issued here; presumably left to the caller.
+__declspec(naked)
+void ScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* rgb_buf,
+                            int width,
+                            int source_dx) {
+  __asm {
+    pushad  // 32 bytes of saved registers precede the return address
+    mov edx, [esp + 32 + 4] // Y
+    mov edi, [esp + 32 + 8] // U
+    mov esi, [esp + 32 + 12] // V
+    mov ebp, [esp + 32 + 16] // rgb
+    mov ecx, [esp + 32 + 20] // width
+    xor ebx, ebx // x
+    jmp scaleend  // test the remaining count before the first iteration
+
+ scaleloop :
+    mov eax, ebx
+    sar eax, 17  // chroma index = x >> 17
+    movzx eax, byte ptr [edi + eax]
+    movq mm0, [kCoefficientsRgbU + 8 * eax]
+    mov eax, ebx
+    sar eax, 17
+    movzx eax, byte ptr [esi + eax]
+    paddsw mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = U entry + V entry
+    mov eax, ebx
+    add ebx, [esp + 32 + 24] // x += source_dx
+    sar eax, 16  // luma index = x >> 16 (x before the increment)
+    movzx eax, byte ptr [edx + eax]
+    movq mm1, [kCoefficientsRgbY + 8 * eax]
+    mov eax, ebx
+    add ebx, [esp + 32 + 24] // x += source_dx
+    sar eax, 16
+    movzx eax, byte ptr [edx + eax]
+    movq mm2, [kCoefficientsRgbY + 8 * eax]
+    paddsw mm1, mm0
+    paddsw mm2, mm0
+    psraw mm1, 6
+    psraw mm2, 6
+    packuswb mm1, mm2  // saturate to unsigned bytes: two 4-byte pixels
+    movntq [ebp], mm1  // non-temporal 8-byte store
+    add ebp, 8
+ scaleend :
+    sub ecx, 2
+    jns scaleloop  // loop while at least two pixels remained
+
+    and ecx, 1 // odd number of pixels? (ecx is -2 or -1 here)
+    jz scaledone
+
+    // Tail: convert the single remaining pixel at the current x.
+    mov eax, ebx
+    sar eax, 17
+    movzx eax, byte ptr [edi + eax]
+    movq mm0, [kCoefficientsRgbU + 8 * eax]
+    mov eax, ebx
+    sar eax, 17
+    movzx eax, byte ptr [esi + eax]
+    paddsw mm0, [kCoefficientsRgbV + 8 * eax]
+    mov eax, ebx
+    sar eax, 16
+    movzx eax, byte ptr [edx + eax]
+    movq mm1, [kCoefficientsRgbY + 8 * eax]
+    paddsw mm1, mm0
+    psraw mm1, 6
+    packuswb mm1, mm1
+    movd [ebp], mm1  // store one 4-byte pixel
+
+ scaledone :
+    popad
+    ret
+  }
+}
+
+// Scaling conversion that linearly interpolates between neighbouring source
+// samples instead of picking the nearest one.
+//
+// The source position x advances by source_dx per output pixel and the loop
+// runs while x < width * source_dx. For every sample fetch the fractional
+// bits of x weight the sample at index + 1 and their bitwise complement
+// weights the sample at index:
+//   chroma: index = x >> 17, fraction = x & 0x1fffe, result >> 17
+//   luma:   index = x >> 16, fraction = x & 0xffff,  result >> 16
+// x starts at 0, or at 0x8000 when source_dx >= 0x20000.
+//
+//   y_buf, u_buf, v_buf  source sample pointers
+//   rgb_buf              destination pointer, 4 bytes written per pixel
+//   width                number of output pixels to produce
+//   source_dx            per-pixel increment of the source position
+//
+// The width argument slot on the stack is overwritten with
+// width * source_dx and used as the loop bound.
+// NOTE(review): every fetch also reads the sample at index + 1; confirm the
+// source buffers have a readable byte past the last sample.
+// NOTE(review): no `emms` is issued here; presumably left to the caller.
+__declspec(naked)
+void LinearScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,
+                                  const uint8_t* u_buf,
+                                  const uint8_t* v_buf,
+                                  uint8_t* rgb_buf,
+                                  int width,
+                                  int source_dx) {
+  __asm {
+    pushad  // 32 bytes of saved registers precede the return address
+    mov edx, [esp + 32 + 4] // Y
+    mov edi, [esp + 32 + 8] // U
+    // [esp + 32 + 12] // V (reloaded inside the loop; esi is also scratch)
+    mov ebp, [esp + 32 + 16] // rgb
+    mov ecx, [esp + 32 + 20] // width
+    imul ecx, [esp + 32 + 24] // source_dx
+    mov [esp + 32 + 20], ecx // source_width = width * source_dx
+    mov ecx, [esp + 32 + 24] // source_dx
+    xor ebx, ebx // x = 0
+    cmp ecx, 0x20000
+    jl lscaleend
+    mov ebx, 0x8000 // x = 0.5 for 1/2 or less
+    jmp lscaleend
+lscaleloop:
+    mov eax, ebx
+    sar eax, 0x11  // chroma index = x >> 17
+
+    // Interpolated U sample.
+    movzx ecx, byte ptr [edi + eax]  // sample at index
+    movzx esi, byte ptr [edi + eax + 1]  // sample at index + 1
+    mov eax, ebx
+    and eax, 0x1fffe  // fraction
+    imul esi, eax
+    xor eax, 0x1fffe  // complement of the fraction
+    imul ecx, eax
+    add ecx, esi
+    shr ecx, 17
+    movq mm0, [kCoefficientsRgbU + 8 * ecx]
+
+    // Interpolated V sample.
+    mov esi, [esp + 32 + 12]  // V pointer
+    mov eax, ebx
+    sar eax, 0x11
+
+    movzx ecx, byte ptr [esi + eax]
+    movzx esi, byte ptr [esi + eax + 1]
+    mov eax, ebx
+    and eax, 0x1fffe
+    imul esi, eax
+    xor eax, 0x1fffe
+    imul ecx, eax
+    add ecx, esi
+    shr ecx, 17
+    paddsw mm0, [kCoefficientsRgbV + 8 * ecx]  // mm0 = U entry + V entry
+
+    // Interpolated Y sample for the first pixel of the pair.
+    mov eax, ebx
+    sar eax, 0x10  // luma index = x >> 16
+    movzx ecx, byte ptr [edx + eax]
+    movzx esi, byte ptr [1 + edx + eax]
+    mov eax, ebx
+    add ebx, [esp + 32 + 24]  // x += source_dx
+    and eax, 0xffff  // fraction (x before the increment)
+    imul esi, eax
+    xor eax, 0xffff  // complement of the fraction
+    imul ecx, eax
+    add ecx, esi
+    shr ecx, 16
+    movq mm1, [kCoefficientsRgbY + 8 * ecx]
+
+    cmp ebx, [esp + 32 + 20]  // reached source_width?
+    jge lscalelastpixel  // yes: emit only this one pixel and return
+
+    // Interpolated Y sample for the second pixel of the pair.
+    mov eax, ebx
+    sar eax, 0x10
+    movzx ecx, byte ptr [edx + eax]
+    movzx esi, byte ptr [edx + eax + 1]
+    mov eax, ebx
+    add ebx, [esp + 32 + 24]  // x += source_dx
+    and eax, 0xffff
+    imul esi, eax
+    xor eax, 0xffff
+    imul ecx, eax
+    add ecx, esi
+    shr ecx, 16
+    movq mm2, [kCoefficientsRgbY + 8 * ecx]
+
+    paddsw mm1, mm0
+    paddsw mm2, mm0
+    psraw mm1, 0x6
+    psraw mm2, 0x6
+    packuswb mm1, mm2  // saturate to unsigned bytes: two 4-byte pixels
+    movntq [ebp], mm1  // non-temporal 8-byte store
+    add ebp, 0x8
+
+lscaleend:
+    cmp ebx, [esp + 32 + 20]  // continue while x < source_width
+    jl lscaleloop
+    popad
+    ret
+
+lscalelastpixel:
+    paddsw mm1, mm0
+    psraw mm1, 6
+    packuswb mm1, mm1
+    movd [ebp], mm1  // store one 4-byte pixel
+    popad
+    ret
+  };
+}
+#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+
+// Converts one row of YUV to 32-bit RGB. On 32-bit x86 builds that may
+// support SSE, the MMX/SSE routine is used when mozilla::supports_sse()
+// reports support at run time; otherwise the portable
+// FastConvertYUVToRGB32Row_C is called with a final argument of 1.
+void FastConvertYUVToRGB32Row(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              uint8_t* rgb_buf,
+                              int width) {
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+  if (mozilla::supports_sse()) {
+    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
+    return;
+  }
+#endif
+
+  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
+}
+
+// Converts one row of YUV to 32-bit RGB while scaling by source_dx.
+// Dispatches to ScaleYUVToRGB32Row_SSE when available and supported at run
+// time, otherwise to ScaleYUVToRGB32Row_C with the same arguments.
+void ScaleYUVToRGB32Row(const uint8_t* y_buf,
+                        const uint8_t* u_buf,
+                        const uint8_t* v_buf,
+                        uint8_t* rgb_buf,
+                        int width,
+                        int source_dx) {
+
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+  if (mozilla::supports_sse()) {
+    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+    return;
+  }
+#endif
+
+  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+
+// Converts one row of YUV to 32-bit RGB with linearly interpolated scaling.
+// Dispatches to LinearScaleYUVToRGB32Row_SSE when available and supported at
+// run time, otherwise to LinearScaleYUVToRGB32Row_C with the same arguments.
+void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              uint8_t* rgb_buf,
+                              int width,
+                              int source_dx) {
+#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
+  if (mozilla::supports_sse()) {
+    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
+                                 source_dx);
+    return;
+  }
+#endif
+
+  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+
+} // extern "C" |