summaryrefslogtreecommitdiffstats
path: root/media/libtheora/lib/x86
diff options
context:
space:
mode:
Diffstat (limited to 'media/libtheora/lib/x86')
-rw-r--r--media/libtheora/lib/x86/mmxfrag.c368
-rw-r--r--media/libtheora/lib/x86/mmxidct.c558
-rw-r--r--media/libtheora/lib/x86/mmxloop.h318
-rw-r--r--media/libtheora/lib/x86/mmxstate.c226
-rw-r--r--media/libtheora/lib/x86/sse2idct.c456
-rw-r--r--media/libtheora/lib/x86/sse2trans.h242
-rw-r--r--media/libtheora/lib/x86/x86cpu.c182
-rw-r--r--media/libtheora/lib/x86/x86cpu.h36
-rw-r--r--media/libtheora/lib/x86/x86int.h122
-rw-r--r--media/libtheora/lib/x86/x86state.c97
10 files changed, 2605 insertions, 0 deletions
diff --git a/media/libtheora/lib/x86/mmxfrag.c b/media/libtheora/lib/x86/mmxfrag.c
new file mode 100644
index 0000000000..b3ec508956
--- /dev/null
+++ b/media/libtheora/lib/x86/mmxfrag.c
@@ -0,0 +1,368 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+/*MMX acceleration of fragment reconstruction for motion compensation.
+ Originally written by Rudolf Marek.
+ Additional optimization by Nils Pipenbrinck.
+ Note: Loops are unrolled for best performance.
+ The iteration each instruction belongs to is marked in the comments as #i.*/
+#include <stddef.h>
+#include "x86int.h"
+
+#if defined(OC_X86_ASM)
+
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+ between rows.*/
+# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
+ do{ \
+ const unsigned char *src; \
+ unsigned char *dst; \
+ ptrdiff_t ystride3; \
+ src=(_src); \
+ dst=(_dst); \
+ __asm__ __volatile__( \
+ /*src+0*ystride*/ \
+ "movq (%[src]),%%mm0\n\t" \
+ /*src+1*ystride*/ \
+ "movq (%[src],%[ystride]),%%mm1\n\t" \
+ /*ystride3=ystride*3*/ \
+ "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
+ /*src+2*ystride*/ \
+ "movq (%[src],%[ystride],2),%%mm2\n\t" \
+ /*src+3*ystride*/ \
+ "movq (%[src],%[ystride3]),%%mm3\n\t" \
+ /*dst+0*ystride*/ \
+ "movq %%mm0,(%[dst])\n\t" \
+ /*dst+1*ystride*/ \
+ "movq %%mm1,(%[dst],%[ystride])\n\t" \
+ /*Pointer to next 4.*/ \
+ "lea (%[src],%[ystride],4),%[src]\n\t" \
+ /*dst+2*ystride*/ \
+ "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
+ /*dst+3*ystride*/ \
+ "movq %%mm3,(%[dst],%[ystride3])\n\t" \
+ /*Pointer to next 4.*/ \
+ "lea (%[dst],%[ystride],4),%[dst]\n\t" \
+ /*src+0*ystride*/ \
+ "movq (%[src]),%%mm0\n\t" \
+ /*src+1*ystride*/ \
+ "movq (%[src],%[ystride]),%%mm1\n\t" \
+ /*src+2*ystride*/ \
+ "movq (%[src],%[ystride],2),%%mm2\n\t" \
+ /*src+3*ystride*/ \
+ "movq (%[src],%[ystride3]),%%mm3\n\t" \
+ /*dst+0*ystride*/ \
+ "movq %%mm0,(%[dst])\n\t" \
+ /*dst+1*ystride*/ \
+ "movq %%mm1,(%[dst],%[ystride])\n\t" \
+ /*dst+2*ystride*/ \
+ "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
+ /*dst+3*ystride*/ \
+ "movq %%mm3,(%[dst],%[ystride3])\n\t" \
+ :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
+ :[ystride]"r"((ptrdiff_t)(_ystride)) \
+ :"memory" \
+ ); \
+ } \
+ while(0)
+
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+ between rows.*/
+void oc_frag_copy_mmx(unsigned char *_dst,
+ const unsigned char *_src,int _ystride){
+ OC_FRAG_COPY_MMX(_dst,_src,_ystride);
+}
+
+/*Copies the fragments specified by the lists of fragment indices from one
+ frame to another.
+ _dst_frame: The reference frame to copy to.
+ _src_frame: The reference frame to copy from.
+ _ystride: The row stride of the reference frames.
+ _fragis: A pointer to a list of fragment indices.
+ _nfragis: The number of fragment indices to copy.
+ _frag_buf_offs: The offsets of fragments in the reference frames.*/
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+ ptrdiff_t fragii;
+ for(fragii=0;fragii<_nfragis;fragii++){
+ ptrdiff_t frag_buf_off;
+ frag_buf_off=_frag_buf_offs[_fragis[fragii]];
+ OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
+ _src_frame+frag_buf_off,_ystride);
+ }
+}
+
+
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue){
+ __asm__ __volatile__(
+ /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
+ "pcmpeqw %%mm0,%%mm0\n\t"
+ /*#0 Load low residue.*/
+ "movq 0*8(%[residue]),%%mm1\n\t"
+ /*#0 Load high residue.*/
+ "movq 1*8(%[residue]),%%mm2\n\t"
+ /*Set mm0 to 0x8000800080008000.*/
+ "psllw $15,%%mm0\n\t"
+ /*#1 Load low residue.*/
+ "movq 2*8(%[residue]),%%mm3\n\t"
+ /*#1 Load high residue.*/
+ "movq 3*8(%[residue]),%%mm4\n\t"
+ /*Set mm0 to 0x0080008000800080.*/
+ "psrlw $8,%%mm0\n\t"
+ /*#2 Load low residue.*/
+ "movq 4*8(%[residue]),%%mm5\n\t"
+ /*#2 Load high residue.*/
+ "movq 5*8(%[residue]),%%mm6\n\t"
+ /*#0 Bias low residue.*/
+ "paddsw %%mm0,%%mm1\n\t"
+ /*#0 Bias high residue.*/
+ "paddsw %%mm0,%%mm2\n\t"
+ /*#0 Pack to byte.*/
+ "packuswb %%mm2,%%mm1\n\t"
+ /*#1 Bias low residue.*/
+ "paddsw %%mm0,%%mm3\n\t"
+ /*#1 Bias high residue.*/
+ "paddsw %%mm0,%%mm4\n\t"
+ /*#1 Pack to byte.*/
+ "packuswb %%mm4,%%mm3\n\t"
+ /*#2 Bias low residue.*/
+ "paddsw %%mm0,%%mm5\n\t"
+ /*#2 Bias high residue.*/
+ "paddsw %%mm0,%%mm6\n\t"
+ /*#2 Pack to byte.*/
+ "packuswb %%mm6,%%mm5\n\t"
+ /*#0 Write row.*/
+ "movq %%mm1,(%[dst])\n\t"
+ /*#1 Write row.*/
+ "movq %%mm3,(%[dst],%[ystride])\n\t"
+ /*#2 Write row.*/
+ "movq %%mm5,(%[dst],%[ystride],2)\n\t"
+ /*#3 Load low residue.*/
+ "movq 6*8(%[residue]),%%mm1\n\t"
+ /*#3 Load high residue.*/
+ "movq 7*8(%[residue]),%%mm2\n\t"
+ /*#4 Load high residue.*/
+ "movq 8*8(%[residue]),%%mm3\n\t"
+ /*#4 Load high residue.*/
+ "movq 9*8(%[residue]),%%mm4\n\t"
+ /*#5 Load high residue.*/
+ "movq 10*8(%[residue]),%%mm5\n\t"
+ /*#5 Load high residue.*/
+ "movq 11*8(%[residue]),%%mm6\n\t"
+ /*#3 Bias low residue.*/
+ "paddsw %%mm0,%%mm1\n\t"
+ /*#3 Bias high residue.*/
+ "paddsw %%mm0,%%mm2\n\t"
+ /*#3 Pack to byte.*/
+ "packuswb %%mm2,%%mm1\n\t"
+ /*#4 Bias low residue.*/
+ "paddsw %%mm0,%%mm3\n\t"
+ /*#4 Bias high residue.*/
+ "paddsw %%mm0,%%mm4\n\t"
+ /*#4 Pack to byte.*/
+ "packuswb %%mm4,%%mm3\n\t"
+ /*#5 Bias low residue.*/
+ "paddsw %%mm0,%%mm5\n\t"
+ /*#5 Bias high residue.*/
+ "paddsw %%mm0,%%mm6\n\t"
+ /*#5 Pack to byte.*/
+ "packuswb %%mm6,%%mm5\n\t"
+ /*#3 Write row.*/
+ "movq %%mm1,(%[dst],%[ystride3])\n\t"
+ /*#4 Write row.*/
+ "movq %%mm3,(%[dst4])\n\t"
+ /*#5 Write row.*/
+ "movq %%mm5,(%[dst4],%[ystride])\n\t"
+ /*#6 Load low residue.*/
+ "movq 12*8(%[residue]),%%mm1\n\t"
+ /*#6 Load high residue.*/
+ "movq 13*8(%[residue]),%%mm2\n\t"
+ /*#7 Load low residue.*/
+ "movq 14*8(%[residue]),%%mm3\n\t"
+ /*#7 Load high residue.*/
+ "movq 15*8(%[residue]),%%mm4\n\t"
+ /*#6 Bias low residue.*/
+ "paddsw %%mm0,%%mm1\n\t"
+ /*#6 Bias high residue.*/
+ "paddsw %%mm0,%%mm2\n\t"
+ /*#6 Pack to byte.*/
+ "packuswb %%mm2,%%mm1\n\t"
+ /*#7 Bias low residue.*/
+ "paddsw %%mm0,%%mm3\n\t"
+ /*#7 Bias high residue.*/
+ "paddsw %%mm0,%%mm4\n\t"
+ /*#7 Pack to byte.*/
+ "packuswb %%mm4,%%mm3\n\t"
+ /*#6 Write row.*/
+ "movq %%mm1,(%[dst4],%[ystride],2)\n\t"
+ /*#7 Write row.*/
+ "movq %%mm3,(%[dst4],%[ystride3])\n\t"
+ :
+ :[residue]"r"(_residue),
+ [dst]"r"(_dst),
+ [dst4]"r"(_dst+(_ystride<<2)),
+ [ystride]"r"((ptrdiff_t)_ystride),
+ [ystride3]"r"((ptrdiff_t)_ystride*3)
+ :"memory"
+ );
+}
+
+void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue){
+ int i;
+ /*Zero mm0.*/
+ __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
+ for(i=4;i-->0;){
+ __asm__ __volatile__(
+ /*#0 Load source.*/
+ "movq (%[src]),%%mm3\n\t"
+ /*#1 Load source.*/
+ "movq (%[src],%[ystride]),%%mm7\n\t"
+ /*#0 Get copy of src.*/
+ "movq %%mm3,%%mm4\n\t"
+ /*#0 Expand high source.*/
+ "punpckhbw %%mm0,%%mm4\n\t"
+ /*#0 Expand low source.*/
+ "punpcklbw %%mm0,%%mm3\n\t"
+ /*#0 Add residue high.*/
+ "paddsw 8(%[residue]),%%mm4\n\t"
+ /*#1 Get copy of src.*/
+ "movq %%mm7,%%mm2\n\t"
+ /*#0 Add residue low.*/
+ "paddsw (%[residue]), %%mm3\n\t"
+ /*#1 Expand high source.*/
+ "punpckhbw %%mm0,%%mm2\n\t"
+ /*#0 Pack final row pixels.*/
+ "packuswb %%mm4,%%mm3\n\t"
+ /*#1 Expand low source.*/
+ "punpcklbw %%mm0,%%mm7\n\t"
+ /*#1 Add residue low.*/
+ "paddsw 16(%[residue]),%%mm7\n\t"
+ /*#1 Add residue high.*/
+ "paddsw 24(%[residue]),%%mm2\n\t"
+ /*Advance residue.*/
+ "lea 32(%[residue]),%[residue]\n\t"
+ /*#1 Pack final row pixels.*/
+ "packuswb %%mm2,%%mm7\n\t"
+ /*Advance src.*/
+ "lea (%[src],%[ystride],2),%[src]\n\t"
+ /*#0 Write row.*/
+ "movq %%mm3,(%[dst])\n\t"
+ /*#1 Write row.*/
+ "movq %%mm7,(%[dst],%[ystride])\n\t"
+ /*Advance dst.*/
+ "lea (%[dst],%[ystride],2),%[dst]\n\t"
+ :[residue]"+r"(_residue),[dst]"+r"(_dst),[src]"+r"(_src)
+ :[ystride]"r"((ptrdiff_t)_ystride)
+ :"memory"
+ );
+ }
+}
+
+void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
+ int i;
+ /*Zero mm7.*/
+ __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);
+ for(i=4;i-->0;){
+ __asm__ __volatile__(
+ /*#0 Load src1.*/
+ "movq (%[src1]),%%mm0\n\t"
+ /*#0 Load src2.*/
+ "movq (%[src2]),%%mm2\n\t"
+ /*#0 Copy src1.*/
+ "movq %%mm0,%%mm1\n\t"
+ /*#0 Copy src2.*/
+ "movq %%mm2,%%mm3\n\t"
+ /*#1 Load src1.*/
+ "movq (%[src1],%[ystride]),%%mm4\n\t"
+ /*#0 Unpack lower src1.*/
+ "punpcklbw %%mm7,%%mm0\n\t"
+ /*#1 Load src2.*/
+ "movq (%[src2],%[ystride]),%%mm5\n\t"
+ /*#0 Unpack higher src1.*/
+ "punpckhbw %%mm7,%%mm1\n\t"
+ /*#0 Unpack lower src2.*/
+ "punpcklbw %%mm7,%%mm2\n\t"
+ /*#0 Unpack higher src2.*/
+ "punpckhbw %%mm7,%%mm3\n\t"
+ /*Advance src1 ptr.*/
+ "lea (%[src1],%[ystride],2),%[src1]\n\t"
+ /*Advance src2 ptr.*/
+ "lea (%[src2],%[ystride],2),%[src2]\n\t"
+ /*#0 Lower src1+src2.*/
+ "paddsw %%mm2,%%mm0\n\t"
+ /*#0 Higher src1+src2.*/
+ "paddsw %%mm3,%%mm1\n\t"
+ /*#1 Copy src1.*/
+ "movq %%mm4,%%mm2\n\t"
+ /*#0 Build lo average.*/
+ "psraw $1,%%mm0\n\t"
+ /*#1 Copy src2.*/
+ "movq %%mm5,%%mm3\n\t"
+ /*#1 Unpack lower src1.*/
+ "punpcklbw %%mm7,%%mm4\n\t"
+ /*#0 Build hi average.*/
+ "psraw $1,%%mm1\n\t"
+ /*#1 Unpack higher src1.*/
+ "punpckhbw %%mm7,%%mm2\n\t"
+ /*#0 low+=residue.*/
+ "paddsw (%[residue]),%%mm0\n\t"
+ /*#1 Unpack lower src2.*/
+ "punpcklbw %%mm7,%%mm5\n\t"
+ /*#0 high+=residue.*/
+ "paddsw 8(%[residue]),%%mm1\n\t"
+ /*#1 Unpack higher src2.*/
+ "punpckhbw %%mm7,%%mm3\n\t"
+ /*#1 Lower src1+src2.*/
+ "paddsw %%mm4,%%mm5\n\t"
+ /*#0 Pack and saturate.*/
+ "packuswb %%mm1,%%mm0\n\t"
+ /*#1 Higher src1+src2.*/
+ "paddsw %%mm2,%%mm3\n\t"
+ /*#0 Write row.*/
+ "movq %%mm0,(%[dst])\n\t"
+ /*#1 Build lo average.*/
+ "psraw $1,%%mm5\n\t"
+ /*#1 Build hi average.*/
+ "psraw $1,%%mm3\n\t"
+ /*#1 low+=residue.*/
+ "paddsw 16(%[residue]),%%mm5\n\t"
+ /*#1 high+=residue.*/
+ "paddsw 24(%[residue]),%%mm3\n\t"
+ /*#1 Pack and saturate.*/
+ "packuswb %%mm3,%%mm5\n\t"
+ /*#1 Write row ptr.*/
+ "movq %%mm5,(%[dst],%[ystride])\n\t"
+ /*Advance residue ptr.*/
+ "add $32,%[residue]\n\t"
+ /*Advance dest ptr.*/
+ "lea (%[dst],%[ystride],2),%[dst]\n\t"
+ :[dst]"+r"(_dst),[residue]"+r"(_residue),
+ [src1]"+r"(_src1),[src2]"+r"(_src2)
+ :[ystride]"r"((ptrdiff_t)_ystride)
+ :"memory"
+ );
+ }
+}
+
+void oc_restore_fpu_mmx(void){
+ __asm__ __volatile__("emms\n\t");
+}
+#endif
diff --git a/media/libtheora/lib/x86/mmxidct.c b/media/libtheora/lib/x86/mmxidct.c
new file mode 100644
index 0000000000..b8e3077066
--- /dev/null
+++ b/media/libtheora/lib/x86/mmxidct.c
@@ -0,0 +1,558 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+/*MMX acceleration of Theora's iDCT.
+ Originally written by Rudolf Marek, based on code from On2's VP3.*/
+#include "x86int.h"
+#include "../dct.h"
+
+#if defined(OC_X86_ASM)
+
+/*These are offsets into the table of constants below.*/
+/*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
+#define OC_COSINE_OFFSET (0)
+/*A row of 8's.*/
+#define OC_EIGHT_OFFSET (56)
+
+
+
+/*38 cycles*/
+#define OC_IDCT_BEGIN(_y,_x) \
+ "#OC_IDCT_BEGIN\n\t" \
+ "movq "OC_I(3,_x)",%%mm2\n\t" \
+ "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
+ "movq %%mm2,%%mm4\n\t" \
+ "movq "OC_J(5,_x)",%%mm7\n\t" \
+ "pmulhw %%mm6,%%mm4\n\t" \
+ "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
+ "pmulhw %%mm7,%%mm6\n\t" \
+ "movq %%mm1,%%mm5\n\t" \
+ "pmulhw %%mm2,%%mm1\n\t" \
+ "movq "OC_I(1,_x)",%%mm3\n\t" \
+ "pmulhw %%mm7,%%mm5\n\t" \
+ "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
+ "paddw %%mm2,%%mm4\n\t" \
+ "paddw %%mm7,%%mm6\n\t" \
+ "paddw %%mm1,%%mm2\n\t" \
+ "movq "OC_J(7,_x)",%%mm1\n\t" \
+ "paddw %%mm5,%%mm7\n\t" \
+ "movq %%mm0,%%mm5\n\t" \
+ "pmulhw %%mm3,%%mm0\n\t" \
+ "paddw %%mm7,%%mm4\n\t" \
+ "pmulhw %%mm1,%%mm5\n\t" \
+ "movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \
+ "psubw %%mm2,%%mm6\n\t" \
+ "paddw %%mm3,%%mm0\n\t" \
+ "pmulhw %%mm7,%%mm3\n\t" \
+ "movq "OC_I(2,_x)",%%mm2\n\t" \
+ "pmulhw %%mm1,%%mm7\n\t" \
+ "paddw %%mm1,%%mm5\n\t" \
+ "movq %%mm2,%%mm1\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \
+ "psubw %%mm5,%%mm3\n\t" \
+ "movq "OC_J(6,_x)",%%mm5\n\t" \
+ "paddw %%mm7,%%mm0\n\t" \
+ "movq %%mm5,%%mm7\n\t" \
+ "psubw %%mm4,%%mm0\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
+ "paddw %%mm1,%%mm2\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
+ "paddw %%mm4,%%mm4\n\t" \
+ "paddw %%mm0,%%mm4\n\t" \
+ "psubw %%mm6,%%mm3\n\t" \
+ "paddw %%mm7,%%mm5\n\t" \
+ "paddw %%mm6,%%mm6\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
+ "paddw %%mm3,%%mm6\n\t" \
+ "movq %%mm4,"OC_I(1,_y)"\n\t" \
+ "psubw %%mm5,%%mm1\n\t" \
+ "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
+ "movq %%mm3,%%mm5\n\t" \
+ "pmulhw %%mm4,%%mm3\n\t" \
+ "paddw %%mm2,%%mm7\n\t" \
+ "movq %%mm6,"OC_I(2,_y)"\n\t" \
+ "movq %%mm0,%%mm2\n\t" \
+ "movq "OC_I(0,_x)",%%mm6\n\t" \
+ "pmulhw %%mm4,%%mm0\n\t" \
+ "paddw %%mm3,%%mm5\n\t" \
+ "movq "OC_J(4,_x)",%%mm3\n\t" \
+ "psubw %%mm1,%%mm5\n\t" \
+ "paddw %%mm0,%%mm2\n\t" \
+ "psubw %%mm3,%%mm6\n\t" \
+ "movq %%mm6,%%mm0\n\t" \
+ "pmulhw %%mm4,%%mm6\n\t" \
+ "paddw %%mm3,%%mm3\n\t" \
+ "paddw %%mm1,%%mm1\n\t" \
+ "paddw %%mm0,%%mm3\n\t" \
+ "paddw %%mm5,%%mm1\n\t" \
+ "pmulhw %%mm3,%%mm4\n\t" \
+ "paddw %%mm0,%%mm6\n\t" \
+ "psubw %%mm2,%%mm6\n\t" \
+ "paddw %%mm2,%%mm2\n\t" \
+ "movq "OC_I(1,_y)",%%mm0\n\t" \
+ "paddw %%mm6,%%mm2\n\t" \
+ "paddw %%mm3,%%mm4\n\t" \
+ "psubw %%mm1,%%mm2\n\t" \
+ "#end OC_IDCT_BEGIN\n\t" \
+
+/*38+8=46 cycles.*/
+#define OC_ROW_IDCT(_y,_x) \
+ "#OC_ROW_IDCT\n" \
+ OC_IDCT_BEGIN(_y,_x) \
+ /*r3=D'*/ \
+ "movq "OC_I(2,_y)",%%mm3\n\t" \
+ /*r4=E'=E-G*/ \
+ "psubw %%mm7,%%mm4\n\t" \
+ /*r1=H'+H'*/ \
+ "paddw %%mm1,%%mm1\n\t" \
+ /*r7=G+G*/ \
+ "paddw %%mm7,%%mm7\n\t" \
+ /*r1=R1=A''+H'*/ \
+ "paddw %%mm2,%%mm1\n\t" \
+ /*r7=G'=E+G*/ \
+ "paddw %%mm4,%%mm7\n\t" \
+ /*r4=R4=E'-D'*/ \
+ "psubw %%mm3,%%mm4\n\t" \
+ "paddw %%mm3,%%mm3\n\t" \
+ /*r6=R6=F'-B''*/ \
+ "psubw %%mm5,%%mm6\n\t" \
+ "paddw %%mm5,%%mm5\n\t" \
+ /*r3=R3=E'+D'*/ \
+ "paddw %%mm4,%%mm3\n\t" \
+ /*r5=R5=F'+B''*/ \
+ "paddw %%mm6,%%mm5\n\t" \
+ /*r7=R7=G'-C'*/ \
+ "psubw %%mm0,%%mm7\n\t" \
+ "paddw %%mm0,%%mm0\n\t" \
+ /*Save R1.*/ \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
+ /*r0=R0=G.+C.*/ \
+ "paddw %%mm7,%%mm0\n\t" \
+ "#end OC_ROW_IDCT\n\t" \
+
+/*The following macro does two 4x4 transposes in place.
+ At entry, we assume:
+ r0 = a3 a2 a1 a0
+ I(1) = b3 b2 b1 b0
+ r2 = c3 c2 c1 c0
+ r3 = d3 d2 d1 d0
+
+ r4 = e3 e2 e1 e0
+ r5 = f3 f2 f1 f0
+ r6 = g3 g2 g1 g0
+ r7 = h3 h2 h1 h0
+
+ At exit, we have:
+ I(0) = d0 c0 b0 a0
+ I(1) = d1 c1 b1 a1
+ I(2) = d2 c2 b2 a2
+ I(3) = d3 c3 b3 a3
+
+ J(4) = h0 g0 f0 e0
+ J(5) = h1 g1 f1 e1
+ J(6) = h2 g2 f2 e2
+ J(7) = h3 g3 f3 e3
+
+ I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
+ J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
+
+ Since r1 is free at entry, we calculate the Js first.*/
+/*19 cycles.*/
+#define OC_TRANSPOSE(_y) \
+ "#OC_TRANSPOSE\n\t" \
+ "movq %%mm4,%%mm1\n\t" \
+ "punpcklwd %%mm5,%%mm4\n\t" \
+ "movq %%mm0,"OC_I(0,_y)"\n\t" \
+ "punpckhwd %%mm5,%%mm1\n\t" \
+ "movq %%mm6,%%mm0\n\t" \
+ "punpcklwd %%mm7,%%mm6\n\t" \
+ "movq %%mm4,%%mm5\n\t" \
+ "punpckldq %%mm6,%%mm4\n\t" \
+ "punpckhdq %%mm6,%%mm5\n\t" \
+ "movq %%mm1,%%mm6\n\t" \
+ "movq %%mm4,"OC_J(4,_y)"\n\t" \
+ "punpckhwd %%mm7,%%mm0\n\t" \
+ "movq %%mm5,"OC_J(5,_y)"\n\t" \
+ "punpckhdq %%mm0,%%mm6\n\t" \
+ "movq "OC_I(0,_y)",%%mm4\n\t" \
+ "punpckldq %%mm0,%%mm1\n\t" \
+ "movq "OC_I(1,_y)",%%mm5\n\t" \
+ "movq %%mm4,%%mm0\n\t" \
+ "movq %%mm6,"OC_J(7,_y)"\n\t" \
+ "punpcklwd %%mm5,%%mm0\n\t" \
+ "movq %%mm1,"OC_J(6,_y)"\n\t" \
+ "punpckhwd %%mm5,%%mm4\n\t" \
+ "movq %%mm2,%%mm5\n\t" \
+ "punpcklwd %%mm3,%%mm2\n\t" \
+ "movq %%mm0,%%mm1\n\t" \
+ "punpckldq %%mm2,%%mm0\n\t" \
+ "punpckhdq %%mm2,%%mm1\n\t" \
+ "movq %%mm4,%%mm2\n\t" \
+ "movq %%mm0,"OC_I(0,_y)"\n\t" \
+ "punpckhwd %%mm3,%%mm5\n\t" \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
+ "punpckhdq %%mm5,%%mm4\n\t" \
+ "punpckldq %%mm5,%%mm2\n\t" \
+ "movq %%mm4,"OC_I(3,_y)"\n\t" \
+ "movq %%mm2,"OC_I(2,_y)"\n\t" \
+ "#end OC_TRANSPOSE\n\t" \
+
+/*38+19=57 cycles.*/
+#define OC_COLUMN_IDCT(_y) \
+ "#OC_COLUMN_IDCT\n" \
+ OC_IDCT_BEGIN(_y,_y) \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
+ /*r1=H'+H'*/ \
+ "paddw %%mm1,%%mm1\n\t" \
+ /*r1=R1=A''+H'*/ \
+ "paddw %%mm2,%%mm1\n\t" \
+ /*r2=NR2*/ \
+ "psraw $4,%%mm2\n\t" \
+ /*r4=E'=E-G*/ \
+ "psubw %%mm7,%%mm4\n\t" \
+ /*r1=NR1*/ \
+ "psraw $4,%%mm1\n\t" \
+ /*r3=D'*/ \
+ "movq "OC_I(2,_y)",%%mm3\n\t" \
+ /*r7=G+G*/ \
+ "paddw %%mm7,%%mm7\n\t" \
+ /*Store NR2 at I(2).*/ \
+ "movq %%mm2,"OC_I(2,_y)"\n\t" \
+ /*r7=G'=E+G*/ \
+ "paddw %%mm4,%%mm7\n\t" \
+ /*Store NR1 at I(1).*/ \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
+ /*r4=R4=E'-D'*/ \
+ "psubw %%mm3,%%mm4\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
+ /*r3=D'+D'*/ \
+ "paddw %%mm3,%%mm3\n\t" \
+ /*r3=R3=E'+D'*/ \
+ "paddw %%mm4,%%mm3\n\t" \
+ /*r4=NR4*/ \
+ "psraw $4,%%mm4\n\t" \
+ /*r6=R6=F'-B''*/ \
+ "psubw %%mm5,%%mm6\n\t" \
+ /*r3=NR3*/ \
+ "psraw $4,%%mm3\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
+ /*r5=B''+B''*/ \
+ "paddw %%mm5,%%mm5\n\t" \
+ /*r5=R5=F'+B''*/ \
+ "paddw %%mm6,%%mm5\n\t" \
+ /*r6=NR6*/ \
+ "psraw $4,%%mm6\n\t" \
+ /*Store NR4 at J(4).*/ \
+ "movq %%mm4,"OC_J(4,_y)"\n\t" \
+ /*r5=NR5*/ \
+ "psraw $4,%%mm5\n\t" \
+ /*Store NR3 at I(3).*/ \
+ "movq %%mm3,"OC_I(3,_y)"\n\t" \
+ /*r7=R7=G'-C'*/ \
+ "psubw %%mm0,%%mm7\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
+ /*r0=C'+C'*/ \
+ "paddw %%mm0,%%mm0\n\t" \
+ /*r0=R0=G'+C'*/ \
+ "paddw %%mm7,%%mm0\n\t" \
+ /*r7=NR7*/ \
+ "psraw $4,%%mm7\n\t" \
+ /*Store NR6 at J(6).*/ \
+ "movq %%mm6,"OC_J(6,_y)"\n\t" \
+ /*r0=NR0*/ \
+ "psraw $4,%%mm0\n\t" \
+ /*Store NR5 at J(5).*/ \
+ "movq %%mm5,"OC_J(5,_y)"\n\t" \
+ /*Store NR7 at J(7).*/ \
+ "movq %%mm7,"OC_J(7,_y)"\n\t" \
+ /*Store NR0 at I(0).*/ \
+ "movq %%mm0,"OC_I(0,_y)"\n\t" \
+ "#end OC_COLUMN_IDCT\n\t" \
+
+static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+ int i;
+ /*This routine accepts an 8x8 matrix, but in partially transposed form.
+ Every 4x4 block is transposed.*/
+ __asm__ __volatile__(
+#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
+#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
+ OC_ROW_IDCT(y,x)
+ OC_TRANSPOSE(y)
+#undef OC_I
+#undef OC_J
+#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+64,_y)
+#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+72,_y)
+ OC_ROW_IDCT(y,x)
+ OC_TRANSPOSE(y)
+#undef OC_I
+#undef OC_J
+#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
+#define OC_J(_k,_y) OC_I(_k,_y)
+ OC_COLUMN_IDCT(y)
+#undef OC_I
+#undef OC_J
+#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
+#define OC_J(_k,_y) OC_I(_k,_y)
+ OC_COLUMN_IDCT(y)
+#undef OC_I
+#undef OC_J
+ :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
+ :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
+ [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
+ );
+ __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
+ for(i=0;i<4;i++){
+ __asm__ __volatile__(
+ "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t"
+ :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16)
+ );
+ }
+}
+
+/*25 cycles.*/
+#define OC_IDCT_BEGIN_10(_y,_x) \
+ "#OC_IDCT_BEGIN_10\n\t" \
+ "movq "OC_I(3,_x)",%%mm2\n\t" \
+ "nop\n\t" \
+ "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
+ "movq %%mm2,%%mm4\n\t" \
+ "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
+ "pmulhw %%mm6,%%mm4\n\t" \
+ "movq "OC_I(1,_x)",%%mm3\n\t" \
+ "pmulhw %%mm2,%%mm1\n\t" \
+ "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
+ "paddw %%mm2,%%mm4\n\t" \
+ "pxor %%mm6,%%mm6\n\t" \
+ "paddw %%mm1,%%mm2\n\t" \
+ "movq "OC_I(2,_x)",%%mm5\n\t" \
+ "pmulhw %%mm3,%%mm0\n\t" \
+ "movq %%mm5,%%mm1\n\t" \
+ "paddw %%mm3,%%mm0\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
+ "psubw %%mm2,%%mm6\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
+ "psubw %%mm4,%%mm0\n\t" \
+ "movq "OC_I(2,_x)",%%mm7\n\t" \
+ "paddw %%mm4,%%mm4\n\t" \
+ "paddw %%mm5,%%mm7\n\t" \
+ "paddw %%mm0,%%mm4\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
+ "psubw %%mm6,%%mm3\n\t" \
+ "movq %%mm4,"OC_I(1,_y)"\n\t" \
+ "paddw %%mm6,%%mm6\n\t" \
+ "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
+ "paddw %%mm3,%%mm6\n\t" \
+ "movq %%mm3,%%mm5\n\t" \
+ "pmulhw %%mm4,%%mm3\n\t" \
+ "movq %%mm6,"OC_I(2,_y)"\n\t" \
+ "movq %%mm0,%%mm2\n\t" \
+ "movq "OC_I(0,_x)",%%mm6\n\t" \
+ "pmulhw %%mm4,%%mm0\n\t" \
+ "paddw %%mm3,%%mm5\n\t" \
+ "paddw %%mm0,%%mm2\n\t" \
+ "psubw %%mm1,%%mm5\n\t" \
+ "pmulhw %%mm4,%%mm6\n\t" \
+ "paddw "OC_I(0,_x)",%%mm6\n\t" \
+ "paddw %%mm1,%%mm1\n\t" \
+ "movq %%mm6,%%mm4\n\t" \
+ "paddw %%mm5,%%mm1\n\t" \
+ "psubw %%mm2,%%mm6\n\t" \
+ "paddw %%mm2,%%mm2\n\t" \
+ "movq "OC_I(1,_y)",%%mm0\n\t" \
+ "paddw %%mm6,%%mm2\n\t" \
+ "psubw %%mm1,%%mm2\n\t" \
+ "nop\n\t" \
+ "#end OC_IDCT_BEGIN_10\n\t" \
+
+/*25+8=33 cycles.*/
+#define OC_ROW_IDCT_10(_y,_x) \
+ "#OC_ROW_IDCT_10\n\t" \
+ OC_IDCT_BEGIN_10(_y,_x) \
+ /*r3=D'*/ \
+ "movq "OC_I(2,_y)",%%mm3\n\t" \
+ /*r4=E'=E-G*/ \
+ "psubw %%mm7,%%mm4\n\t" \
+ /*r1=H'+H'*/ \
+ "paddw %%mm1,%%mm1\n\t" \
+ /*r7=G+G*/ \
+ "paddw %%mm7,%%mm7\n\t" \
+ /*r1=R1=A''+H'*/ \
+ "paddw %%mm2,%%mm1\n\t" \
+ /*r7=G'=E+G*/ \
+ "paddw %%mm4,%%mm7\n\t" \
+ /*r4=R4=E'-D'*/ \
+ "psubw %%mm3,%%mm4\n\t" \
+ "paddw %%mm3,%%mm3\n\t" \
+ /*r6=R6=F'-B''*/ \
+ "psubw %%mm5,%%mm6\n\t" \
+ "paddw %%mm5,%%mm5\n\t" \
+ /*r3=R3=E'+D'*/ \
+ "paddw %%mm4,%%mm3\n\t" \
+ /*r5=R5=F'+B''*/ \
+ "paddw %%mm6,%%mm5\n\t" \
+ /*r7=R7=G'-C'*/ \
+ "psubw %%mm0,%%mm7\n\t" \
+ "paddw %%mm0,%%mm0\n\t" \
+ /*Save R1.*/ \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
+ /*r0=R0=G'+C'*/ \
+ "paddw %%mm7,%%mm0\n\t" \
+ "#end OC_ROW_IDCT_10\n\t" \
+
+/*25+19=44 cycles'*/
+#define OC_COLUMN_IDCT_10(_y) \
+ "#OC_COLUMN_IDCT_10\n\t" \
+ OC_IDCT_BEGIN_10(_y,_y) \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
+ /*r1=H'+H'*/ \
+ "paddw %%mm1,%%mm1\n\t" \
+ /*r1=R1=A''+H'*/ \
+ "paddw %%mm2,%%mm1\n\t" \
+ /*r2=NR2*/ \
+ "psraw $4,%%mm2\n\t" \
+ /*r4=E'=E-G*/ \
+ "psubw %%mm7,%%mm4\n\t" \
+ /*r1=NR1*/ \
+ "psraw $4,%%mm1\n\t" \
+ /*r3=D'*/ \
+ "movq "OC_I(2,_y)",%%mm3\n\t" \
+ /*r7=G+G*/ \
+ "paddw %%mm7,%%mm7\n\t" \
+ /*Store NR2 at I(2).*/ \
+ "movq %%mm2,"OC_I(2,_y)"\n\t" \
+ /*r7=G'=E+G*/ \
+ "paddw %%mm4,%%mm7\n\t" \
+ /*Store NR1 at I(1).*/ \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
+ /*r4=R4=E'-D'*/ \
+ "psubw %%mm3,%%mm4\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
+ /*r3=D'+D'*/ \
+ "paddw %%mm3,%%mm3\n\t" \
+ /*r3=R3=E'+D'*/ \
+ "paddw %%mm4,%%mm3\n\t" \
+ /*r4=NR4*/ \
+ "psraw $4,%%mm4\n\t" \
+ /*r6=R6=F'-B''*/ \
+ "psubw %%mm5,%%mm6\n\t" \
+ /*r3=NR3*/ \
+ "psraw $4,%%mm3\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
+ /*r5=B''+B''*/ \
+ "paddw %%mm5,%%mm5\n\t" \
+ /*r5=R5=F'+B''*/ \
+ "paddw %%mm6,%%mm5\n\t" \
+ /*r6=NR6*/ \
+ "psraw $4,%%mm6\n\t" \
+ /*Store NR4 at J(4).*/ \
+ "movq %%mm4,"OC_J(4,_y)"\n\t" \
+ /*r5=NR5*/ \
+ "psraw $4,%%mm5\n\t" \
+ /*Store NR3 at I(3).*/ \
+ "movq %%mm3,"OC_I(3,_y)"\n\t" \
+ /*r7=R7=G'-C'*/ \
+ "psubw %%mm0,%%mm7\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
+ /*r0=C'+C'*/ \
+ "paddw %%mm0,%%mm0\n\t" \
+ /*r0=R0=G'+C'*/ \
+ "paddw %%mm7,%%mm0\n\t" \
+ /*r7=NR7*/ \
+ "psraw $4,%%mm7\n\t" \
+ /*Store NR6 at J(6).*/ \
+ "movq %%mm6,"OC_J(6,_y)"\n\t" \
+ /*r0=NR0*/ \
+ "psraw $4,%%mm0\n\t" \
+ /*Store NR5 at J(5).*/ \
+ "movq %%mm5,"OC_J(5,_y)"\n\t" \
+ /*Store NR7 at J(7).*/ \
+ "movq %%mm7,"OC_J(7,_y)"\n\t" \
+ /*Store NR0 at I(0).*/ \
+ "movq %%mm0,"OC_I(0,_y)"\n\t" \
+ "#end OC_COLUMN_IDCT_10\n\t" \
+
+static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+ __asm__ __volatile__(
+#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
+#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
+ /*Done with dequant, descramble, and partial transpose.
+ Now do the iDCT itself.*/
+ OC_ROW_IDCT_10(y,x)
+ OC_TRANSPOSE(y)
+#undef OC_I
+#undef OC_J
+#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
+#define OC_J(_k,_y) OC_I(_k,_y)
+ OC_COLUMN_IDCT_10(y)
+#undef OC_I
+#undef OC_J
+#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
+#define OC_J(_k,_y) OC_I(_k,_y)
+ OC_COLUMN_IDCT_10(y)
+#undef OC_I
+#undef OC_J
+ :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
+ :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
+ [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
+ );
+ __asm__ __volatile__(
+ "pxor %%mm0,%%mm0\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+ :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28)
+ );
+}
+
+/*Performs an inverse 8x8 Type-II DCT transform.
+ The input is assumed to be scaled by a factor of 4 relative to orthonormal
+ version of the transform.*/
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
+ /*_last_zzi is subtly different from an actual count of the number of
+ coefficients we decoded for this block.
+ It contains the value of zzi BEFORE the final token in the block was
+ decoded.
+ In most cases this is an EOB token (the continuation of an EOB run from a
+ previous block counts), and so this is the same as the coefficient count.
+ However, in the case that the last token was NOT an EOB token, but filled
+ the block up with exactly 64 coefficients, _last_zzi will be less than 64.
+ Provided the last token was not a pure zero run, the minimum value it can
+ be is 46, and so that doesn't affect any of the cases in this routine.
+ However, if the last token WAS a pure zero run of length 63, then _last_zzi
+ will be 1 while the number of coefficients decoded is 64.
+ Thus, we will trigger the following special case, where the real
+ coefficient count would not.
+ Note also that a zero run of length 64 will give _last_zzi a value of 0,
+ but we still process the DC coefficient, which might have a non-zero value
+ due to DC prediction.
+ Although convoluted, this is arguably the correct behavior: it allows us to
+ use a smaller transform when the block ends with a long zero run instead
+ of a normal EOB token.
+ It could be smarter... multiple separate zero runs at the end of a block
+ will fool it, but an encoder that generates these really deserves what it
+ gets.
+ Needless to say we inherited this approach from VP3.*/
+ /*Then perform the iDCT.*/
+ if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x);
+ else oc_idct8x8_slow_mmx(_y,_x);
+}
+
+#endif
diff --git a/media/libtheora/lib/x86/mmxloop.h b/media/libtheora/lib/x86/mmxloop.h
new file mode 100644
index 0000000000..1f6090b567
--- /dev/null
+++ b/media/libtheora/lib/x86/mmxloop.h
@@ -0,0 +1,318 @@
+#if !defined(_x86_mmxloop_H)
+# define _x86_mmxloop_H (1)
+# include <stddef.h>
+# include "x86int.h"
+
+#if defined(OC_X86_ASM)
+
+/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}.
+ On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
+ mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/
+#define OC_LOOP_FILTER8_MMX \
+ "#OC_LOOP_FILTER8_MMX\n\t" \
+ /*mm7=0*/ \
+ "pxor %%mm7,%%mm7\n\t" \
+ /*mm6:mm0={a0,...,a7}*/ \
+ "movq %%mm0,%%mm6\n\t" \
+ "punpcklbw %%mm7,%%mm0\n\t" \
+ "punpckhbw %%mm7,%%mm6\n\t" \
+ /*mm3:mm5={d0,...,d7}*/ \
+ "movq %%mm3,%%mm5\n\t" \
+ "punpcklbw %%mm7,%%mm3\n\t" \
+ "punpckhbw %%mm7,%%mm5\n\t" \
+ /*mm6:mm0={a0-d0,...,a7-d7}*/ \
+ "psubw %%mm3,%%mm0\n\t" \
+ "psubw %%mm5,%%mm6\n\t" \
+ /*mm3:mm1={b0,...,b7}*/ \
+ "movq %%mm1,%%mm3\n\t" \
+ "punpcklbw %%mm7,%%mm1\n\t" \
+ "movq %%mm2,%%mm4\n\t" \
+ "punpckhbw %%mm7,%%mm3\n\t" \
+ /*mm5:mm4={c0,...,c7}*/ \
+ "movq %%mm2,%%mm5\n\t" \
+ "punpcklbw %%mm7,%%mm4\n\t" \
+ "punpckhbw %%mm7,%%mm5\n\t" \
+ /*mm7={3}x4 \
+ mm5:mm4={c0-b0,...,c7-b7}*/ \
+ "pcmpeqw %%mm7,%%mm7\n\t" \
+ "psubw %%mm1,%%mm4\n\t" \
+ "psrlw $14,%%mm7\n\t" \
+ "psubw %%mm3,%%mm5\n\t" \
+ /*Scale by 3.*/ \
+ "pmullw %%mm7,%%mm4\n\t" \
+ "pmullw %%mm7,%%mm5\n\t" \
+ /*mm7={4}x4 \
+ mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
+ "psrlw $1,%%mm7\n\t" \
+ "paddw %%mm0,%%mm4\n\t" \
+ "psllw $2,%%mm7\n\t" \
+ "movq (%[ll]),%%mm0\n\t" \
+ "paddw %%mm6,%%mm5\n\t" \
+ /*R_i has the range [-127,128], so we compute -R_i instead. \
+ mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
+ "psubw %%mm7,%%mm4\n\t" \
+ "psubw %%mm7,%%mm5\n\t" \
+ "psraw $3,%%mm4\n\t" \
+ "psraw $3,%%mm5\n\t" \
+ "pcmpeqb %%mm7,%%mm7\n\t" \
+ "packsswb %%mm5,%%mm4\n\t" \
+ "pxor %%mm6,%%mm6\n\t" \
+ "pxor %%mm7,%%mm4\n\t" \
+ "packuswb %%mm3,%%mm1\n\t" \
+ /*Now compute lflim of -mm4 cf. Section 7.10 of the sepc.*/ \
+ /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
+ we have to split things by sign (the other option is to work in 16 bits, \
+ but working in 8 bits gives much better parallelism). \
+ We compute abs(R_i), but save a mask of which terms were negative in mm6. \
+ Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
+ Finally, we split mm4 into positive and negative pieces using the mask in \
+ mm6, and add and subtract them as appropriate.*/ \
+ /*mm4=abs(-R_i)*/ \
+ /*mm7=255-2*L*/ \
+ "pcmpgtb %%mm4,%%mm6\n\t" \
+ "psubb %%mm0,%%mm7\n\t" \
+ "pxor %%mm6,%%mm4\n\t" \
+ "psubb %%mm0,%%mm7\n\t" \
+ "psubb %%mm6,%%mm4\n\t" \
+ /*mm7=255-max(2*L-abs(R_i),0)*/ \
+ "paddusb %%mm4,%%mm7\n\t" \
+ /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
+ "paddusb %%mm7,%%mm4\n\t" \
+ "psubusb %%mm7,%%mm4\n\t" \
+ /*Now split mm4 by the original sign of -R_i.*/ \
+ "movq %%mm4,%%mm5\n\t" \
+ "pand %%mm6,%%mm4\n\t" \
+ "pandn %%mm5,%%mm6\n\t" \
+ /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
+ /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
+ "paddusb %%mm4,%%mm1\n\t" \
+ "psubusb %%mm4,%%mm2\n\t" \
+ "psubusb %%mm6,%%mm1\n\t" \
+ "paddusb %%mm6,%%mm2\n\t" \
+
+/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}.
+ On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
+ mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}.
+ All other MMX registers are clobbered.*/
+#define OC_LOOP_FILTER8_MMXEXT \
+ "#OC_LOOP_FILTER8_MMXEXT\n\t" \
+ /*R_i=(a_i-3*b_i+3*c_i-d_i+4>>3) has the range [-127,128], so we compute \
+ -R_i=(-a_i+3*b_i-3*c_i+d_i+3>>3) instead.*/ \
+ /*This first part is based on the transformation \
+ f = -(3*(c-b)+a-d+4>>3) \
+ = -(3*(c+255-b)+(a+255-d)+4-1020>>3) \
+ = -(3*(c+~b)+(a+~d)-1016>>3) \
+ = 127-(3*(c+~b)+(a+~d)>>3) \
+ = 128+~(3*(c+~b)+(a+~d)>>3) (mod 256). \
+ Although pavgb(a,b) = (a+b+1>>1) (biased up), we rely heavily on the \
+ fact that ~pavgb(~a,~b) = (a+b>>1) (biased down). \
+ Using this, the last expression above can be computed in 8 bits of working \
+ precision via: \
+ u = ~pavgb(~b,c); \
+ v = pavgb(b,~c); \
+ This mask is 0 or 0xFF, and controls whether t is biased up or down: \
+ m = u-v; \
+ t = m^pavgb(m^~a,m^d); \
+ f = 128+pavgb(pavgb(t,u),v); \
+ This required some careful analysis to ensure that carries are propagated \
+ correctly in all cases, but has been checked exhaustively.*/ \
+ /*input (a, b, c, d, ., ., ., .)*/ \
+ /*ff=0xFF; \
+ u=b; \
+ v=c; \
+ ll=255-2*L;*/ \
+ "pcmpeqb %%mm7,%%mm7\n\t" \
+ "movq %%mm1,%%mm4\n\t" \
+ "movq %%mm2,%%mm5\n\t" \
+ "movq (%[ll]),%%mm6\n\t" \
+ /*allocated u, v, ll, ff: (a, b, c, d, u, v, ll, ff)*/ \
+ /*u^=ff; \
+ v^=ff;*/ \
+ "pxor %%mm7,%%mm4\n\t" \
+ "pxor %%mm7,%%mm5\n\t" \
+ /*allocated ll: (a, b, c, d, u, v, ll, ff)*/ \
+ /*u=pavgb(u,c); \
+ v=pavgb(v,b);*/ \
+ "pavgb %%mm2,%%mm4\n\t" \
+ "pavgb %%mm1,%%mm5\n\t" \
+ /*u^=ff; \
+ a^=ff;*/ \
+ "pxor %%mm7,%%mm4\n\t" \
+ "pxor %%mm7,%%mm0\n\t" \
+ /*m=u-v;*/ \
+ "psubb %%mm5,%%mm4\n\t" \
+ /*freed u, allocated m: (a, b, c, d, m, v, ll, ff)*/ \
+ /*a^=m; \
+ d^=m;*/ \
+ "pxor %%mm4,%%mm0\n\t" \
+ "pxor %%mm4,%%mm3\n\t" \
+ /*t=pavgb(a,d);*/ \
+ "pavgb %%mm3,%%mm0\n\t" \
+ "psllw $7,%%mm7\n\t" \
+ /*freed a, d, ff, allocated t, of: (t, b, c, ., m, v, ll, of)*/ \
+ /*t^=m; \
+ u=m+v;*/ \
+ "pxor %%mm4,%%mm0\n\t" \
+ "paddb %%mm5,%%mm4\n\t" \
+ /*freed t, m, allocated f, u: (f, b, c, ., u, v, ll, of)*/ \
+ /*f=pavgb(f,u); \
+ of=128;*/ \
+ "pavgb %%mm4,%%mm0\n\t" \
+ "packsswb %%mm7,%%mm7\n\t" \
+ /*freed u, ff, allocated ll: (f, b, c, ., ll, v, ll, of)*/ \
+ /*f=pavgb(f,v);*/ \
+ "pavgb %%mm5,%%mm0\n\t" \
+ "movq %%mm7,%%mm3\n\t" \
+ "movq %%mm6,%%mm4\n\t" \
+ /*freed v, allocated of: (f, b, c, of, ll, ., ll, of)*/ \
+ /*Now compute lflim of R_i=-(128+mm0) cf. Section 7.10 of the sepc.*/ \
+ /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
+ we have to split things by sign (the other option is to work in 16 bits, \
+ but staying in 8 bits gives much better parallelism).*/ \
+ /*Instead of adding the offset of 128 in mm3, we use it to split mm0. \
+ This is the same number of instructions as computing a mask and splitting \
+ after the lflim computation, but has shorter dependency chains.*/ \
+ /*mm0=R_i<0?-R_i:0 (denoted abs(R_i<0))\
+ mm3=R_i>0?R_i:0* (denoted abs(R_i>0))*/ \
+ "psubusb %%mm0,%%mm3\n\t" \
+ "psubusb %%mm7,%%mm0\n\t" \
+ /*mm6=255-max(2*L-abs(R_i<0),0) \
+ mm4=255-max(2*L-abs(R_i>0),0)*/ \
+ "paddusb %%mm3,%%mm4\n\t" \
+ "paddusb %%mm0,%%mm6\n\t" \
+ /*mm0=min(abs(R_i<0),max(2*L-abs(R_i<0),0)) \
+ mm3=min(abs(R_i>0),max(2*L-abs(R_i>0),0))*/ \
+ "paddusb %%mm4,%%mm3\n\t" \
+ "paddusb %%mm6,%%mm0\n\t" \
+ "psubusb %%mm4,%%mm3\n\t" \
+ "psubusb %%mm6,%%mm0\n\t" \
+ /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
+ /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
+ "paddusb %%mm3,%%mm1\n\t" \
+ "psubusb %%mm3,%%mm2\n\t" \
+ "psubusb %%mm0,%%mm1\n\t" \
+ "paddusb %%mm0,%%mm2\n\t" \
+
+#define OC_LOOP_FILTER_V(_filter,_pix,_ystride,_ll) \
+ do{ \
+ ptrdiff_t ystride3__; \
+ __asm__ __volatile__( \
+ /*mm0={a0,...,a7}*/ \
+ "movq (%[pix]),%%mm0\n\t" \
+ /*ystride3=_ystride*3*/ \
+ "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
+ /*mm3={d0,...,d7}*/ \
+ "movq (%[pix],%[ystride3]),%%mm3\n\t" \
+ /*mm1={b0,...,b7}*/ \
+ "movq (%[pix],%[ystride]),%%mm1\n\t" \
+ /*mm2={c0,...,c7}*/ \
+ "movq (%[pix],%[ystride],2),%%mm2\n\t" \
+ _filter \
+ /*Write it back out.*/ \
+ "movq %%mm1,(%[pix],%[ystride])\n\t" \
+ "movq %%mm2,(%[pix],%[ystride],2)\n\t" \
+ :[ystride3]"=&r"(ystride3__) \
+ :[pix]"r"(_pix-_ystride*2),[ystride]"r"((ptrdiff_t)(_ystride)), \
+ [ll]"r"(_ll) \
+ :"memory" \
+ ); \
+ } \
+ while(0)
+
+#define OC_LOOP_FILTER_H(_filter,_pix,_ystride,_ll) \
+ do{ \
+ unsigned char *pix__; \
+ ptrdiff_t ystride3__; \
+ ptrdiff_t d__; \
+ pix__=(_pix)-2; \
+ __asm__ __volatile__( \
+ /*x x x x d0 c0 b0 a0*/ \
+ "movd (%[pix]),%%mm0\n\t" \
+ /*x x x x d1 c1 b1 a1*/ \
+ "movd (%[pix],%[ystride]),%%mm1\n\t" \
+ /*ystride3=_ystride*3*/ \
+ "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
+ /*x x x x d2 c2 b2 a2*/ \
+ "movd (%[pix],%[ystride],2),%%mm2\n\t" \
+ /*x x x x d3 c3 b3 a3*/ \
+ "lea (%[pix],%[ystride],4),%[d]\n\t" \
+ "movd (%[pix],%[ystride3]),%%mm3\n\t" \
+ /*x x x x d4 c4 b4 a4*/ \
+ "movd (%[d]),%%mm4\n\t" \
+ /*x x x x d5 c5 b5 a5*/ \
+ "movd (%[d],%[ystride]),%%mm5\n\t" \
+ /*x x x x d6 c6 b6 a6*/ \
+ "movd (%[d],%[ystride],2),%%mm6\n\t" \
+ /*x x x x d7 c7 b7 a7*/ \
+ "movd (%[d],%[ystride3]),%%mm7\n\t" \
+ /*mm0=d1 d0 c1 c0 b1 b0 a1 a0*/ \
+ "punpcklbw %%mm1,%%mm0\n\t" \
+ /*mm2=d3 d2 c3 c2 b3 b2 a3 a2*/ \
+ "punpcklbw %%mm3,%%mm2\n\t" \
+ /*mm3=d1 d0 c1 c0 b1 b0 a1 a0*/ \
+ "movq %%mm0,%%mm3\n\t" \
+ /*mm0=b3 b2 b1 b0 a3 a2 a1 a0*/ \
+ "punpcklwd %%mm2,%%mm0\n\t" \
+ /*mm3=d3 d2 d1 d0 c3 c2 c1 c0*/ \
+ "punpckhwd %%mm2,%%mm3\n\t" \
+ /*mm1=b3 b2 b1 b0 a3 a2 a1 a0*/ \
+ "movq %%mm0,%%mm1\n\t" \
+ /*mm4=d5 d4 c5 c4 b5 b4 a5 a4*/ \
+ "punpcklbw %%mm5,%%mm4\n\t" \
+ /*mm6=d7 d6 c7 c6 b7 b6 a7 a6*/ \
+ "punpcklbw %%mm7,%%mm6\n\t" \
+ /*mm5=d5 d4 c5 c4 b5 b4 a5 a4*/ \
+ "movq %%mm4,%%mm5\n\t" \
+ /*mm4=b7 b6 b5 b4 a7 a6 a5 a4*/ \
+ "punpcklwd %%mm6,%%mm4\n\t" \
+ /*mm5=d7 d6 d5 d4 c7 c6 c5 c4*/ \
+ "punpckhwd %%mm6,%%mm5\n\t" \
+ /*mm2=d3 d2 d1 d0 c3 c2 c1 c0*/ \
+ "movq %%mm3,%%mm2\n\t" \
+ /*mm0=a7 a6 a5 a4 a3 a2 a1 a0*/ \
+ "punpckldq %%mm4,%%mm0\n\t" \
+ /*mm1=b7 b6 b5 b4 b3 b2 b1 b0*/ \
+ "punpckhdq %%mm4,%%mm1\n\t" \
+ /*mm2=c7 c6 c5 c4 c3 c2 c1 c0*/ \
+ "punpckldq %%mm5,%%mm2\n\t" \
+ /*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \
+ "punpckhdq %%mm5,%%mm3\n\t" \
+ _filter \
+ /*mm2={b0+R_0'',...,b7+R_7''}*/ \
+ "movq %%mm1,%%mm0\n\t" \
+ /*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \
+ "punpcklbw %%mm2,%%mm1\n\t" \
+ /*mm2={b4+R_4'',c4-R_4'',...,b7+R_7'',c7-R_7''}*/ \
+ "punpckhbw %%mm2,%%mm0\n\t" \
+ /*[d]=c1 b1 c0 b0*/ \
+ "movd %%mm1,%[d]\n\t" \
+ "movw %w[d],1(%[pix])\n\t" \
+ "psrlq $32,%%mm1\n\t" \
+ "shr $16,%[d]\n\t" \
+ "movw %w[d],1(%[pix],%[ystride])\n\t" \
+ /*[d]=c3 b3 c2 b2*/ \
+ "movd %%mm1,%[d]\n\t" \
+ "movw %w[d],1(%[pix],%[ystride],2)\n\t" \
+ "shr $16,%[d]\n\t" \
+ "movw %w[d],1(%[pix],%[ystride3])\n\t" \
+ "lea (%[pix],%[ystride],4),%[pix]\n\t" \
+ /*[d]=c5 b5 c4 b4*/ \
+ "movd %%mm0,%[d]\n\t" \
+ "movw %w[d],1(%[pix])\n\t" \
+ "psrlq $32,%%mm0\n\t" \
+ "shr $16,%[d]\n\t" \
+ "movw %w[d],1(%[pix],%[ystride])\n\t" \
+ /*[d]=c7 b7 c6 b6*/ \
+ "movd %%mm0,%[d]\n\t" \
+ "movw %w[d],1(%[pix],%[ystride],2)\n\t" \
+ "shr $16,%[d]\n\t" \
+ "movw %w[d],1(%[pix],%[ystride3])\n\t" \
+ :[pix]"+r"(pix__),[ystride3]"=&r"(ystride3__),[d]"=&r"(d__) \
+ :[ystride]"r"((ptrdiff_t)(_ystride)),[ll]"r"(_ll) \
+ :"memory" \
+ ); \
+ } \
+ while(0)
+
+# endif
+#endif
diff --git a/media/libtheora/lib/x86/mmxstate.c b/media/libtheora/lib/x86/mmxstate.c
new file mode 100644
index 0000000000..eebea14fba
--- /dev/null
+++ b/media/libtheora/lib/x86/mmxstate.c
@@ -0,0 +1,226 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+/*MMX acceleration of complete fragment reconstruction algorithm.
+ Originally written by Rudolf Marek.*/
+#include <string.h>
+#include "x86int.h"
+#include "mmxloop.h"
+
+#if defined(OC_X86_ASM)
+
+void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+ unsigned char *dst;
+ ptrdiff_t frag_buf_off;
+ int ystride;
+ int refi;
+ /*Apply the inverse transform.*/
+ /*Special case only having a DC component.*/
+ if(_last_zzi<2){
+ /*Note that this value must be unsigned, to keep the __asm__ block from
+ sign-extending it when it puts it in a register.*/
+ ogg_uint16_t p;
+ int i;
+ /*We round this dequant product (and not any of the others) because there's
+ no iDCT rounding.*/
+ p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+ /*Fill _dct_coeffs with p.*/
+ __asm__ __volatile__(
+ /*mm0=0000 0000 0000 AAAA*/
+ "movd %[p],%%mm0\n\t"
+ /*mm0=0000 0000 AAAA AAAA*/
+ "punpcklwd %%mm0,%%mm0\n\t"
+ /*mm0=AAAA AAAA AAAA AAAA*/
+ "punpckldq %%mm0,%%mm0\n\t"
+ :
+ :[p]"r"((unsigned)p)
+ );
+ for(i=0;i<4;i++){
+ __asm__ __volatile__(
+ "movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t"
+ :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16)
+ );
+ }
+ }
+ else{
+ /*Dequantize the DC coefficient.*/
+ _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+ oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
+ }
+ /*Fill in the target buffer.*/
+ frag_buf_off=_state->frag_buf_offs[_fragi];
+ refi=_state->frags[_fragi].refi;
+ ystride=_state->ref_ystride[_pli];
+ dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+ if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
+ else{
+ const unsigned char *ref;
+ int mvoffsets[2];
+ ref=_state->ref_frame_data[refi]+frag_buf_off;
+ if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+ _state->frag_mvs[_fragi])>1){
+ oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+ _dct_coeffs+64);
+ }
+ else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+ }
+}
+
+/*We copy these entire function to inline the actual MMX routines so that we
+ use only a single indirect call.*/
+
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
+ memset(_bv,_flimit,8);
+}
+
+/*Apply the loop filter to a given set of fragment rows in the given plane.
+ The filter may be run on the bottom edge, affecting pixels in the next row of
+ fragments, so this row also needs to be available.
+ _bv: The bounding values array.
+ _refi: The index of the frame buffer to filter.
+ _pli: The color plane to filter.
+ _fragy0: The Y coordinate of the first fragment row to filter.
+ _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
+void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
+ OC_ALIGN8(unsigned char ll[8]);
+ const oc_fragment_plane *fplane;
+ const oc_fragment *frags;
+ const ptrdiff_t *frag_buf_offs;
+ unsigned char *ref_frame_data;
+ ptrdiff_t fragi_top;
+ ptrdiff_t fragi_bot;
+ ptrdiff_t fragi0;
+ ptrdiff_t fragi0_end;
+ int ystride;
+ int nhfrags;
+ memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
+ fplane=_state->fplanes+_pli;
+ nhfrags=fplane->nhfrags;
+ fragi_top=fplane->froffset;
+ fragi_bot=fragi_top+fplane->nfrags;
+ fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
+ fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
+ ystride=_state->ref_ystride[_pli];
+ frags=_state->frags;
+ frag_buf_offs=_state->frag_buf_offs;
+ ref_frame_data=_state->ref_frame_data[_refi];
+ /*The following loops are constructed somewhat non-intuitively on purpose.
+ The main idea is: if a block boundary has at least one coded fragment on
+ it, the filter is applied to it.
+ However, the order that the filters are applied in matters, and VP3 chose
+ the somewhat strange ordering used below.*/
+ while(fragi0<fragi0_end){
+ ptrdiff_t fragi;
+ ptrdiff_t fragi_end;
+ fragi=fragi0;
+ fragi_end=fragi+nhfrags;
+ while(fragi<fragi_end){
+ if(frags[fragi].coded){
+ unsigned char *ref;
+ ref=ref_frame_data+frag_buf_offs[fragi];
+ if(fragi>fragi0){
+ OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
+ }
+ if(fragi0>fragi_top){
+ OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
+ }
+ if(fragi+1<fragi_end&&!frags[fragi+1].coded){
+ OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll);
+ }
+ if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
+ OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride<<3),ystride,ll);
+ }
+ }
+ fragi++;
+ }
+ fragi0+=nhfrags;
+ }
+}
+
+void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){
+ memset(_bv,~(_flimit<<1),8);
+}
+
+/*Apply the loop filter to a given set of fragment rows in the given plane.
+ The filter may be run on the bottom edge, affecting pixels in the next row of
+ fragments, so this row also needs to be available.
+ _bv: The bounding values array.
+ _refi: The index of the frame buffer to filter.
+ _pli: The color plane to filter.
+ _fragy0: The Y coordinate of the first fragment row to filter.
+ _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
+void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
+ const oc_fragment_plane *fplane;
+ const oc_fragment *frags;
+ const ptrdiff_t *frag_buf_offs;
+ unsigned char *ref_frame_data;
+ ptrdiff_t fragi_top;
+ ptrdiff_t fragi_bot;
+ ptrdiff_t fragi0;
+ ptrdiff_t fragi0_end;
+ int ystride;
+ int nhfrags;
+ fplane=_state->fplanes+_pli;
+ nhfrags=fplane->nhfrags;
+ fragi_top=fplane->froffset;
+ fragi_bot=fragi_top+fplane->nfrags;
+ fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
+ fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
+ ystride=_state->ref_ystride[_pli];
+ frags=_state->frags;
+ frag_buf_offs=_state->frag_buf_offs;
+ ref_frame_data=_state->ref_frame_data[_refi];
+ /*The following loops are constructed somewhat non-intuitively on purpose.
+ The main idea is: if a block boundary has at least one coded fragment on
+ it, the filter is applied to it.
+ However, the order that the filters are applied in matters, and VP3 chose
+ the somewhat strange ordering used below.*/
+ while(fragi0<fragi0_end){
+ ptrdiff_t fragi;
+ ptrdiff_t fragi_end;
+ fragi=fragi0;
+ fragi_end=fragi+nhfrags;
+ while(fragi<fragi_end){
+ if(frags[fragi].coded){
+ unsigned char *ref;
+ ref=ref_frame_data+frag_buf_offs[fragi];
+ if(fragi>fragi0){
+ OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
+ }
+ if(fragi0>fragi_top){
+ OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
+ }
+ if(fragi+1<fragi_end&&!frags[fragi+1].coded){
+ OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv);
+ }
+ if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
+ OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride<<3),ystride,_bv);
+ }
+ }
+ fragi++;
+ }
+ fragi0+=nhfrags;
+ }
+}
+
+#endif
diff --git a/media/libtheora/lib/x86/sse2idct.c b/media/libtheora/lib/x86/sse2idct.c
new file mode 100644
index 0000000000..4597ab074f
--- /dev/null
+++ b/media/libtheora/lib/x86/sse2idct.c
@@ -0,0 +1,456 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+/*SSE2 acceleration of Theora's iDCT.*/
+#include "x86int.h"
+#include "sse2trans.h"
+#include "../dct.h"
+
+#if defined(OC_X86_ASM)
+
+/*A table of constants used by the MMX routines.*/
+const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,
+ OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,
+ OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,
+ OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,
+ OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,
+ OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,
+ OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1
+};
+
+
+/*Performs the first three stages of the iDCT.
+ xmm2, xmm6, xmm3, and xmm5 must contain the corresponding rows of the input
+ (accessed in that order).
+ The remaining rows must be in _x at their corresponding locations.
+ On output, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
+ contain rows 4 through 7.*/
+#define OC_IDCT_8x8_ABC(_x) \
+ "#OC_IDCT_8x8_ABC\n\t" \
+ /*Stage 1:*/ \
+ /*2-3 rotation by 6pi/16. \
+ xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \
+ "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \
+ "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \
+ "movdqa %%xmm1,%%xmm0\n\t" \
+ "pmulhw %%xmm2,%%xmm1\n\t" \
+ "movdqa %%xmm4,%%xmm7\n\t" \
+ "pmulhw %%xmm6,%%xmm0\n\t" \
+ "pmulhw %%xmm2,%%xmm7\n\t" \
+ "pmulhw %%xmm6,%%xmm4\n\t" \
+ "paddw %%xmm6,%%xmm0\n\t" \
+ "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \
+ "paddw %%xmm1,%%xmm2\n\t" \
+ "psubw %%xmm0,%%xmm7\n\t" \
+ "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+ "paddw %%xmm4,%%xmm2\n\t" \
+ "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \
+ "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+ /*5-6 rotation by 3pi/16. \
+ xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \
+ "movdqa %%xmm4,%%xmm2\n\t" \
+ "movdqa %%xmm6,%%xmm1\n\t" \
+ "pmulhw %%xmm3,%%xmm4\n\t" \
+ "pmulhw %%xmm5,%%xmm1\n\t" \
+ "pmulhw %%xmm3,%%xmm6\n\t" \
+ "pmulhw %%xmm5,%%xmm2\n\t" \
+ "paddw %%xmm3,%%xmm4\n\t" \
+ "paddw %%xmm5,%%xmm3\n\t" \
+ "paddw %%xmm6,%%xmm3\n\t" \
+ "movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \
+ "paddw %%xmm5,%%xmm1\n\t" \
+ "movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \
+ "paddw %%xmm3,%%xmm2\n\t" \
+ "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
+ "psubw %%xmm4,%%xmm1\n\t" \
+ "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \
+ /*4-7 rotation by 7pi/16. \
+ xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \
+ "movdqa %%xmm3,%%xmm0\n\t" \
+ "movdqa %%xmm4,%%xmm7\n\t" \
+ "pmulhw %%xmm5,%%xmm3\n\t" \
+ "pmulhw %%xmm5,%%xmm7\n\t" \
+ "pmulhw %%xmm6,%%xmm4\n\t" \
+ "pmulhw %%xmm6,%%xmm0\n\t" \
+ "paddw %%xmm6,%%xmm4\n\t" \
+ "movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \
+ "paddw %%xmm5,%%xmm7\n\t" \
+ "psubw %%xmm4,%%xmm3\n\t" \
+ "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
+ "paddw %%xmm7,%%xmm0\n\t" \
+ "movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \
+ /*0-1 butterfly. \
+ xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \
+ "paddw %%xmm7,%%xmm6\n\t" \
+ "movdqa %%xmm4,%%xmm5\n\t" \
+ "pmulhw %%xmm6,%%xmm4\n\t" \
+ "paddw %%xmm7,%%xmm7\n\t" \
+ "psubw %%xmm6,%%xmm7\n\t" \
+ "paddw %%xmm6,%%xmm4\n\t" \
+ /*Stage 2:*/ \
+ /*4-5 butterfly: xmm3=t[4], xmm1=t[5] \
+ 7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \
+ "movdqa %%xmm3,%%xmm6\n\t" \
+ "paddw %%xmm1,%%xmm3\n\t" \
+ "psubw %%xmm1,%%xmm6\n\t" \
+ "movdqa %%xmm5,%%xmm1\n\t" \
+ "pmulhw %%xmm7,%%xmm5\n\t" \
+ "paddw %%xmm7,%%xmm5\n\t" \
+ "movdqa %%xmm0,%%xmm7\n\t" \
+ "paddw %%xmm2,%%xmm0\n\t" \
+ "psubw %%xmm2,%%xmm7\n\t" \
+ "movdqa %%xmm1,%%xmm2\n\t" \
+ "pmulhw %%xmm6,%%xmm1\n\t" \
+ "pmulhw %%xmm7,%%xmm2\n\t" \
+ "paddw %%xmm6,%%xmm1\n\t" \
+ "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
+ "paddw %%xmm7,%%xmm2\n\t" \
+ "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
+ /*Stage 3: \
+ 6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
+ 0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
+ 1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
+ "paddw %%xmm2,%%xmm1\n\t" \
+ "paddw %%xmm5,%%xmm6\n\t" \
+ "paddw %%xmm4,%%xmm7\n\t" \
+ "paddw %%xmm2,%%xmm2\n\t" \
+ "paddw %%xmm4,%%xmm4\n\t" \
+ "paddw %%xmm5,%%xmm5\n\t" \
+ "psubw %%xmm1,%%xmm2\n\t" \
+ "psubw %%xmm7,%%xmm4\n\t" \
+ "psubw %%xmm6,%%xmm5\n\t" \
+
+/*Performs the last stage of the iDCT.
+ On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
+ contain rows 4 through 7.
+ On output, xmm0 through xmm7 contain the corresponding rows.*/
+#define OC_IDCT_8x8_D \
+ "#OC_IDCT_8x8_D\n\t" \
+ /*Stage 4: \
+ 0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
+ 1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
+ 2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
+ 3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
+ "psubw %%xmm0,%%xmm7\n\t" \
+ "psubw %%xmm1,%%xmm6\n\t" \
+ "psubw %%xmm2,%%xmm5\n\t" \
+ "psubw %%xmm3,%%xmm4\n\t" \
+ "paddw %%xmm0,%%xmm0\n\t" \
+ "paddw %%xmm1,%%xmm1\n\t" \
+ "paddw %%xmm2,%%xmm2\n\t" \
+ "paddw %%xmm3,%%xmm3\n\t" \
+ "paddw %%xmm7,%%xmm0\n\t" \
+ "paddw %%xmm6,%%xmm1\n\t" \
+ "paddw %%xmm5,%%xmm2\n\t" \
+ "paddw %%xmm4,%%xmm3\n\t" \
+
+/*Performs the last stage of the iDCT.
+ On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
+ contain rows 4 through 7.
+ On output, xmm0 through xmm7 contain the corresponding rows.*/
+#define OC_IDCT_8x8_D_STORE \
+ "#OC_IDCT_8x8_D_STORE\n\t" \
+ /*Stage 4: \
+ 0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
+ 1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
+ 2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
+ 3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
+ "psubw %%xmm3,%%xmm4\n\t" \
+ "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
+ "movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \
+ "psubw %%xmm0,%%xmm7\n\t" \
+ "psubw %%xmm1,%%xmm6\n\t" \
+ "psubw %%xmm2,%%xmm5\n\t" \
+ "paddw %%xmm4,%%xmm7\n\t" \
+ "paddw %%xmm4,%%xmm6\n\t" \
+ "paddw %%xmm4,%%xmm5\n\t" \
+ "paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \
+ "paddw %%xmm0,%%xmm0\n\t" \
+ "paddw %%xmm1,%%xmm1\n\t" \
+ "paddw %%xmm2,%%xmm2\n\t" \
+ "paddw %%xmm3,%%xmm3\n\t" \
+ "paddw %%xmm7,%%xmm0\n\t" \
+ "paddw %%xmm6,%%xmm1\n\t" \
+ "psraw $4,%%xmm0\n\t" \
+ "paddw %%xmm5,%%xmm2\n\t" \
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \
+ "psraw $4,%%xmm1\n\t" \
+ "paddw %%xmm4,%%xmm3\n\t" \
+ "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \
+ "psraw $4,%%xmm2\n\t" \
+ "movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \
+ "psraw $4,%%xmm3\n\t" \
+ "movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \
+ "psraw $4,%%xmm4\n\t" \
+ "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
+ "psraw $4,%%xmm5\n\t" \
+ "movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \
+ "psraw $4,%%xmm6\n\t" \
+ "movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \
+ "psraw $4,%%xmm7\n\t" \
+ "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \
+
+static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+ OC_ALIGN16(ogg_int16_t buf[16]);
+ int i;
+ /*This routine accepts an 8x8 matrix pre-transposed.*/
+ __asm__ __volatile__(
+ /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/
+ "movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t"
+ "movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t"
+ "movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t"
+ "movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t"
+ OC_IDCT_8x8_ABC(x)
+ OC_IDCT_8x8_D
+ OC_TRANSPOSE_8x8
+ /*Clear out rows 0, 1, 4, and 7 for the first stage of the iDCT.*/
+ "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t"
+ "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t"
+ "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t"
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t"
+ OC_IDCT_8x8_ABC(y)
+ OC_IDCT_8x8_D_STORE
+ :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
+ [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
+ :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
+ [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128))
+ );
+ __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::);
+ /*Clear input data for next block (decoder only).*/
+ for(i=0;i<2;i++){
+ __asm__ __volatile__(
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+ :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32))
+ );
+ }
+}
+
+/*For the first step of the 10-coefficient version of the 8x8 iDCT, we only
+ need to work with four columns at a time.
+ Doing this in MMX is faster on processors with a 64-bit data path.*/
+#define OC_IDCT_8x8_10_MMX \
+ "#OC_IDCT_8x8_10_MMX\n\t" \
+ /*Stage 1:*/ \
+ /*2-3 rotation by 6pi/16. \
+ mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \
+ "movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
+ "movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \
+ "pmulhw %%mm2,%%mm6\n\t" \
+ "pmulhw %%mm2,%%mm7\n\t" \
+ "movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \
+ "paddw %%mm6,%%mm2\n\t" \
+ "movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+ "movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \
+ "movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+ /*5-6 rotation by 3pi/16. \
+ mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \
+ "pmulhw %%mm3,%%mm5\n\t" \
+ "pmulhw %%mm3,%%mm2\n\t" \
+ "movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \
+ "paddw %%mm3,%%mm5\n\t" \
+ "paddw %%mm3,%%mm2\n\t" \
+ "movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
+ /*4-7 rotation by 7pi/16. \
+ mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \
+ "pmulhw %%mm1,%%mm3\n\t" \
+ "pmulhw %%mm1,%%mm7\n\t" \
+ "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
+ "movq %%mm3,%%mm6\n\t" \
+ "paddw %%mm1,%%mm7\n\t" \
+ /*0-1 butterfly. \
+ mm4=C4, mm0=X0, X4=0.*/ \
+ /*Stage 2:*/ \
+ /*4-5 butterfly: mm3=t[4], mm5=t[5] \
+ 7-6 butterfly: mm2=t[6], mm7=t[7]*/ \
+ "psubw %%mm5,%%mm3\n\t" \
+ "paddw %%mm5,%%mm6\n\t" \
+ "movq %%mm4,%%mm1\n\t" \
+ "pmulhw %%mm0,%%mm4\n\t" \
+ "paddw %%mm0,%%mm4\n\t" \
+ "movq %%mm7,%%mm0\n\t" \
+ "movq %%mm4,%%mm5\n\t" \
+ "paddw %%mm2,%%mm0\n\t" \
+ "psubw %%mm2,%%mm7\n\t" \
+ "movq %%mm1,%%mm2\n\t" \
+ "pmulhw %%mm6,%%mm1\n\t" \
+ "pmulhw %%mm7,%%mm2\n\t" \
+ "paddw %%mm6,%%mm1\n\t" \
+ "movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \
+ "paddw %%mm7,%%mm2\n\t" \
+ "movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \
+ /*Stage 3: \
+ 6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \
+ 0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \
+ 1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \
+ "paddw %%mm2,%%mm1\n\t" \
+ "paddw %%mm5,%%mm6\n\t" \
+ "paddw %%mm4,%%mm7\n\t" \
+ "paddw %%mm2,%%mm2\n\t" \
+ "paddw %%mm4,%%mm4\n\t" \
+ "paddw %%mm5,%%mm5\n\t" \
+ "psubw %%mm1,%%mm2\n\t" \
+ "psubw %%mm7,%%mm4\n\t" \
+ "psubw %%mm6,%%mm5\n\t" \
+ /*Stage 4: \
+ 0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \
+ 1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \
+ 2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \
+ 3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \
+ "psubw %%mm0,%%mm7\n\t" \
+ "psubw %%mm1,%%mm6\n\t" \
+ "psubw %%mm2,%%mm5\n\t" \
+ "psubw %%mm3,%%mm4\n\t" \
+ "paddw %%mm0,%%mm0\n\t" \
+ "paddw %%mm1,%%mm1\n\t" \
+ "paddw %%mm2,%%mm2\n\t" \
+ "paddw %%mm3,%%mm3\n\t" \
+ "paddw %%mm7,%%mm0\n\t" \
+ "paddw %%mm6,%%mm1\n\t" \
+ "paddw %%mm5,%%mm2\n\t" \
+ "paddw %%mm4,%%mm3\n\t" \
+
+#define OC_IDCT_8x8_10_ABC \
+ "#OC_IDCT_8x8_10_ABC\n\t" \
+ /*Stage 1:*/ \
+ /*2-3 rotation by 6pi/16. \
+ xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \
+ "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \
+ "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \
+ "pmulhw %%xmm2,%%xmm6\n\t" \
+ "pmulhw %%xmm2,%%xmm7\n\t" \
+ "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \
+ "paddw %%xmm6,%%xmm2\n\t" \
+ "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+ "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \
+ "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+ /*5-6 rotation by 3pi/16. \
+ xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \
+ "pmulhw %%xmm3,%%xmm5\n\t" \
+ "pmulhw %%xmm3,%%xmm2\n\t" \
+ "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \
+ "paddw %%xmm3,%%xmm5\n\t" \
+ "paddw %%xmm3,%%xmm2\n\t" \
+ "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
+ /*4-7 rotation by 7pi/16. \
+ xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \
+ "pmulhw %%xmm1,%%xmm3\n\t" \
+ "pmulhw %%xmm1,%%xmm7\n\t" \
+ "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
+ "movdqa %%xmm3,%%xmm6\n\t" \
+ "paddw %%xmm1,%%xmm7\n\t" \
+ /*0-1 butterfly. \
+ xmm4=C4, xmm0=X0, X4=0.*/ \
+ /*Stage 2:*/ \
+ /*4-5 butterfly: xmm3=t[4], xmm5=t[5] \
+ 7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \
+ "psubw %%xmm5,%%xmm3\n\t" \
+ "paddw %%xmm5,%%xmm6\n\t" \
+ "movdqa %%xmm4,%%xmm1\n\t" \
+ "pmulhw %%xmm0,%%xmm4\n\t" \
+ "paddw %%xmm0,%%xmm4\n\t" \
+ "movdqa %%xmm7,%%xmm0\n\t" \
+ "movdqa %%xmm4,%%xmm5\n\t" \
+ "paddw %%xmm2,%%xmm0\n\t" \
+ "psubw %%xmm2,%%xmm7\n\t" \
+ "movdqa %%xmm1,%%xmm2\n\t" \
+ "pmulhw %%xmm6,%%xmm1\n\t" \
+ "pmulhw %%xmm7,%%xmm2\n\t" \
+ "paddw %%xmm6,%%xmm1\n\t" \
+ "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
+ "paddw %%xmm7,%%xmm2\n\t" \
+ "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
+ /*Stage 3: \
+ 6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
+ 0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
+ 1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
+ "paddw %%xmm2,%%xmm1\n\t" \
+ "paddw %%xmm5,%%xmm6\n\t" \
+ "paddw %%xmm4,%%xmm7\n\t" \
+ "paddw %%xmm2,%%xmm2\n\t" \
+ "paddw %%xmm4,%%xmm4\n\t" \
+ "paddw %%xmm5,%%xmm5\n\t" \
+ "psubw %%xmm1,%%xmm2\n\t" \
+ "psubw %%xmm7,%%xmm4\n\t" \
+ "psubw %%xmm6,%%xmm5\n\t" \
+
+static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+ OC_ALIGN16(ogg_int16_t buf[16]);
+ /*This routine accepts an 8x8 matrix pre-transposed.*/
+ __asm__ __volatile__(
+ "movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t"
+ "movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t"
+ "movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t"
+ "movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t"
+ OC_IDCT_8x8_10_MMX
+ OC_TRANSPOSE_8x4_MMX2SSE
+ OC_IDCT_8x8_10_ABC
+ OC_IDCT_8x8_D_STORE
+ :[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16)),
+ [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
+ :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
+ [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128))
+ );
+ /*Clear input data for next block (decoder only).*/
+ __asm__ __volatile__(
+ "pxor %%mm0,%%mm0\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+ :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
+ );
+}
+
+/*Performs an inverse 8x8 Type-II DCT transform.
+ The input is assumed to be scaled by a factor of 4 relative to orthonormal
+ version of the transform.*/
+void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
+ /*_last_zzi is subtly different from an actual count of the number of
+ coefficients we decoded for this block.
+ It contains the value of zzi BEFORE the final token in the block was
+ decoded.
+ In most cases this is an EOB token (the continuation of an EOB run from a
+ previous block counts), and so this is the same as the coefficient count.
+ However, in the case that the last token was NOT an EOB token, but filled
+ the block up with exactly 64 coefficients, _last_zzi will be less than 64.
+ Provided the last token was not a pure zero run, the minimum value it can
+ be is 46, and so that doesn't affect any of the cases in this routine.
+ However, if the last token WAS a pure zero run of length 63, then _last_zzi
+ will be 1 while the number of coefficients decoded is 64.
+ Thus, we will trigger the following special case, where the real
+ coefficient count would not.
+ Note also that a zero run of length 64 will give _last_zzi a value of 0,
+ but we still process the DC coefficient, which might have a non-zero value
+ due to DC prediction.
+ Although convoluted, this is arguably the correct behavior: it allows us to
+ use a smaller transform when the block ends with a long zero run instead
+ of a normal EOB token.
+ It could be smarter... multiple separate zero runs at the end of a block
+ will fool it, but an encoder that generates these really deserves what it
+ gets.
+ Needless to say we inherited this approach from VP3.*/
+ /*Then perform the iDCT.*/
+ if(_last_zzi<=10)oc_idct8x8_10_sse2(_y,_x);
+ else oc_idct8x8_slow_sse2(_y,_x);
+}
+
+#endif
diff --git a/media/libtheora/lib/x86/sse2trans.h b/media/libtheora/lib/x86/sse2trans.h
new file mode 100644
index 0000000000..e76da5140b
--- /dev/null
+++ b/media/libtheora/lib/x86/sse2trans.h
@@ -0,0 +1,242 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_x86_sse2trans_H)
+# define _x86_sse2trans_H (1)
+# include "x86int.h"
+
+# if defined(OC_X86_64_ASM)
+/*On x86-64 we can transpose in-place without spilling registers.
+ By clever choices of the order to apply the butterflies and the order of
+ their outputs, we can take the rows in order and output the columns in order
+ without any extra operations and using just one temporary register.*/
+# define OC_TRANSPOSE_8x8 \
+ "#OC_TRANSPOSE_8x8\n\t" \
+ "movdqa %%xmm4,%%xmm8\n\t" \
+ /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
+ "punpcklwd %%xmm5,%%xmm4\n\t" \
+ /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
+ "punpckhwd %%xmm5,%%xmm8\n\t" \
+ /*xmm5 is free.*/ \
+ "movdqa %%xmm0,%%xmm5\n\t" \
+ /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
+ "punpcklwd %%xmm1,%%xmm0\n\t" \
+ /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
+ "punpckhwd %%xmm1,%%xmm5\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm6,%%xmm1\n\t" \
+ /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
+ "punpcklwd %%xmm7,%%xmm6\n\t" \
+ /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
+ "punpckhwd %%xmm7,%%xmm1\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm2,%%xmm7\n\t" \
+ /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "punpckhwd %%xmm3,%%xmm2\n\t" \
+ /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
+ "punpcklwd %%xmm3,%%xmm7\n\t" \
+ /*xmm3 is free.*/ \
+ "movdqa %%xmm0,%%xmm3\n\t" \
+ /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
+ "punpckldq %%xmm7,%%xmm0\n\t" \
+ /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
+ "punpckhdq %%xmm7,%%xmm3\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm5,%%xmm7\n\t" \
+ /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
+ "punpckldq %%xmm2,%%xmm5\n\t" \
+ /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
+ "punpckhdq %%xmm2,%%xmm7\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm4,%%xmm2\n\t" \
+ /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
+ "punpckhdq %%xmm6,%%xmm4\n\t" \
+ /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
+ "punpckldq %%xmm6,%%xmm2\n\t" \
+ /*xmm6 is free.*/ \
+ "movdqa %%xmm8,%%xmm6\n\t" \
+ /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
+ "punpckldq %%xmm1,%%xmm6\n\t" \
+ /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "punpckhdq %%xmm1,%%xmm8\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm0,%%xmm1\n\t" \
+ /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "punpcklqdq %%xmm2,%%xmm0\n\t" \
+ /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
+ "punpckhqdq %%xmm2,%%xmm1\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm3,%%xmm2\n\t" \
+ /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
+ "punpckhqdq %%xmm4,%%xmm3\n\t" \
+ /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
+ "punpcklqdq %%xmm4,%%xmm2\n\t" \
+ /*xmm4 is free.*/ \
+ "movdqa %%xmm5,%%xmm4\n\t" \
+ /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
+ "punpckhqdq %%xmm6,%%xmm5\n\t" \
+ /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
+ "punpcklqdq %%xmm6,%%xmm4\n\t" \
+ /*xmm6 is free.*/ \
+ "movdqa %%xmm7,%%xmm6\n\t" \
+ /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
+ "punpckhqdq %%xmm8,%%xmm7\n\t" \
+ /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
+ "punpcklqdq %%xmm8,%%xmm6\n\t" \
+ /*xmm8 is free.*/ \
+
+# else
+/*Otherwise, we need to spill some values to %[buf] temporarily.
+ Again, the butterflies are carefully arranged to get the columns to come out
+ in order, minimizing register spills and maximizing the delay between a load
+ and when the value loaded is actually used.*/
+# define OC_TRANSPOSE_8x8 \
+ "#OC_TRANSPOSE_8x8\n\t" \
+ /*buf[0] = a7 a6 a5 a4 a3 a2 a1 a0*/ \
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+ /*xmm0 is free.*/ \
+ "movdqa %%xmm2,%%xmm0\n\t" \
+ /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "punpckhwd %%xmm3,%%xmm2\n\t" \
+ /*xmm0 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
+ "punpcklwd %%xmm3,%%xmm0\n\t" \
+ /*xmm3 = a7 a6 a5 a4 a3 a2 a1 a0*/ \
+ "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm3\n\t" \
+ /*buf[1] = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm6,%%xmm2\n\t" \
+ /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
+ "punpcklwd %%xmm7,%%xmm6\n\t" \
+ /*xmm2 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
+ "punpckhwd %%xmm7,%%xmm2\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm4,%%xmm7\n\t" \
+ /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
+ "punpcklwd %%xmm5,%%xmm4\n\t" \
+ /*xmm7 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
+ "punpckhwd %%xmm5,%%xmm7\n\t" \
+ /*xmm5 is free.*/ \
+ "movdqa %%xmm3,%%xmm5\n\t" \
+ /*xmm3 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
+ "punpcklwd %%xmm1,%%xmm3\n\t" \
+ /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
+ "punpckhwd %%xmm1,%%xmm5\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm7,%%xmm1\n\t" \
+ /*xmm7 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
+ "punpckldq %%xmm2,%%xmm7\n\t" \
+ /*xmm1 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "punpckhdq %%xmm2,%%xmm1\n\t" \
+ /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm2\n\t" \
+ /*buf[0] = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "movdqa %%xmm1,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm3,%%xmm1\n\t" \
+ /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
+ "punpckhdq %%xmm0,%%xmm3\n\t" \
+ /*xmm1 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
+ "punpckldq %%xmm0,%%xmm1\n\t" \
+ /*xmm0 is free.*/ \
+ "movdqa %%xmm4,%%xmm0\n\t" \
+ /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
+ "punpckhdq %%xmm6,%%xmm4\n\t" \
+ /*xmm0 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
+ "punpckldq %%xmm6,%%xmm0\n\t" \
+ /*xmm6 is free.*/ \
+ "movdqa %%xmm5,%%xmm6\n\t" \
+ /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
+ "punpckldq %%xmm2,%%xmm5\n\t" \
+ /*xmm6 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
+ "punpckhdq %%xmm2,%%xmm6\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm1,%%xmm2\n\t" \
+ /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
+ "punpckhqdq %%xmm0,%%xmm1\n\t" \
+ /*xmm2 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "punpcklqdq %%xmm0,%%xmm2\n\t" \
+ /*xmm0 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
+ /*buf[1] = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm3,%%xmm2\n\t" \
+ /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
+ "punpckhqdq %%xmm4,%%xmm3\n\t" \
+ /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
+ "punpcklqdq %%xmm4,%%xmm2\n\t" \
+ /*xmm4 is free.*/ \
+ "movdqa %%xmm5,%%xmm4\n\t" \
+ /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
+ "punpckhqdq %%xmm7,%%xmm5\n\t" \
+ /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
+ "punpcklqdq %%xmm7,%%xmm4\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm6,%%xmm7\n\t" \
+ /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
+ "punpcklqdq %%xmm0,%%xmm6\n\t" \
+ /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
+ "punpckhqdq %%xmm0,%%xmm7\n\t" \
+ /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm0\n\t" \
+
+# endif
+
+/*Transpose 4 values in each of 8 MMX registers into 8 values in the first
+ four SSE registers.
+ No need to be clever here; we have plenty of room.*/
+# define OC_TRANSPOSE_8x4_MMX2SSE \
+ "#OC_TRANSPOSE_8x4_MMX2SSE\n\t" \
+ "movq2dq %%mm0,%%xmm0\n\t" \
+ "movq2dq %%mm1,%%xmm1\n\t" \
+ /*xmmA = b3 a3 b2 a2 b1 a1 b0 a0*/ \
+ "punpcklwd %%xmm1,%%xmm0\n\t" \
+ "movq2dq %%mm2,%%xmm3\n\t" \
+ "movq2dq %%mm3,%%xmm2\n\t" \
+ /*xmmC = d3 c3 d2 c2 d1 c1 d0 c0*/ \
+ "punpcklwd %%xmm2,%%xmm3\n\t" \
+ "movq2dq %%mm4,%%xmm4\n\t" \
+ "movq2dq %%mm5,%%xmm5\n\t" \
+ /*xmmE = f3 e3 f2 e2 f1 e1 f0 e0*/ \
+ "punpcklwd %%xmm5,%%xmm4\n\t" \
+ "movq2dq %%mm6,%%xmm7\n\t" \
+ "movq2dq %%mm7,%%xmm6\n\t" \
+ /*xmmG = h3 g3 h2 g2 h1 g1 h0 g0*/ \
+ "punpcklwd %%xmm6,%%xmm7\n\t" \
+ "movdqa %%xmm0,%%xmm2\n\t" \
+ /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
+ "punpckldq %%xmm3,%%xmm0\n\t" \
+ /*xmm2 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
+ "punpckhdq %%xmm3,%%xmm2\n\t" \
+ "movdqa %%xmm4,%%xmm5\n\t" \
+ /*xmm4 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
+ "punpckldq %%xmm7,%%xmm4\n\t" \
+ /*xmm3 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
+ "punpckhdq %%xmm7,%%xmm5\n\t" \
+ "movdqa %%xmm0,%%xmm1\n\t" \
+ /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "punpcklqdq %%xmm4,%%xmm0\n\t" \
+ /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
+ "punpckhqdq %%xmm4,%%xmm1\n\t" \
+ "movdqa %%xmm2,%%xmm3\n\t" \
+ /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
+ "punpcklqdq %%xmm5,%%xmm2\n\t" \
+ /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
+ "punpckhqdq %%xmm5,%%xmm3\n\t" \
+
+#endif
diff --git a/media/libtheora/lib/x86/x86cpu.c b/media/libtheora/lib/x86/x86cpu.c
new file mode 100644
index 0000000000..49fd76d0ac
--- /dev/null
+++ b/media/libtheora/lib/x86/x86cpu.c
@@ -0,0 +1,182 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ CPU capability detection for x86 processors.
+ Originally written by Rudolf Marek.
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#include "x86cpu.h"
+
+#if !defined(OC_X86_ASM)
+ogg_uint32_t oc_cpu_flags_get(void){
+ return 0;
+}
+#else
+# if defined(__amd64__)||defined(__x86_64__)
+/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when
+ compiling with -fPIC.*/
+# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+ __asm__ __volatile__( \
+ "cpuid\n\t" \
+ :[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
+ :"a"(_op) \
+ :"cc" \
+ )
+# else
+/*On x86-32, not so much.*/
+# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+ __asm__ __volatile__( \
+ "xchgl %%ebx,%[ebx]\n\t" \
+ "cpuid\n\t" \
+ "xchgl %%ebx,%[ebx]\n\t" \
+ :[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
+ :"a"(_op) \
+ :"cc" \
+ )
+# endif
+
+static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+ ogg_uint32_t flags;
+ /*If there isn't even MMX, give up.*/
+ if(!(_edx&0x00800000))return 0;
+ flags=OC_CPU_X86_MMX;
+ if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
+ if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
+ if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
+ if(_ecx&0x00000100)flags|=OC_CPU_X86_SSSE3;
+ if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
+ if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
+ return flags;
+}
+
+static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+ ogg_uint32_t flags;
+ /*If there isn't even MMX, give up.*/
+ if(!(_edx&0x00800000))return 0;
+ flags=OC_CPU_X86_MMX;
+ if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT;
+ if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
+ if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT;
+ if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A;
+ if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5;
+ return flags;
+}
+
+ogg_uint32_t oc_cpu_flags_get(void){
+ ogg_uint32_t flags;
+ ogg_uint32_t eax;
+ ogg_uint32_t ebx;
+ ogg_uint32_t ecx;
+ ogg_uint32_t edx;
+# if !defined(__amd64__)&&!defined(__x86_64__)
+ /*Not all x86-32 chips support cpuid, so we have to check.*/
+ __asm__ __volatile__(
+ "pushfl\n\t"
+ "pushfl\n\t"
+ "popl %[a]\n\t"
+ "movl %[a],%[b]\n\t"
+ "xorl $0x200000,%[a]\n\t"
+ "pushl %[a]\n\t"
+ "popfl\n\t"
+ "pushfl\n\t"
+ "popl %[a]\n\t"
+ "popfl\n\t"
+ :[a]"=r"(eax),[b]"=r"(ebx)
+ :
+ :"cc"
+ );
+ /*No cpuid.*/
+ if(eax==ebx)return 0;
+# endif
+ cpuid(0,eax,ebx,ecx,edx);
+ /* l e t n I e n i u n e G*/
+ if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
+ /* 6 8 x M T e n i u n e G*/
+ ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
+ int family;
+ int model;
+ /*Intel, Transmeta (tested with Crusoe TM5800):*/
+ cpuid(1,eax,ebx,ecx,edx);
+ flags=oc_parse_intel_flags(edx,ecx);
+ family=(eax>>8)&0xF;
+ model=(eax>>4)&0xF;
+ /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
+ unit, so don't use it.*/
+ if(family==6&&(model==9||model==13||model==14)){
+ flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
+ }
+ }
+ /* D M A c i t n e h t u A*/
+ else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
+ /* C S N y b e d o e G*/
+ ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
+ /*AMD, Geode:*/
+ cpuid(0x80000000,eax,ebx,ecx,edx);
+ if(eax<0x80000001)flags=0;
+ else{
+ cpuid(0x80000001,eax,ebx,ecx,edx);
+ flags=oc_parse_amd_flags(edx,ecx);
+ }
+ /*Also check for SSE.*/
+ cpuid(1,eax,ebx,ecx,edx);
+ flags|=oc_parse_intel_flags(edx,ecx);
+ }
+ /*Technically some VIA chips can be configured in the BIOS to return any
+ string here the user wants.
+ There is a special detection method that can be used to identify such
+ processors, but in my opinion, if the user really wants to change it, they
+ deserve what they get.*/
+ /* s l u a H r u a t n e C*/
+ else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
+ /*VIA:*/
+ /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
+ chips (thanks to the engineers from Centaur Technology who provided it).
+ These chips support Intel-like cpuid info.
+ The C3-2 (Nehemiah) cores appear to, as well.*/
+ cpuid(1,eax,ebx,ecx,edx);
+ flags=oc_parse_intel_flags(edx,ecx);
+ if(eax>=0x80000001){
+ /*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
+ We need to check this even if the Intel test succeeds to pick up 3DNow!
+ support on these processors.
+ Unlike actual AMD processors, we cannot _rely_ on this info, since
+ some cores (e.g., the 693 stepping of the Nehemiah) claim to support
+ this function, yet return edx=0, despite the Intel test indicating
+ MMX support.
+ Therefore the features detected here are strictly added to those
+ detected by the Intel test.*/
+ /*TODO: How about earlier chips?*/
+ cpuid(0x80000001,eax,ebx,ecx,edx);
+ /*Note: As of the C7, this function returns Intel-style extended feature
+ flags, not AMD-style.
+ Currently, this only defines bits 11, 20, and 29 (0x20100800), which
+ do not conflict with any of the AMD flags we inspect.
+ For the remaining bits, Intel tells us, "Do not count on their value",
+ but VIA assures us that they will all be zero (at least on the C7 and
+ Isaiah chips).
+ In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
+ (0xC0C00000) for something else, we will have to add code to detect
+ the model to decide when it is appropriate to inspect them.*/
+ flags|=oc_parse_amd_flags(edx,ecx);
+ }
+ }
+ else{
+ /*Implement me.*/
+ flags=0;
+ }
+ return flags;
+}
+#endif
diff --git a/media/libtheora/lib/x86/x86cpu.h b/media/libtheora/lib/x86/x86cpu.h
new file mode 100644
index 0000000000..e0192d52e2
--- /dev/null
+++ b/media/libtheora/lib/x86/x86cpu.h
@@ -0,0 +1,36 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#if !defined(_x86_x86cpu_H)
+# define _x86_x86cpu_H (1)
+#include "../internal.h"
+
+#define OC_CPU_X86_MMX (1<<0)
+#define OC_CPU_X86_3DNOW (1<<1)
+#define OC_CPU_X86_3DNOWEXT (1<<2)
+#define OC_CPU_X86_MMXEXT (1<<3)
+#define OC_CPU_X86_SSE (1<<4)
+#define OC_CPU_X86_SSE2 (1<<5)
+#define OC_CPU_X86_PNI (1<<6)
+#define OC_CPU_X86_SSSE3 (1<<7)
+#define OC_CPU_X86_SSE4_1 (1<<8)
+#define OC_CPU_X86_SSE4_2 (1<<9)
+#define OC_CPU_X86_SSE4A (1<<10)
+#define OC_CPU_X86_SSE5 (1<<11)
+
+ogg_uint32_t oc_cpu_flags_get(void);
+
+#endif
diff --git a/media/libtheora/lib/x86/x86int.h b/media/libtheora/lib/x86/x86int.h
new file mode 100644
index 0000000000..ceb2dbb0ec
--- /dev/null
+++ b/media/libtheora/lib/x86/x86int.h
@@ -0,0 +1,122 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#if !defined(_x86_x86int_H)
+# define _x86_x86int_H (1)
+# include "../internal.h"
+
+# if defined(OC_X86_ASM)
+# define oc_state_accel_init oc_state_accel_init_x86
+# if defined(OC_X86_64_ASM)
+/*x86-64 guarantees SIMD support up through at least SSE2.
+ If the best routine we have available only needs SSE2 (which at the moment
+ covers all of them), then we can avoid runtime detection and the indirect
+ call.*/
+# define oc_frag_copy(_state,_dst,_src,_ystride) \
+ oc_frag_copy_mmx(_dst,_src,_ystride)
+# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs) \
+ oc_frag_copy_list_mmx(_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs)
+# define oc_frag_recon_intra(_state,_dst,_ystride,_residue) \
+ oc_frag_recon_intra_mmx(_dst,_ystride,_residue)
+# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
+ oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue)
+# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
+ oc_frag_recon_inter2_mmx(_dst,_src1,_src2,_ystride,_residue)
+# define oc_idct8x8(_state,_y,_x,_last_zzi) \
+ oc_idct8x8_sse2(_y,_x,_last_zzi)
+# define oc_state_frag_recon oc_state_frag_recon_mmx
+# define oc_loop_filter_init(_state,_bv,_flimit) \
+ oc_loop_filter_init_mmxext(_bv,_flimit)
+# define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_mmxext
+# define oc_restore_fpu(_state) \
+ oc_restore_fpu_mmx()
+# else
+# define OC_STATE_USE_VTABLE (1)
+# endif
+# endif
+
+# include "../state.h"
+# include "x86cpu.h"
+
+/*Converts the expression in the argument to a string.*/
+#define OC_M2STR(_s) #_s
+
+/*Memory operands do not always include an offset.
+ To avoid warnings, we force an offset with %H (which adds 8).*/
+# if __GNUC_PREREQ(4,0)
+# define OC_MEM_OFFS(_offs,_name) \
+ OC_M2STR(_offs-8+%H[_name])
+# endif
+/*If your gcc version does't support %H, then you get to suffer the warnings.
+ Note that Apple's gas breaks on things like _offs+(%esp): it throws away the
+ whole offset, instead of substituting in 0 for the missing operand to +.*/
+# if !defined(OC_MEM_OFFS)
+# define OC_MEM_OFFS(_offs,_name) \
+ OC_M2STR(_offs+%[_name])
+# endif
+
+/*Declare an array operand with an exact size.
+ This tells gcc we're going to clobber this memory region, without having to
+ clobber all of "memory" and lets us access local buffers directly using the
+ stack pointer, without allocating a separate register to point to them.*/
+#define OC_ARRAY_OPERAND(_type,_ptr,_size) \
+ (*({ \
+ struct{_type array_value__[(_size)];} *array_addr__=(void *)(_ptr); \
+ array_addr__; \
+ }))
+
+/*Declare an array operand with an exact size.
+ This tells gcc we're going to clobber this memory region, without having to
+ clobber all of "memory" and lets us access local buffers directly using the
+ stack pointer, without allocating a separate register to point to them.*/
+#define OC_CONST_ARRAY_OPERAND(_type,_ptr,_size) \
+ (*({ \
+ const struct{_type array_value__[(_size)];} *array_addr__= \
+ (const void *)(_ptr); \
+ array_addr__; \
+ }))
+
+extern const unsigned short __attribute__((aligned(16))) OC_IDCT_CONSTS[64];
+
+void oc_state_accel_init_x86(oc_theora_state *_state);
+
+void oc_frag_copy_mmx(unsigned char *_dst,
+ const unsigned char *_src,int _ystride);
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_mmx(unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit);
+void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit);
+void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+void oc_restore_fpu_mmx(void);
+
+#endif
diff --git a/media/libtheora/lib/x86/x86state.c b/media/libtheora/lib/x86/x86state.c
new file mode 100644
index 0000000000..9f8bceb534
--- /dev/null
+++ b/media/libtheora/lib/x86/x86state.c
@@ -0,0 +1,97 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#include "x86int.h"
+
+#if defined(OC_X86_ASM)
+
+#if defined(OC_STATE_USE_VTABLE)
+/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
+ each quadrant of the destination.*/
+static const unsigned char OC_FZIG_ZAG_MMX[128]={
+ 0, 8, 1, 2, 9,16,24,17,
+ 10, 3,32,11,18,25, 4,12,
+ 5,26,19,40,33,34,41,48,
+ 27, 6,13,20,28,21,14, 7,
+ 56,49,42,35,43,50,57,36,
+ 15,22,29,30,23,44,37,58,
+ 51,59,38,45,52,31,60,53,
+ 46,39,47,54,61,62,55,63,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64
+};
+#endif
+
+/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into
+ the destination.*/
+static const unsigned char OC_FZIG_ZAG_SSE2[128]={
+ 0, 8, 1, 2, 9,16,24,17,
+ 10, 3, 4,11,18,25,32,40,
+ 33,26,19,12, 5, 6,13,20,
+ 27,34,41,48,56,49,42,35,
+ 28,21,14, 7,15,22,29,36,
+ 43,50,57,58,51,44,37,30,
+ 23,31,38,45,52,59,60,53,
+ 46,39,47,54,61,62,55,63,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64
+};
+
+void oc_state_accel_init_x86(oc_theora_state *_state){
+ oc_state_accel_init_c(_state);
+ _state->cpu_flags=oc_cpu_flags_get();
+# if defined(OC_STATE_USE_VTABLE)
+ if(_state->cpu_flags&OC_CPU_X86_MMX){
+ _state->opt_vtable.frag_copy=oc_frag_copy_mmx;
+ _state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx;
+ _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
+ _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
+ _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
+ _state->opt_vtable.idct8x8=oc_idct8x8_mmx;
+ _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
+ _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx;
+ _state->opt_vtable.state_loop_filter_frag_rows=
+ oc_state_loop_filter_frag_rows_mmx;
+ _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
+ _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
+ }
+ if(_state->cpu_flags&OC_CPU_X86_MMXEXT){
+ _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmxext;
+ _state->opt_vtable.state_loop_filter_frag_rows=
+ oc_state_loop_filter_frag_rows_mmxext;
+ }
+ if(_state->cpu_flags&OC_CPU_X86_SSE2){
+ _state->opt_vtable.idct8x8=oc_idct8x8_sse2;
+# endif
+ _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_SSE2;
+# if defined(OC_STATE_USE_VTABLE)
+ }
+# endif
+}
+#endif