Diffstat (limited to 'media/ffvpx/libavcodec/arm')
20 files changed, 3371 insertions, 0 deletions
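For orientation before the diff body: the first file added below, flacdsp_arm.S, hand-optimizes FLAC's 16-bit LPC synthesis filter. The following is a minimal scalar sketch of that computation, written against the ff_flac_lpc_16_arm prototype declared in flacdsp_init_arm.c further down; the helper name, the int64_t accumulator, and the assumption that coeffs[0] pairs with the oldest of the preceding 'order' samples are illustrative choices, not the FFmpeg reference implementation.

#include <stdint.h>

/* Illustrative scalar sketch of 16-bit FLAC LPC synthesis: each output
 * sample adds the quantized prediction (dot product of coeffs[] with the
 * preceding 'order' samples, shifted right by qlevel) to the residual
 * already stored in samples[]. */
static void flac_lpc_16_sketch(int32_t *samples, const int coeffs[32],
                               int order, int qlevel, int len)
{
    for (int i = order; i < len; i++) {
        int64_t pred = 0;                        /* prediction accumulator */
        for (int j = 0; j < order; j++)
            pred += (int64_t)coeffs[j] * samples[i - order + j];
        samples[i] += (int32_t)(pred >> qlevel); /* add quantized prediction */
    }
}

The assembly versions below specialize this loop for order 1, order 2, and the general case, keeping samples and coefficients in registers and unrolling by two where possible.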
diff --git a/media/ffvpx/libavcodec/arm/flacdsp_arm.S b/media/ffvpx/libavcodec/arm/flacdsp_arm.S new file mode 100644 index 0000000000..f8861c5967 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/flacdsp_arm.S @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2012 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function flac_lpc_16_1_arm + ldr r12, [sp] + push {r4, lr} + ldr r1, [r1] + subs r12, r12, #2 + ldr lr, [r0], #4 + beq 2f + it lt + poplt {r4, pc} +1: + mul r4, lr, r1 + ldm r0, {r2, lr} + add_sh r2, r2, r4, asr r3 + mul r4, r2, r1 + subs r12, r12, #2 + add_sh lr, lr, r4, asr r3 + stm r0!, {r2, lr} + bgt 1b + it lt + poplt {r4, pc} +2: + mul r4, lr, r1 + ldr r2, [r0] + add_sh r2, r2, r4, asr r3 + str r2, [r0] + pop {r4, pc} +endfunc + +function flac_lpc_16_2_arm + ldr r12, [sp] + subs r12, r12, r2 + it le + bxle lr + + push {r4-r9, lr} + ldm r0!, {r6, r7} + ldm r1, {r8, r9} + subs r12, r12, #1 + beq 2f +1: + mul r4, r6, r8 + mul r5, r7, r8 + mla r4, r7, r9, r4 + ldm r0, {r6, r7} + add_sh r6, r6, r4, asr r3 + mla r5, r6, r9, r5 + add_sh r7, r7, r5, asr r3 + stm r0!, {r6, r7} + subs r12, r12, #2 + bgt 1b + it lt + poplt {r4-r9, pc} +2: + mul r4, r6, r8 + mla r4, r7, r9, r4 + ldr r5, [r0] + add_sh r5, r5, r4, asr r3 + str r5, [r0] + pop {r4-r9, pc} +endfunc + +function ff_flac_lpc_16_arm, export=1 + cmp r2, #2 + blt flac_lpc_16_1_arm + beq flac_lpc_16_2_arm + + ldr r12, [sp] + subs r12, r12, r2 + it le + bxle lr + + push {r4-r9, lr} + + subs r12, r12, #1 + beq 3f +1: + sub lr, r2, #2 + mov r4, #0 + mov r5, #0 + + ldr r7, [r0], #4 + ldr r9, [r1], #4 +2: + mla r4, r7, r9, r4 + ldm r0!, {r6, r7} + mla r5, r6, r9, r5 + ldm r1!, {r8, r9} + mla r4, r6, r8, r4 + subs lr, lr, #2 + mla r5, r7, r8, r5 + bgt 2b + blt 6f + + mla r4, r7, r9, r4 + ldr r7, [r0], #4 + mla r5, r7, r9, r5 + ldr r9, [r1], #4 +6: + mla r4, r7, r9, r4 + ldm r0, {r6, r7} + add_sh r6, r6, r4, asr r3 + mla r5, r6, r9, r5 + add_sh r7, r7, r5, asr r3 + stm r0!, {r6, r7} + sub r0, r0, r2, lsl #2 + sub r1, r1, r2, lsl #2 + + subs r12, r12, #2 + bgt 1b + it lt + poplt {r4-r9, pc} +3: + mov r4, #0 +4: + ldr r5, [r1], #4 + ldr r6, [r0], #4 + mla r4, r5, r6, r4 + subs r2, r2, #1 + bgt 4b + ldr r5, [r0] + add_sh r5, r5, r4, asr r3 + str r5, [r0] + pop {r4-r9, pc} +endfunc diff --git a/media/ffvpx/libavcodec/arm/flacdsp_init_arm.c b/media/ffvpx/libavcodec/arm/flacdsp_init_arm.c new file mode 100644 index 0000000000..9962cc89f4 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/flacdsp_init_arm.c @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2012 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavcodec/flacdsp.h" + +void ff_flac_lpc_16_arm(int32_t *samples, const int coeffs[32], int order, + int qlevel, int len); + +av_cold void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int channels) +{ + c->lpc16 = ff_flac_lpc_16_arm; +} diff --git a/media/ffvpx/libavcodec/arm/idct.h b/media/ffvpx/libavcodec/arm/idct.h new file mode 100644 index 0000000000..6c79a69c5f --- /dev/null +++ b/media/ffvpx/libavcodec/arm/idct.h @@ -0,0 +1,41 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_IDCT_H +#define AVCODEC_ARM_IDCT_H + +#include <stddef.h> +#include <stdint.h> + +void ff_j_rev_dct_arm(int16_t *data); + +void ff_simple_idct_arm(int16_t *data); + +void ff_simple_idct_armv5te(int16_t *data); +void ff_simple_idct_put_armv5te(uint8_t *dest, ptrdiff_t line_size, int16_t *data); +void ff_simple_idct_add_armv5te(uint8_t *dest, ptrdiff_t line_size, int16_t *data); + +void ff_simple_idct_armv6(int16_t *data); +void ff_simple_idct_put_armv6(uint8_t *dest, ptrdiff_t line_size, int16_t *data); +void ff_simple_idct_add_armv6(uint8_t *dest, ptrdiff_t line_size, int16_t *data); + +void ff_simple_idct_neon(int16_t *data); +void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data); +void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data); + +#endif /* AVCODEC_ARM_IDCT_H */ diff --git a/media/ffvpx/libavcodec/arm/idctdsp_arm.S b/media/ffvpx/libavcodec/arm/idctdsp_arm.S new file mode 100644 index 0000000000..057eff9be8 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/idctdsp_arm.S @@ -0,0 +1,120 @@ +@ +@ ARMv4-optimized IDCT functions +@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp> +@ +@ This file is part of FFmpeg. +@ +@ FFmpeg is free software; you can redistribute it and/or +@ modify it under the terms of the GNU Lesser General Public +@ License as published by the Free Software Foundation; either +@ version 2.1 of the License, or (at your option) any later version. 
+@ +@ FFmpeg is distributed in the hope that it will be useful, +@ but WITHOUT ANY WARRANTY; without even the implied warranty of +@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +@ Lesser General Public License for more details. +@ +@ You should have received a copy of the GNU Lesser General Public +@ License along with FFmpeg; if not, write to the Free Software +@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +@ + +#include "config.h" +#include "libavutil/arm/asm.S" + +@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, ptrdiff_t stride) +function ff_add_pixels_clamped_arm, export=1, align=5 + push {r4-r10} + mov r10, #8 +1: + ldr r4, [r1] /* load dest */ + /* block[0] and block[1]*/ + ldrsh r5, [r0] + ldrsh r7, [r0, #2] + and r6, r4, #0xFF + and r8, r4, #0xFF00 + add r6, r6, r5 + add r8, r7, r8, lsr #8 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + it ne + movne r6, r5, lsr #24 + tst r8, #0x100 + it ne + movne r8, r7, lsr #24 + mov r9, r6 + ldrsh r5, [r0, #4] /* moved form [A] */ + orr r9, r9, r8, lsl #8 + /* block[2] and block[3] */ + /* [A] */ + ldrsh r7, [r0, #6] + and r6, r4, #0xFF0000 + and r8, r4, #0xFF000000 + add r6, r5, r6, lsr #16 + add r8, r7, r8, lsr #24 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + it ne + movne r6, r5, lsr #24 + tst r8, #0x100 + it ne + movne r8, r7, lsr #24 + orr r9, r9, r6, lsl #16 + ldr r4, [r1, #4] /* moved form [B] */ + orr r9, r9, r8, lsl #24 + /* store dest */ + ldrsh r5, [r0, #8] /* moved form [C] */ + str r9, [r1] + + /* load dest */ + /* [B] */ + /* block[4] and block[5] */ + /* [C] */ + ldrsh r7, [r0, #10] + and r6, r4, #0xFF + and r8, r4, #0xFF00 + add r6, r6, r5 + add r8, r7, r8, lsr #8 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + it ne + movne r6, r5, lsr #24 + tst r8, #0x100 + it ne + movne r8, r7, lsr #24 + mov r9, r6 + ldrsh r5, [r0, #12] /* moved from [D] */ + orr r9, r9, r8, lsl #8 + /* block[6] and block[7] */ + /* [D] */ + ldrsh r7, [r0, #14] + and r6, r4, #0xFF0000 + and r8, r4, #0xFF000000 + add r6, r5, r6, lsr #16 + add r8, r7, r8, lsr #24 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + it ne + movne r6, r5, lsr #24 + tst r8, #0x100 + it ne + movne r8, r7, lsr #24 + orr r9, r9, r6, lsl #16 + add r0, r0, #16 /* moved from [E] */ + orr r9, r9, r8, lsl #24 + subs r10, r10, #1 /* moved from [F] */ + /* store dest */ + str r9, [r1, #4] + + /* [E] */ + /* [F] */ + add r1, r1, r2 + bne 1b + + pop {r4-r10} + bx lr +endfunc diff --git a/media/ffvpx/libavcodec/arm/idctdsp_arm.h b/media/ffvpx/libavcodec/arm/idctdsp_arm.h new file mode 100644 index 0000000000..d7bc5cd02a --- /dev/null +++ b/media/ffvpx/libavcodec/arm/idctdsp_arm.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_IDCTDSP_ARM_H +#define AVCODEC_ARM_IDCTDSP_ARM_H + +#include "libavcodec/avcodec.h" +#include "libavcodec/idctdsp.h" + +void ff_idctdsp_init_armv5te(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth); +void ff_idctdsp_init_armv6(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth); +void ff_idctdsp_init_neon(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth); + +#endif /* AVCODEC_ARM_IDCTDSP_ARM_H */ diff --git a/media/ffvpx/libavcodec/arm/idctdsp_armv6.S b/media/ffvpx/libavcodec/arm/idctdsp_armv6.S new file mode 100644 index 0000000000..a6e77d6da1 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/idctdsp_armv6.S @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_add_pixels_clamped_armv6, export=1 + push {r4-r8,lr} + mov r3, #8 +1: + ldm r0!, {r4,r5,r12,lr} + ldrd r6, r7, [r1] + pkhbt r8, r4, r5, lsl #16 + pkhtb r5, r5, r4, asr #16 + pkhbt r4, r12, lr, lsl #16 + pkhtb lr, lr, r12, asr #16 + pld [r1, r2] + uxtab16 r8, r8, r6 + uxtab16 r5, r5, r6, ror #8 + uxtab16 r4, r4, r7 + uxtab16 lr, lr, r7, ror #8 + usat16 r8, #8, r8 + usat16 r5, #8, r5 + usat16 r4, #8, r4 + usat16 lr, #8, lr + orr r6, r8, r5, lsl #8 + orr r7, r4, lr, lsl #8 + subs r3, r3, #1 + strd_post r6, r7, r1, r2 + bgt 1b + pop {r4-r8,pc} +endfunc diff --git a/media/ffvpx/libavcodec/arm/idctdsp_init_arm.c b/media/ffvpx/libavcodec/arm/idctdsp_init_arm.c new file mode 100644 index 0000000000..ebc90e4b49 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/idctdsp_init_arm.c @@ -0,0 +1,94 @@ +/* + * ARM-optimized IDCT functions + * Copyright (c) 2001 Lionel Ulmer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stddef.h> +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/idctdsp.h" +#include "idct.h" +#include "idctdsp_arm.h" + +void ff_add_pixels_clamped_arm(const int16_t *block, uint8_t *dest, + ptrdiff_t line_size); + +/* XXX: those functions should be suppressed ASAP when all IDCTs are + * converted */ +static void j_rev_dct_arm_put(uint8_t *dest, ptrdiff_t line_size, + int16_t *block) +{ + ff_j_rev_dct_arm(block); + ff_put_pixels_clamped_c(block, dest, line_size); +} + +static void j_rev_dct_arm_add(uint8_t *dest, ptrdiff_t line_size, + int16_t *block) +{ + ff_j_rev_dct_arm(block); + ff_add_pixels_clamped_arm(block, dest, line_size); +} + +static void simple_idct_arm_put(uint8_t *dest, ptrdiff_t line_size, + int16_t *block) +{ + ff_simple_idct_arm(block); + ff_put_pixels_clamped_c(block, dest, line_size); +} + +static void simple_idct_arm_add(uint8_t *dest, ptrdiff_t line_size, + int16_t *block) +{ + ff_simple_idct_arm(block); + ff_add_pixels_clamped_arm(block, dest, line_size); +} + +av_cold void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + int cpu_flags = av_get_cpu_flags(); + + if (!avctx->lowres && !high_bit_depth) { + if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) || + avctx->idct_algo == FF_IDCT_ARM) { + c->idct_put = j_rev_dct_arm_put; + c->idct_add = j_rev_dct_arm_add; + c->idct = ff_j_rev_dct_arm; + c->perm_type = FF_IDCT_PERM_LIBMPEG2; + } else if (avctx->idct_algo == FF_IDCT_SIMPLEARM) { + c->idct_put = simple_idct_arm_put; + c->idct_add = simple_idct_arm_add; + c->idct = ff_simple_idct_arm; + c->perm_type = FF_IDCT_PERM_NONE; + } + } + + c->add_pixels_clamped = ff_add_pixels_clamped_arm; + + if (have_armv5te(cpu_flags)) + ff_idctdsp_init_armv5te(c, avctx, high_bit_depth); + if (have_armv6(cpu_flags)) + ff_idctdsp_init_armv6(c, avctx, high_bit_depth); + if (have_neon(cpu_flags)) + ff_idctdsp_init_neon(c, avctx, high_bit_depth); +} diff --git a/media/ffvpx/libavcodec/arm/idctdsp_init_armv5te.c b/media/ffvpx/libavcodec/arm/idctdsp_init_armv5te.c new file mode 100644 index 0000000000..3d881e1f18 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/idctdsp_init_armv5te.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/idctdsp.h" +#include "idct.h" +#include "idctdsp_arm.h" + +av_cold void ff_idctdsp_init_armv5te(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + if (!avctx->lowres && !high_bit_depth && + (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEAUTO || + avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) { + c->idct_put = ff_simple_idct_put_armv5te; + c->idct_add = ff_simple_idct_add_armv5te; + c->idct = ff_simple_idct_armv5te; + c->perm_type = FF_IDCT_PERM_NONE; + } +} diff --git a/media/ffvpx/libavcodec/arm/idctdsp_init_armv6.c b/media/ffvpx/libavcodec/arm/idctdsp_init_armv6.c new file mode 100644 index 0000000000..edf3070e15 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/idctdsp_init_armv6.c @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/idctdsp.h" +#include "idct.h" +#include "idctdsp_arm.h" + +void ff_add_pixels_clamped_armv6(const int16_t *block, uint8_t *pixels, + ptrdiff_t line_size); + +av_cold void ff_idctdsp_init_armv6(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + if (!avctx->lowres && !high_bit_depth) { + if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) || + avctx->idct_algo == FF_IDCT_SIMPLEARMV6) { + c->idct_put = ff_simple_idct_put_armv6; + c->idct_add = ff_simple_idct_add_armv6; + c->idct = ff_simple_idct_armv6; + c->perm_type = FF_IDCT_PERM_LIBMPEG2; + } + } + c->add_pixels_clamped = ff_add_pixels_clamped_armv6; +} diff --git a/media/ffvpx/libavcodec/arm/idctdsp_init_neon.c b/media/ffvpx/libavcodec/arm/idctdsp_init_neon.c new file mode 100644 index 0000000000..b70c5b0d44 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/idctdsp_init_neon.c @@ -0,0 +1,51 @@ +/* + * ARM-NEON-optimized IDCT functions + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/idctdsp.h" +#include "idct.h" +#include "idctdsp_arm.h" + +void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); +void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); +void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); + +av_cold void ff_idctdsp_init_neon(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + if (!avctx->lowres && !high_bit_depth) { + if (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEAUTO || + avctx->idct_algo == FF_IDCT_SIMPLENEON) { + c->idct_put = ff_simple_idct_put_neon; + c->idct_add = ff_simple_idct_add_neon; + c->idct = ff_simple_idct_neon; + c->perm_type = FF_IDCT_PERM_PARTTRANS; + } + } + + c->add_pixels_clamped = ff_add_pixels_clamped_neon; + c->put_pixels_clamped = ff_put_pixels_clamped_neon; + c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; +} diff --git a/media/ffvpx/libavcodec/arm/idctdsp_neon.S b/media/ffvpx/libavcodec/arm/idctdsp_neon.S new file mode 100644 index 0000000000..1911a33468 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/idctdsp_neon.S @@ -0,0 +1,128 @@ +/* + * ARM-NEON-optimized IDCT functions + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_put_pixels_clamped_neon, export=1 + vld1.16 {d16-d19}, [r0,:128]! + vqmovun.s16 d0, q8 + vld1.16 {d20-d23}, [r0,:128]! + vqmovun.s16 d1, q9 + vld1.16 {d24-d27}, [r0,:128]! + vqmovun.s16 d2, q10 + vld1.16 {d28-d31}, [r0,:128]! + vqmovun.s16 d3, q11 + vst1.8 {d0}, [r1,:64], r2 + vqmovun.s16 d4, q12 + vst1.8 {d1}, [r1,:64], r2 + vqmovun.s16 d5, q13 + vst1.8 {d2}, [r1,:64], r2 + vqmovun.s16 d6, q14 + vst1.8 {d3}, [r1,:64], r2 + vqmovun.s16 d7, q15 + vst1.8 {d4}, [r1,:64], r2 + vst1.8 {d5}, [r1,:64], r2 + vst1.8 {d6}, [r1,:64], r2 + vst1.8 {d7}, [r1,:64], r2 + bx lr +endfunc + +function ff_put_signed_pixels_clamped_neon, export=1 + vmov.u8 d31, #128 + vld1.16 {d16-d17}, [r0,:128]! + vqmovn.s16 d0, q8 + vld1.16 {d18-d19}, [r0,:128]! + vqmovn.s16 d1, q9 + vld1.16 {d16-d17}, [r0,:128]! + vqmovn.s16 d2, q8 + vld1.16 {d18-d19}, [r0,:128]! + vadd.u8 d0, d0, d31 + vld1.16 {d20-d21}, [r0,:128]! + vadd.u8 d1, d1, d31 + vld1.16 {d22-d23}, [r0,:128]! 
+ vadd.u8 d2, d2, d31 + vst1.8 {d0}, [r1,:64], r2 + vqmovn.s16 d3, q9 + vst1.8 {d1}, [r1,:64], r2 + vqmovn.s16 d4, q10 + vst1.8 {d2}, [r1,:64], r2 + vqmovn.s16 d5, q11 + vld1.16 {d24-d25}, [r0,:128]! + vadd.u8 d3, d3, d31 + vld1.16 {d26-d27}, [r0,:128]! + vadd.u8 d4, d4, d31 + vadd.u8 d5, d5, d31 + vst1.8 {d3}, [r1,:64], r2 + vqmovn.s16 d6, q12 + vst1.8 {d4}, [r1,:64], r2 + vqmovn.s16 d7, q13 + vst1.8 {d5}, [r1,:64], r2 + vadd.u8 d6, d6, d31 + vadd.u8 d7, d7, d31 + vst1.8 {d6}, [r1,:64], r2 + vst1.8 {d7}, [r1,:64], r2 + bx lr +endfunc + +function ff_add_pixels_clamped_neon, export=1 + mov r3, r1 + vld1.8 {d16}, [r1,:64], r2 + vld1.16 {d0-d1}, [r0,:128]! + vaddw.u8 q0, q0, d16 + vld1.8 {d17}, [r1,:64], r2 + vld1.16 {d2-d3}, [r0,:128]! + vqmovun.s16 d0, q0 + vld1.8 {d18}, [r1,:64], r2 + vaddw.u8 q1, q1, d17 + vld1.16 {d4-d5}, [r0,:128]! + vaddw.u8 q2, q2, d18 + vst1.8 {d0}, [r3,:64], r2 + vqmovun.s16 d2, q1 + vld1.8 {d19}, [r1,:64], r2 + vld1.16 {d6-d7}, [r0,:128]! + vaddw.u8 q3, q3, d19 + vqmovun.s16 d4, q2 + vst1.8 {d2}, [r3,:64], r2 + vld1.8 {d16}, [r1,:64], r2 + vqmovun.s16 d6, q3 + vld1.16 {d0-d1}, [r0,:128]! + vaddw.u8 q0, q0, d16 + vst1.8 {d4}, [r3,:64], r2 + vld1.8 {d17}, [r1,:64], r2 + vld1.16 {d2-d3}, [r0,:128]! + vaddw.u8 q1, q1, d17 + vst1.8 {d6}, [r3,:64], r2 + vqmovun.s16 d0, q0 + vld1.8 {d18}, [r1,:64], r2 + vld1.16 {d4-d5}, [r0,:128]! + vaddw.u8 q2, q2, d18 + vst1.8 {d0}, [r3,:64], r2 + vqmovun.s16 d2, q1 + vld1.8 {d19}, [r1,:64], r2 + vqmovun.s16 d4, q2 + vld1.16 {d6-d7}, [r0,:128]! + vaddw.u8 q3, q3, d19 + vst1.8 {d2}, [r3,:64], r2 + vqmovun.s16 d6, q3 + vst1.8 {d4}, [r3,:64], r2 + vst1.8 {d6}, [r3,:64], r2 + bx lr +endfunc diff --git a/media/ffvpx/libavcodec/arm/jrevdct_arm.S b/media/ffvpx/libavcodec/arm/jrevdct_arm.S new file mode 100644 index 0000000000..f951e2af34 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/jrevdct_arm.S @@ -0,0 +1,383 @@ +/* + C-like prototype : + void j_rev_dct_arm(DCTBLOCK data) + + With DCTBLOCK being a pointer to an array of 64 'signed shorts' + + Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org) + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +*/ + +#include "libavutil/arm/asm.S" + +#define FIX_0_298631336 2446 +#define FIX_0_541196100 4433 +#define FIX_0_765366865 6270 +#define FIX_1_175875602 9633 +#define FIX_1_501321110 12299 +#define FIX_2_053119869 16819 +#define FIX_3_072711026 25172 +#define FIX_M_0_390180644 -3196 +#define FIX_M_0_899976223 -7373 +#define FIX_M_1_847759065 -15137 +#define FIX_M_1_961570560 -16069 +#define FIX_M_2_562915447 -20995 +#define FIX_0xFFFF 0xFFFF + +#define FIX_0_298631336_ID 0 +#define FIX_0_541196100_ID 4 +#define FIX_0_765366865_ID 8 +#define FIX_1_175875602_ID 12 +#define FIX_1_501321110_ID 16 +#define FIX_2_053119869_ID 20 +#define FIX_3_072711026_ID 24 +#define FIX_M_0_390180644_ID 28 +#define FIX_M_0_899976223_ID 32 +#define FIX_M_1_847759065_ID 36 +#define FIX_M_1_961570560_ID 40 +#define FIX_M_2_562915447_ID 44 +#define FIX_0xFFFF_ID 48 + +function ff_j_rev_dct_arm, export=1 + push {r0, r4 - r11, lr} + + mov lr, r0 @ lr = pointer to the current row + mov r12, #8 @ r12 = row-counter + movrel r11, const_array @ r11 = base pointer to the constants array +row_loop: + ldrsh r0, [lr, # 0] @ r0 = 'd0' + ldrsh r2, [lr, # 2] @ r2 = 'd2' + + @ Optimization for row that have all items except the first set to 0 + @ (this works as the int16_t are always 4-byte aligned) + ldr r5, [lr, # 0] + ldr r6, [lr, # 4] + ldr r3, [lr, # 8] + ldr r4, [lr, #12] + orr r3, r3, r4 + orr r3, r3, r6 + orrs r5, r3, r5 + beq end_of_row_loop @ nothing to be done as ALL of them are '0' + orrs r3, r3, r2 + beq empty_row + + ldrsh r1, [lr, # 8] @ r1 = 'd1' + ldrsh r4, [lr, # 4] @ r4 = 'd4' + ldrsh r6, [lr, # 6] @ r6 = 'd6' + + ldr r3, [r11, #FIX_0_541196100_ID] + add r7, r2, r6 + ldr r5, [r11, #FIX_M_1_847759065_ID] + mul r7, r3, r7 @ r7 = z1 + ldr r3, [r11, #FIX_0_765366865_ID] + mla r6, r5, r6, r7 @ r6 = tmp2 + add r5, r0, r4 @ r5 = tmp0 + mla r2, r3, r2, r7 @ r2 = tmp3 + sub r3, r0, r4 @ r3 = tmp1 + + add r0, r2, r5, lsl #13 @ r0 = tmp10 + rsb r2, r2, r5, lsl #13 @ r2 = tmp13 + add r4, r6, r3, lsl #13 @ r4 = tmp11 + rsb r3, r6, r3, lsl #13 @ r3 = tmp12 + + push {r0, r2, r3, r4} @ save on the stack tmp10, tmp13, tmp12, tmp11 + + ldrsh r3, [lr, #10] @ r3 = 'd3' + ldrsh r5, [lr, #12] @ r5 = 'd5' + ldrsh r7, [lr, #14] @ r7 = 'd7' + + add r0, r3, r5 @ r0 = 'z2' + add r2, r1, r7 @ r2 = 'z1' + add r4, r3, r7 @ r4 = 'z3' + add r6, r1, r5 @ r6 = 'z4' + ldr r9, [r11, #FIX_1_175875602_ID] + add r8, r4, r6 @ r8 = z3 + z4 + ldr r10, [r11, #FIX_M_0_899976223_ID] + mul r8, r9, r8 @ r8 = 'z5' + ldr r9, [r11, #FIX_M_2_562915447_ID] + mul r2, r10, r2 @ r2 = 'z1' + ldr r10, [r11, #FIX_M_1_961570560_ID] + mul r0, r9, r0 @ r0 = 'z2' + ldr r9, [r11, #FIX_M_0_390180644_ID] + mla r4, r10, r4, r8 @ r4 = 'z3' + ldr r10, [r11, #FIX_0_298631336_ID] + mla r6, r9, r6, r8 @ r6 = 'z4' + ldr r9, [r11, #FIX_2_053119869_ID] + mla r7, r10, r7, r2 @ r7 = tmp0 + z1 + ldr r10, [r11, #FIX_3_072711026_ID] + mla r5, r9, r5, r0 @ r5 = tmp1 + z2 + ldr r9, [r11, #FIX_1_501321110_ID] + mla r3, r10, r3, r0 @ r3 = tmp2 + z2 + add r7, r7, r4 @ r7 = tmp0 + mla r1, r9, r1, r2 @ r1 = tmp3 + z1 + add r5, r5, r6 @ r5 = tmp1 + add r3, r3, r4 @ r3 = tmp2 + add r1, r1, r6 @ r1 = tmp3 + + pop {r0, r2, r4, r6} @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11 + @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0 + + @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) + add r8, r0, r1 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 0] + + @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) + sub r8, r0, r1 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + 
strh r8, [lr, #14] + + @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) + add r8, r6, r3 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 2] + + @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) + sub r8, r6, r3 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, #12] + + @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) + add r8, r4, r5 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 4] + + @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) + sub r8, r4, r5 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, #10] + + @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) + add r8, r2, r7 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 6] + + @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) + sub r8, r2, r7 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 8] + + @ End of row loop + add lr, lr, #16 + subs r12, r12, #1 + bne row_loop + beq start_column_loop + +empty_row: + ldr r1, [r11, #FIX_0xFFFF_ID] + mov r0, r0, lsl #2 + and r0, r0, r1 + add r0, r0, r0, lsl #16 + str r0, [lr, # 0] + str r0, [lr, # 4] + str r0, [lr, # 8] + str r0, [lr, #12] + +end_of_row_loop: + @ End of loop + add lr, lr, #16 + subs r12, r12, #1 + bne row_loop + +start_column_loop: + @ Start of column loop + pop {lr} + mov r12, #8 +column_loop: + ldrsh r0, [lr, #( 0*8)] @ r0 = 'd0' + ldrsh r2, [lr, #( 4*8)] @ r2 = 'd2' + ldrsh r4, [lr, #( 8*8)] @ r4 = 'd4' + ldrsh r6, [lr, #(12*8)] @ r6 = 'd6' + + ldr r3, [r11, #FIX_0_541196100_ID] + add r1, r2, r6 + ldr r5, [r11, #FIX_M_1_847759065_ID] + mul r1, r3, r1 @ r1 = z1 + ldr r3, [r11, #FIX_0_765366865_ID] + mla r6, r5, r6, r1 @ r6 = tmp2 + add r5, r0, r4 @ r5 = tmp0 + mla r2, r3, r2, r1 @ r2 = tmp3 + sub r3, r0, r4 @ r3 = tmp1 + + add r0, r2, r5, lsl #13 @ r0 = tmp10 + rsb r2, r2, r5, lsl #13 @ r2 = tmp13 + add r4, r6, r3, lsl #13 @ r4 = tmp11 + rsb r6, r6, r3, lsl #13 @ r6 = tmp12 + + ldrsh r1, [lr, #( 2*8)] @ r1 = 'd1' + ldrsh r3, [lr, #( 6*8)] @ r3 = 'd3' + ldrsh r5, [lr, #(10*8)] @ r5 = 'd5' + ldrsh r7, [lr, #(14*8)] @ r7 = 'd7' + + @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats) + orr r9, r1, r3 + orr r10, r5, r7 + orrs r10, r9, r10 + beq empty_odd_column + + push {r0, r2, r4, r6} @ save on the stack tmp10, tmp13, tmp12, tmp11 + + add r0, r3, r5 @ r0 = 'z2' + add r2, r1, r7 @ r2 = 'z1' + add r4, r3, r7 @ r4 = 'z3' + add r6, r1, r5 @ r6 = 'z4' + ldr r9, [r11, #FIX_1_175875602_ID] + add r8, r4, r6 + ldr r10, [r11, #FIX_M_0_899976223_ID] + mul r8, r9, r8 @ r8 = 'z5' + ldr r9, [r11, #FIX_M_2_562915447_ID] + mul r2, r10, r2 @ r2 = 'z1' + ldr r10, [r11, #FIX_M_1_961570560_ID] + mul r0, r9, r0 @ r0 = 'z2' + ldr r9, [r11, #FIX_M_0_390180644_ID] + mla r4, r10, r4, r8 @ r4 = 'z3' + ldr r10, [r11, #FIX_0_298631336_ID] + mla r6, r9, r6, r8 @ r6 = 'z4' + ldr r9, [r11, #FIX_2_053119869_ID] + mla r7, r10, r7, r2 @ r7 = tmp0 + z1 + ldr r10, [r11, #FIX_3_072711026_ID] + mla r5, r9, r5, r0 @ r5 = tmp1 + z2 + ldr r9, [r11, #FIX_1_501321110_ID] + mla r3, r10, r3, r0 @ r3 = tmp2 + z2 + add r7, r7, r4 @ r7 = tmp0 + mla r1, r9, r1, r2 @ r1 = tmp3 + z1 + add r5, r5, r6 @ r5 = tmp1 + add r3, r3, r4 @ r3 = tmp2 + add r1, r1, r6 @ r1 = tmp3 + + pop {r0, r2, r4, r6} @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12 + @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0 + + @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) + add r8, r0, r1 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 0*8)] + + @ Compute DESCALE(tmp10 - tmp3, 
CONST_BITS+PASS1_BITS+3) + sub r8, r0, r1 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #(14*8)] + + @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) + add r8, r4, r3 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 2*8)] + + @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) + sub r8, r4, r3 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #(12*8)] + + @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) + add r8, r6, r5 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 4*8)] + + @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) + sub r8, r6, r5 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #(10*8)] + + @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) + add r8, r2, r7 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 6*8)] + + @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) + sub r8, r2, r7 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 8*8)] + + @ End of row loop + add lr, lr, #2 + subs r12, r12, #1 + bne column_loop + beq the_end + +empty_odd_column: + @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) + add r0, r0, #(1<<17) + mov r0, r0, asr #18 + strh r0, [lr, #( 0*8)] + strh r0, [lr, #(14*8)] + + @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) + add r4, r4, #(1<<17) + mov r4, r4, asr #18 + strh r4, [lr, #( 2*8)] + strh r4, [lr, #(12*8)] + + @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) + add r6, r6, #(1<<17) + mov r6, r6, asr #18 + strh r6, [lr, #( 4*8)] + strh r6, [lr, #(10*8)] + + @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) + add r2, r2, #(1<<17) + mov r2, r2, asr #18 + strh r2, [lr, #( 6*8)] + strh r2, [lr, #( 8*8)] + + @ End of row loop + add lr, lr, #2 + subs r12, r12, #1 + bne column_loop + +the_end: + @ The end.... + pop {r4 - r11, pc} +endfunc + +const const_array + .word FIX_0_298631336 + .word FIX_0_541196100 + .word FIX_0_765366865 + .word FIX_1_175875602 + .word FIX_1_501321110 + .word FIX_2_053119869 + .word FIX_3_072711026 + .word FIX_M_0_390180644 + .word FIX_M_0_899976223 + .word FIX_M_1_847759065 + .word FIX_M_1_961570560 + .word FIX_M_2_562915447 + .word FIX_0xFFFF +endconst diff --git a/media/ffvpx/libavcodec/arm/mathops.h b/media/ffvpx/libavcodec/arm/mathops.h new file mode 100644 index 0000000000..dc57c5571c --- /dev/null +++ b/media/ffvpx/libavcodec/arm/mathops.h @@ -0,0 +1,108 @@ +/* + * simple math operations + * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_MATHOPS_H +#define AVCODEC_ARM_MATHOPS_H + +#include <stdint.h> +#include "config.h" +#include "libavutil/common.h" + +#if HAVE_INLINE_ASM + +#if HAVE_ARMV6_INLINE +#define MULH MULH +static inline av_const int MULH(int a, int b) +{ + int r; + __asm__ ("smmul %0, %1, %2" : "=r"(r) : "r"(a), "r"(b)); + return r; +} + +#define FASTDIV FASTDIV +static av_always_inline av_const int FASTDIV(int a, int b) +{ + int r; + __asm__ ("cmp %2, #2 \n\t" + "ldr %0, [%3, %2, lsl #2] \n\t" + "ite le \n\t" + "lsrle %0, %1, #1 \n\t" + "smmulgt %0, %0, %1 \n\t" + : "=&r"(r) : "r"(a), "r"(b), "r"(ff_inverse) : "cc"); + return r; +} + +#else /* HAVE_ARMV6_INLINE */ + +#define FASTDIV FASTDIV +static av_always_inline av_const int FASTDIV(int a, int b) +{ + int r, t; + __asm__ ("umull %1, %0, %2, %3" + : "=&r"(r), "=&r"(t) : "r"(a), "r"(ff_inverse[b])); + return r; +} +#endif + +#define MLS64(d, a, b) MAC64(d, -(a), b) + +#if HAVE_ARMV5TE_INLINE + +/* signed 16x16 -> 32 multiply add accumulate */ +# define MAC16(rt, ra, rb) \ + __asm__ ("smlabb %0, %1, %2, %0" : "+r"(rt) : "r"(ra), "r"(rb)); + +/* signed 16x16 -> 32 multiply */ +# define MUL16 MUL16 +static inline av_const int MUL16(int ra, int rb) +{ + int rt; + __asm__ ("smulbb %0, %1, %2" : "=r"(rt) : "r"(ra), "r"(rb)); + return rt; +} + +#endif + +#define mid_pred mid_pred +static inline av_const int mid_pred(int a, int b, int c) +{ + int m; + __asm__ ( + "mov %0, %2 \n\t" + "cmp %1, %2 \n\t" + "itt gt \n\t" + "movgt %0, %1 \n\t" + "movgt %1, %2 \n\t" + "cmp %1, %3 \n\t" + "it le \n\t" + "movle %1, %3 \n\t" + "cmp %0, %1 \n\t" + "it gt \n\t" + "movgt %0, %1 \n\t" + : "=&r"(m), "+r"(a) + : "r"(b), "r"(c) + : "cc"); + return m; +} + +#endif /* HAVE_INLINE_ASM */ + +#endif /* AVCODEC_ARM_MATHOPS_H */ diff --git a/media/ffvpx/libavcodec/arm/moz.build b/media/ffvpx/libavcodec/arm/moz.build new file mode 100644 index 0000000000..ab314d28ec --- /dev/null +++ b/media/ffvpx/libavcodec/arm/moz.build @@ -0,0 +1,28 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +SOURCES += [ + 'flacdsp_arm.S', + 'flacdsp_init_arm.c', + 'idctdsp_arm.S', + 'idctdsp_armv6.S', + 'idctdsp_init_arm.c', + 'idctdsp_init_armv5te.c', + 'idctdsp_init_armv6.c', + 'idctdsp_init_neon.c', + 'idctdsp_neon.S', + 'jrevdct_arm.S', + 'mpegaudiodsp_fixed_armv6.S', + 'mpegaudiodsp_init_arm.c', + 'simple_idct_arm.S', + 'simple_idct_armv5te.S', + 'simple_idct_armv6.S', + 'simple_idct_neon.S', +] + +FINAL_LIBRARY = 'mozavcodec' + +include('/media/ffvpx/ffvpxcommon.mozbuild') diff --git a/media/ffvpx/libavcodec/arm/mpegaudiodsp_fixed_armv6.S b/media/ffvpx/libavcodec/arm/mpegaudiodsp_fixed_armv6.S new file mode 100644 index 0000000000..977abb6939 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/mpegaudiodsp_fixed_armv6.S @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +.macro skip args:vararg +.endm + +.macro sum8 lo, hi, w, p, t1, t2, t3, t4, rsb=skip, offs=0 + ldr \t1, [\w, #4*\offs] + ldr \t2, [\p, #4]! + \rsb \t1, \t1, #0 + .irpc i, 135 + ldr \t3, [\w, #4*64*\i+4*\offs] + ldr \t4, [\p, #4*64*\i] + smlal \lo, \hi, \t1, \t2 + \rsb \t3, \t3, #0 + ldr \t1, [\w, #4*64*(\i+1)+4*\offs] + ldr \t2, [\p, #4*64*(\i+1)] + smlal \lo, \hi, \t3, \t4 + \rsb \t1, \t1, #0 + .endr + ldr \t3, [\w, #4*64*7+4*\offs] + ldr \t4, [\p, #4*64*7] + smlal \lo, \hi, \t1, \t2 + \rsb \t3, \t3, #0 + smlal \lo, \hi, \t3, \t4 +.endm + +.macro round rd, lo, hi + lsr \rd, \lo, #24 + bic \lo, \lo, #0xff000000 + orr \rd, \rd, \hi, lsl #8 + mov \hi, #0 + ssat \rd, #16, \rd +.endm + +function ff_mpadsp_apply_window_fixed_armv6, export=1 + push {r2,r4-r11,lr} + + add r4, r0, #4*512 @ synth_buf + 512 + .rept 4 + ldm r0!, {r5-r12} + stm r4!, {r5-r12} + .endr + + ldr r4, [sp, #40] @ incr + sub r0, r0, #4*17 @ synth_buf + 16 + ldr r8, [r2] @ sum:low + add r2, r0, #4*32 @ synth_buf + 48 + rsb r5, r4, r4, lsl #5 @ 31 * incr + lsl r4, r4, #1 + asr r9, r8, #31 @ sum:high + add r5, r3, r5, lsl #1 @ samples2 + add r6, r1, #4*32 @ w2 + str r4, [sp, #40] + + sum8 r8, r9, r1, r0, r10, r11, r12, lr + sum8 r8, r9, r1, r2, r10, r11, r12, lr, rsb, 32 + round r10, r8, r9 + strh_post r10, r3, r4 + + mov lr, #15 +1: + ldr r12, [r0, #4]! + ldr r11, [r6, #-4]! + ldr r10, [r1, #4]! + .irpc i, 0246 + .if \i + ldr r11, [r6, #4*64*\i] + ldr r10, [r1, #4*64*\i] + .endif + rsb r11, r11, #0 + smlal r8, r9, r10, r12 + ldr r10, [r0, #4*64*(\i+1)] + .ifeq \i + smull r4, r7, r11, r12 + .else + smlal r4, r7, r11, r12 + .endif + ldr r11, [r6, #4*64*(\i+1)] + ldr r12, [r1, #4*64*(\i+1)] + rsb r11, r11, #0 + smlal r8, r9, r12, r10 + .iflt \i-6 + ldr r12, [r0, #4*64*(\i+2)] + .else + ldr r12, [r2, #-4]! 
+ .endif + smlal r4, r7, r11, r10 + .endr + .irpc i, 0246 + ldr r10, [r1, #4*64*\i+4*32] + rsb r12, r12, #0 + ldr r11, [r6, #4*64*\i+4*32] + smlal r8, r9, r10, r12 + ldr r10, [r2, #4*64*(\i+1)] + smlal r4, r7, r11, r12 + ldr r12, [r1, #4*64*(\i+1)+4*32] + rsb r10, r10, #0 + ldr r11, [r6, #4*64*(\i+1)+4*32] + smlal r8, r9, r12, r10 + .iflt \i-6 + ldr r12, [r2, #4*64*(\i+2)] + .else + ldr r12, [sp, #40] + .endif + smlal r4, r7, r11, r10 + .endr + round r10, r8, r9 + adds r8, r8, r4 + adc r9, r9, r7 + strh_post r10, r3, r12 + round r11, r8, r9 + subs lr, lr, #1 + strh_dpost r11, r5, r12 + bgt 1b + + sum8 r8, r9, r1, r0, r10, r11, r12, lr, rsb, 33 + pop {r4} + round r10, r8, r9 + str r8, [r4] + strh r10, [r3] + + pop {r4-r11,pc} +endfunc diff --git a/media/ffvpx/libavcodec/arm/mpegaudiodsp_init_arm.c b/media/ffvpx/libavcodec/arm/mpegaudiodsp_init_arm.c new file mode 100644 index 0000000000..d87bd27ad8 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/mpegaudiodsp_init_arm.c @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2011 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/mpegaudiodsp.h" +#include "config.h" + +void ff_mpadsp_apply_window_fixed_armv6(int32_t *synth_buf, int32_t *window, + int *dither, int16_t *out, ptrdiff_t incr); + +av_cold void ff_mpadsp_init_arm(MPADSPContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_armv6(cpu_flags)) { + s->apply_window_fixed = ff_mpadsp_apply_window_fixed_armv6; + } +} diff --git a/media/ffvpx/libavcodec/arm/simple_idct_arm.S b/media/ffvpx/libavcodec/arm/simple_idct_arm.S new file mode 100644 index 0000000000..42d79ab95e --- /dev/null +++ b/media/ffvpx/libavcodec/arm/simple_idct_arm.S @@ -0,0 +1,480 @@ +/* + * Copyright (C) 2002 Frederic 'dilb' Boulay + * + * Author: Frederic Boulay <dilb@handhelds.org> + * + * The function defined in this file is derived from the simple_idct function + * from the libavcodec library part of the FFmpeg project. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +/* useful constants for the algorithm */ +#define W1 22725 +#define W2 21407 +#define W3 19266 +#define W4 16383 +#define W5 12873 +#define W6 8867 +#define W7 4520 +#define MASK_MSHW 0xFFFF0000 + +#define ROW_SHIFT 11 +#define ROW_SHIFT2MSHW (16-11) +#define COL_SHIFT 20 +#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */ +#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */ + + +function ff_simple_idct_arm, export=1 + @@ void simple_idct_arm(int16_t *block) + @@ save stack for reg needed (take all of them), + @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block + @@ so it must not be overwritten, if it is not saved!! + @@ R12 is another scratch register, so it should not be saved too + @@ save all registers + stmfd sp!, {r4-r11, r14} @ R14 is also called LR + @@ at this point, R0=block, other registers are free. + add r14, r0, #112 @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R12=block. + @@ add 2 temporary variables in the stack: R0 and R14 + sub sp, sp, #8 @ allow 2 local variables + str r0, [sp, #0] @ save block in sp[0] + @@ stack status + @@ sp+4 free + @@ sp+0 R0 (block) + + + @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free + + +__row_loop: + @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimize ldr accesses (i.e. split 32 bits in two 16-bit words), at least it gives more usable registers :) + ldr r1, [r14, #0] @ R1=(int32)(R12)[0]=ROWr32[0] (relative row cast to a 32b pointer) + ldr r2, [r14, #4] @ R2=(int32)(R12)[1]=ROWr32[1] + ldr r3, [r14, #8] @ R3=ROWr32[2] + ldr r4, [r14, #12] @ R4=ROWr32[3] + @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop), + @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row) + @@ else follow the complete algorithm. 
+ @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1], + @@ R3=ROWr32[2], R4=ROWr32[3], R5-R11 free + orr r5, r4, r3 @ R5=R4 | R3 + orr r5, r5, r2 @ R5=R4 | R3 | R2 + orrs r6, r5, r1 @ Test R5 | R1 (the aim is to check if everything is null) + beq __end_row_loop + mov r7, r1, asr #16 @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later) + ldrsh r6, [r14, #0] @ R6=ROWr16[0] + orrs r5, r5, r7 @ R5=R4 | R3 | R2 | R7 + beq __almost_empty_row + +@@ __b_evaluation: + @@ at this point, R0=block (temp), R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3], + @@ R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free, + @@ R12=__const_ptr_, R14=&block[n] + @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3 + + @@ MUL16(b0, W1, row[1]); + @@ MUL16(b1, W3, row[1]); + @@ MUL16(b2, W5, row[1]); + @@ MUL16(b3, W7, row[1]); + @@ MAC16(b0, W3, row[3]); + @@ MAC16(b1, -W7, row[3]); + @@ MAC16(b2, -W1, row[3]); + @@ MAC16(b3, -W5, row[3]); + ldr r8, =W1 @ R8=W1 + mov r2, r2, asr #16 @ R2=ROWr16[3] + mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + ldr r9, =W3 @ R9=W3 + ldr r10, =W5 @ R10=W5 + mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + ldr r11, =W7 @ R11=W7 + mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + teq r2, #0 @ if null avoid muls + itttt ne + mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + rsbne r2, r2, #0 @ R2=-ROWr16[3] + mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + it ne + mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + + @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3], + @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, + @@ R12=__const_ptr_, R14=&block[n] + @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; + @@ if (temp != 0) {} + orrs r2, r3, r4 @ R2=ROWr32[2] | ROWr32[3] + beq __end_b_evaluation + + @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3], + @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, + @@ R12=__const_ptr_, R14=&block[n] + @@ MAC16(b0, W5, row[5]); + @@ MAC16(b2, W7, row[5]); + @@ MAC16(b3, W3, row[5]); + @@ MAC16(b1, -W1, row[5]); + @@ MAC16(b0, W7, row[7]); + @@ MAC16(b2, W3, row[7]); + @@ MAC16(b3, -W1, row[7]); + @@ MAC16(b1, -W5, row[7]); + mov r3, r3, asr #16 @ R3=ROWr16[5] + teq r3, #0 @ if null avoid muls + it ne + mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0 + mov r4, r4, asr #16 @ R4=ROWr16[7] + itttt ne + mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2 + mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3 + rsbne r3, r3, #0 @ R3=-ROWr16[5] + mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5]=b1 + @@ R3 is free now + teq r4, #0 @ if null avoid muls + itttt ne + mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0 + mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2 + rsbne r4, r4, #0 @ R4=-ROWr16[7] + mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3 + it ne + mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1 + @@ R4 is free now 
+__end_b_evaluation: + @@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free), + @@ R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), + @@ R12=__const_ptr_, R14=&block[n] + +@@ __a_evaluation: + @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1)); + @@ a1 = a0 + W6 * row[2]; + @@ a2 = a0 - W6 * row[2]; + @@ a3 = a0 - W2 * row[2]; + @@ a0 = a0 + W2 * row[2]; + ldr r9, =W4 @ R9=W4 + mul r6, r9, r6 @ R6=W4*ROWr16[0] + ldr r10, =W6 @ R10=W6 + ldrsh r4, [r14, #4] @ R4=ROWr16[2] (a3 not defined yet) + add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0) + + mul r11, r10, r4 @ R11=W6*ROWr16[2] + ldr r8, =W2 @ R8=W2 + sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2) + @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; + @@ if (temp != 0) {} + teq r2, #0 + beq __end_bef_a_evaluation + + add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) + mul r11, r8, r4 @ R11=W2*ROWr16[2] + sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) + add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) + + + @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, + @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free), + @@ R12=__const_ptr_, R14=&block[n] + + + @@ a0 += W4*row[4] + @@ a1 -= W4*row[4] + @@ a2 -= W4*row[4] + @@ a3 += W4*row[4] + ldrsh r11, [r14, #8] @ R11=ROWr16[4] + teq r11, #0 @ if null avoid muls + it ne + mulne r11, r9, r11 @ R11=W4*ROWr16[4] + @@ R9 is free now + ldrsh r9, [r14, #12] @ R9=ROWr16[6] + itttt ne + addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) + subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) + subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) + addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) + @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead + teq r9, #0 @ if null avoid muls + itttt ne + mulne r11, r10, r9 @ R11=W6*ROWr16[6] + addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) + mulne r10, r8, r9 @ R10=W2*ROWr16[6] + @@ a0 += W6*row[6]; + @@ a3 -= W6*row[6]; + @@ a1 -= W2*row[6]; + @@ a2 += W2*row[6]; + subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) + itt ne + subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) + addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) + +__end_a_evaluation: + @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, + @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), + @@ R12=__const_ptr_, R14=&block[n] + @@ row[0] = (a0 + b0) >> ROW_SHIFT; + @@ row[1] = (a1 + b1) >> ROW_SHIFT; + @@ row[2] = (a2 + b2) >> ROW_SHIFT; + @@ row[3] = (a3 + b3) >> ROW_SHIFT; + @@ row[4] = (a3 - b3) >> ROW_SHIFT; + @@ row[5] = (a2 - b2) >> ROW_SHIFT; + @@ row[6] = (a1 - b1) >> ROW_SHIFT; + @@ row[7] = (a0 - b0) >> ROW_SHIFT; + add r8, r6, r0 @ R8=a0+b0 + add r9, r2, r1 @ R9=a1+b1 + @@ put two 16-bit half-words in a 32-bit word + @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only little-endian compliant then!!!) 
+ ldr r10, =MASK_MSHW @ R10=0xFFFF0000 + and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5) + mvn r11, r10 @ R11= NOT R10= 0x0000FFFF + and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11) + orr r8, r8, r9 + str r8, [r14, #0] + + add r8, r3, r5 @ R8=a2+b2 + add r9, r4, r7 @ R9=a3+b3 + and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5) + and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11) + orr r8, r8, r9 + str r8, [r14, #4] + + sub r8, r4, r7 @ R8=a3-b3 + sub r9, r3, r5 @ R9=a2-b2 + and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5) + and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11) + orr r8, r8, r9 + str r8, [r14, #8] + + sub r8, r2, r1 @ R8=a1-b1 + sub r9, r6, r0 @ R9=a0-b0 + and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5) + and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11) + orr r8, r8, r9 + str r8, [r14, #12] + + bal __end_row_loop + +__almost_empty_row: + @@ the row was empty, except ROWr16[0], now, management of this special case + @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1], + @@ R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1], + @@ R8=0xFFFF (temp), R9-R11 free + mov r8, #0x10000 @ R8=0xFFFF (2 steps needed!) it saves a ldr call (because of delay run). + sub r8, r8, #1 @ R8 is now ready. + and r5, r8, r6, lsl #3 @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF + orr r5, r5, r5, lsl #16 @ R5=R5 | (R5<<16) + str r5, [r14, #0] @ R14[0]=ROWr32[0]=R5 + str r5, [r14, #4] @ R14[4]=ROWr32[1]=R5 + str r5, [r14, #8] @ R14[8]=ROWr32[2]=R5 + str r5, [r14, #12] @ R14[12]=ROWr32[3]=R5 + +__end_row_loop: + @@ at this point, R0-R11 (free) + @@ R12=__const_ptr_, R14=&block[n] + ldr r0, [sp, #0] @ R0=block + teq r0, r14 @ compare current &block[8*n] to block, when block is reached, the loop is finished. + sub r14, r14, #16 + bne __row_loop + + + + @@ at this point, R0=block, R1-R11 (free) + @@ R12=__const_ptr_, R14=&block[n] + add r14, r0, #14 @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block. 
+__col_loop: + +@@ __b_evaluation2: + @@ at this point, R0=block (temp), R1-R11 (free) + @@ R12=__const_ptr_, R14=&block[n] + @@ proceed with b0-b3 first, followed by a0-a3 + @@ MUL16(b0, W1, col[8x1]); + @@ MUL16(b1, W3, col[8x1]); + @@ MUL16(b2, W5, col[8x1]); + @@ MUL16(b3, W7, col[8x1]); + @@ MAC16(b0, W3, col[8x3]); + @@ MAC16(b1, -W7, col[8x3]); + @@ MAC16(b2, -W1, col[8x3]); + @@ MAC16(b3, -W5, col[8x3]); + ldr r8, =W1 @ R8=W1 + ldrsh r7, [r14, #16] + mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + ldr r9, =W3 @ R9=W3 + ldr r10, =W5 @ R10=W5 + mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + ldr r11, =W7 @ R11=W7 + mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + ldrsh r2, [r14, #48] + mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) + teq r2, #0 @ if 0, then avoid muls + itttt ne + mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + rsbne r2, r2, #0 @ R2=-ROWr16[3] + mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + it ne + mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) + + @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), + @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, + @@ R12=__const_ptr_, R14=&block[n] + @@ MAC16(b0, W5, col[5x8]); + @@ MAC16(b2, W7, col[5x8]); + @@ MAC16(b3, W3, col[5x8]); + @@ MAC16(b1, -W1, col[5x8]); + @@ MAC16(b0, W7, col[7x8]); + @@ MAC16(b2, W3, col[7x8]); + @@ MAC16(b3, -W1, col[7x8]); + @@ MAC16(b1, -W5, col[7x8]); + ldrsh r3, [r14, #80] @ R3=COLr16[5x8] + teq r3, #0 @ if 0 then avoid muls + itttt ne + mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0 + mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2 + mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3 + rsbne r3, r3, #0 @ R3=-ROWr16[5x8] + ldrsh r4, [r14, #112] @ R4=COLr16[7x8] + it ne + mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5x8]=b1 + @@ R3 is free now + teq r4, #0 @ if 0 then avoid muls + itttt ne + mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0 + mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2 + rsbne r4, r4, #0 @ R4=-ROWr16[7x8] + mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3 + it ne + mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1 + @@ R4 is free now +@@ __end_b_evaluation2: + @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), + @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), + @@ R12=__const_ptr_, R14=&block[n] + +@@ __a_evaluation2: + @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1)); + @@ a1 = a0 + W6 * row[2]; + @@ a2 = a0 - W6 * row[2]; + @@ a3 = a0 - W2 * row[2]; + @@ a0 = a0 + W2 * row[2]; + ldrsh r6, [r14, #0] + ldr r9, =W4 @ R9=W4 + mul r6, r9, r6 @ R6=W4*ROWr16[0] + ldr r10, =W6 @ R10=W6 + ldrsh r4, [r14, #32] @ R4=ROWr16[2] (a3 not defined yet) + add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0) + mul r11, r10, r4 @ R11=W6*ROWr16[2] + ldr r8, =W2 @ R8=W2 + add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) + sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2) + mul r11, r8, r4 @ R11=W2*ROWr16[2] + sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) + add r6, r6, r11 
@ R6=a0+W2*ROWr16[2] (a0) + + @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, + @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free), + @@ R12=__const_ptr_, R14=&block[n] + @@ a0 += W4*row[4] + @@ a1 -= W4*row[4] + @@ a2 -= W4*row[4] + @@ a3 += W4*row[4] + ldrsh r11, [r14, #64] @ R11=ROWr16[4] + teq r11, #0 @ if null avoid muls + itttt ne + mulne r11, r9, r11 @ R11=W4*ROWr16[4] + @@ R9 is free now + addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) + subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) + subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) + ldrsh r9, [r14, #96] @ R9=ROWr16[6] + it ne + addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) + @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead + teq r9, #0 @ if null avoid muls + itttt ne + mulne r11, r10, r9 @ R11=W6*ROWr16[6] + addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) + mulne r10, r8, r9 @ R10=W2*ROWr16[6] + @@ a0 += W6*row[6]; + @@ a3 -= W6*row[6]; + @@ a1 -= W2*row[6]; + @@ a2 += W2*row[6]; + subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) + itt ne + subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) + addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) +@@ __end_a_evaluation2: + @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, + @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), + @@ R12=__const_ptr_, R14=&block[n] + @@ col[0 ] = ((a0 + b0) >> COL_SHIFT); + @@ col[8 ] = ((a1 + b1) >> COL_SHIFT); + @@ col[16] = ((a2 + b2) >> COL_SHIFT); + @@ col[24] = ((a3 + b3) >> COL_SHIFT); + @@ col[32] = ((a3 - b3) >> COL_SHIFT); + @@ col[40] = ((a2 - b2) >> COL_SHIFT); + @@ col[48] = ((a1 - b1) >> COL_SHIFT); + @@ col[56] = ((a0 - b0) >> COL_SHIFT); + @@@@@ no optimization here @@@@@ + add r8, r6, r0 @ R8=a0+b0 + add r9, r2, r1 @ R9=a1+b1 + mov r8, r8, asr #COL_SHIFT + mov r9, r9, asr #COL_SHIFT + strh r8, [r14, #0] + strh r9, [r14, #16] + add r8, r3, r5 @ R8=a2+b2 + add r9, r4, r7 @ R9=a3+b3 + mov r8, r8, asr #COL_SHIFT + mov r9, r9, asr #COL_SHIFT + strh r8, [r14, #32] + strh r9, [r14, #48] + sub r8, r4, r7 @ R8=a3-b3 + sub r9, r3, r5 @ R9=a2-b2 + mov r8, r8, asr #COL_SHIFT + mov r9, r9, asr #COL_SHIFT + strh r8, [r14, #64] + strh r9, [r14, #80] + sub r8, r2, r1 @ R8=a1-b1 + sub r9, r6, r0 @ R9=a0-b0 + mov r8, r8, asr #COL_SHIFT + mov r9, r9, asr #COL_SHIFT + strh r8, [r14, #96] + strh r9, [r14, #112] + +@@ __end_col_loop: + @@ at this point, R0-R11 (free) + @@ R12=__const_ptr_, R14=&block[n] + ldr r0, [sp, #0] @ R0=block + teq r0, r14 @ compare current &block[n] to block, when block is reached, the loop is finished. + sub r14, r14, #2 + bne __col_loop + + + + +@@ __end_simple_idct_arm: + @@ restore registers to previous status! + add sp, sp, #8 @@ the local variables! + ldmfd sp!, {r4-r11, r15} @@ update PC with LR content. + + + +@@ kind of sub-function, here not to overload the common case. +__end_bef_a_evaluation: + add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) + mul r11, r8, r4 @ R11=W2*ROWr16[2] + sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) + add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) + bal __end_a_evaluation +endfunc diff --git a/media/ffvpx/libavcodec/arm/simple_idct_armv5te.S b/media/ffvpx/libavcodec/arm/simple_idct_armv5te.S new file mode 100644 index 0000000000..a8d03469ab --- /dev/null +++ b/media/ffvpx/libavcodec/arm/simple_idct_armv5te.S @@ -0,0 +1,613 @@ +/* + * Simple IDCT + * + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> + * Copyright (c) 2006 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define ROW_SHIFT 11 +#define COL_SHIFT 20 + +#define W13 (W1 | (W3 << 16)) +#define W26 (W2 | (W6 << 16)) +#define W57 (W5 | (W7 << 16)) + +function idct_row_armv5te + str lr, [sp, #-4]! + + ldrd v1, v2, [a1, #8] + ldrd a3, a4, [a1] /* a3 = row[1:0], a4 = row[3:2] */ + orrs v1, v1, v2 + itt eq + cmpeq v1, a4 + cmpeq v1, a3, lsr #16 + beq row_dc_only + + mov v1, #(1<<(ROW_SHIFT-1)) + mov ip, #16384 + sub ip, ip, #1 /* ip = W4 */ + smlabb v1, ip, a3, v1 /* v1 = W4*row[0]+(1<<(RS-1)) */ + ldr ip, =W26 /* ip = W2 | (W6 << 16) */ + smultb a2, ip, a4 + smulbb lr, ip, a4 + add v2, v1, a2 + sub v3, v1, a2 + sub v4, v1, lr + add v1, v1, lr + + ldr ip, =W13 /* ip = W1 | (W3 << 16) */ + ldr lr, =W57 /* lr = W5 | (W7 << 16) */ + smulbt v5, ip, a3 + smultt v6, lr, a4 + smlatt v5, ip, a4, v5 + smultt a2, ip, a3 + smulbt v7, lr, a3 + sub v6, v6, a2 + smulbt a2, ip, a4 + smultt fp, lr, a3 + sub v7, v7, a2 + smulbt a2, lr, a4 + ldrd a3, a4, [a1, #8] /* a3=row[5:4] a4=row[7:6] */ + sub fp, fp, a2 + + orrs a2, a3, a4 + beq 1f + + smlabt v5, lr, a3, v5 + smlabt v6, ip, a3, v6 + smlatt v5, lr, a4, v5 + smlabt v6, lr, a4, v6 + smlatt v7, lr, a3, v7 + smlatt fp, ip, a3, fp + smulbt a2, ip, a4 + smlatt v7, ip, a4, v7 + sub fp, fp, a2 + + ldr ip, =W26 /* ip = W2 | (W6 << 16) */ + mov a2, #16384 + sub a2, a2, #1 /* a2 = W4 */ + smulbb a2, a2, a3 /* a2 = W4*row[4] */ + smultb lr, ip, a4 /* lr = W6*row[6] */ + add v1, v1, a2 /* v1 += W4*row[4] */ + add v1, v1, lr /* v1 += W6*row[6] */ + add v4, v4, a2 /* v4 += W4*row[4] */ + sub v4, v4, lr /* v4 -= W6*row[6] */ + smulbb lr, ip, a4 /* lr = W2*row[6] */ + sub v2, v2, a2 /* v2 -= W4*row[4] */ + sub v2, v2, lr /* v2 -= W2*row[6] */ + sub v3, v3, a2 /* v3 -= W4*row[4] */ + add v3, v3, lr /* v3 += W2*row[6] */ + +1: add a2, v1, v5 + mov a3, a2, lsr #11 + bic a3, a3, #0x1f0000 + sub a2, v2, v6 + mov a2, a2, lsr #11 + add a3, a3, a2, lsl #16 + add a2, v3, v7 + mov a4, a2, lsr #11 + bic a4, a4, #0x1f0000 + add a2, v4, fp + mov a2, a2, lsr #11 + add a4, a4, a2, lsl #16 + strd a3, a4, [a1] + + sub a2, v4, fp + mov a3, a2, lsr #11 + bic a3, a3, #0x1f0000 + sub a2, v3, v7 + mov a2, a2, lsr #11 + add a3, a3, a2, lsl #16 + add a2, v2, v6 + mov a4, a2, lsr #11 + bic a4, a4, #0x1f0000 + sub a2, v1, v5 + mov a2, a2, lsr #11 + add a4, a4, a2, lsl #16 + strd a3, a4, [a1, #8] + + ldr pc, [sp], #4 + 
+row_dc_only: + orr a3, a3, a3, lsl #16 + bic a3, a3, #0xe000 + mov a3, a3, lsl #3 + mov a4, a3 + strd a3, a4, [a1] + strd a3, a4, [a1, #8] + + ldr pc, [sp], #4 +endfunc + + .macro idct_col + ldr a4, [a1] /* a4 = col[1:0] */ + mov ip, #16384 + sub ip, ip, #1 /* ip = W4 */ + mov v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */ + add v2, v1, a4, asr #16 + rsb v2, v2, v2, lsl #14 + mov a4, a4, lsl #16 + add v1, v1, a4, asr #16 + ldr a4, [a1, #(16*4)] + rsb v1, v1, v1, lsl #14 + + smulbb lr, ip, a4 + smulbt a3, ip, a4 + sub v3, v1, lr + sub v5, v1, lr + add v7, v1, lr + add v1, v1, lr + sub v4, v2, a3 + sub v6, v2, a3 + add fp, v2, a3 + ldr ip, =W26 + ldr a4, [a1, #(16*2)] + add v2, v2, a3 + + smulbb lr, ip, a4 + smultb a3, ip, a4 + add v1, v1, lr + sub v7, v7, lr + add v3, v3, a3 + sub v5, v5, a3 + smulbt lr, ip, a4 + smultt a3, ip, a4 + add v2, v2, lr + sub fp, fp, lr + add v4, v4, a3 + ldr a4, [a1, #(16*6)] + sub v6, v6, a3 + + smultb lr, ip, a4 + smulbb a3, ip, a4 + add v1, v1, lr + sub v7, v7, lr + sub v3, v3, a3 + add v5, v5, a3 + smultt lr, ip, a4 + smulbt a3, ip, a4 + add v2, v2, lr + sub fp, fp, lr + sub v4, v4, a3 + add v6, v6, a3 + + stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp} + + ldr ip, =W13 + ldr a4, [a1, #(16*1)] + ldr lr, =W57 + smulbb v1, ip, a4 + smultb v3, ip, a4 + smulbb v5, lr, a4 + smultb v7, lr, a4 + smulbt v2, ip, a4 + smultt v4, ip, a4 + smulbt v6, lr, a4 + smultt fp, lr, a4 + rsb v4, v4, #0 + ldr a4, [a1, #(16*3)] + rsb v3, v3, #0 + + smlatb v1, ip, a4, v1 + smlatb v3, lr, a4, v3 + smulbb a3, ip, a4 + smulbb a2, lr, a4 + sub v5, v5, a3 + sub v7, v7, a2 + smlatt v2, ip, a4, v2 + smlatt v4, lr, a4, v4 + smulbt a3, ip, a4 + smulbt a2, lr, a4 + sub v6, v6, a3 + ldr a4, [a1, #(16*5)] + sub fp, fp, a2 + + smlabb v1, lr, a4, v1 + smlabb v3, ip, a4, v3 + smlatb v5, lr, a4, v5 + smlatb v7, ip, a4, v7 + smlabt v2, lr, a4, v2 + smlabt v4, ip, a4, v4 + smlatt v6, lr, a4, v6 + ldr a3, [a1, #(16*7)] + smlatt fp, ip, a4, fp + + smlatb v1, lr, a3, v1 + smlabb v3, lr, a3, v3 + smlatb v5, ip, a3, v5 + smulbb a4, ip, a3 + smlatt v2, lr, a3, v2 + sub v7, v7, a4 + smlabt v4, lr, a3, v4 + smulbt a4, ip, a3 + smlatt v6, ip, a3, v6 + sub fp, fp, a4 + .endm + +function idct_col_armv5te + str lr, [sp, #-4]! 
+ + idct_col + + ldmfd sp!, {a3, a4} + adds a2, a3, v1 + mov a2, a2, lsr #20 + it mi + orrmi a2, a2, #0xf000 + add ip, a4, v2 + mov ip, ip, asr #20 + orr a2, a2, ip, lsl #16 + str a2, [a1] + subs a3, a3, v1 + mov a2, a3, lsr #20 + it mi + orrmi a2, a2, #0xf000 + sub a4, a4, v2 + mov a4, a4, asr #20 + orr a2, a2, a4, lsl #16 + ldmfd sp!, {a3, a4} + str a2, [a1, #(16*7)] + + subs a2, a3, v3 + mov a2, a2, lsr #20 + it mi + orrmi a2, a2, #0xf000 + sub ip, a4, v4 + mov ip, ip, asr #20 + orr a2, a2, ip, lsl #16 + str a2, [a1, #(16*1)] + adds a3, a3, v3 + mov a2, a3, lsr #20 + it mi + orrmi a2, a2, #0xf000 + add a4, a4, v4 + mov a4, a4, asr #20 + orr a2, a2, a4, lsl #16 + ldmfd sp!, {a3, a4} + str a2, [a1, #(16*6)] + + adds a2, a3, v5 + mov a2, a2, lsr #20 + it mi + orrmi a2, a2, #0xf000 + add ip, a4, v6 + mov ip, ip, asr #20 + orr a2, a2, ip, lsl #16 + str a2, [a1, #(16*2)] + subs a3, a3, v5 + mov a2, a3, lsr #20 + it mi + orrmi a2, a2, #0xf000 + sub a4, a4, v6 + mov a4, a4, asr #20 + orr a2, a2, a4, lsl #16 + ldmfd sp!, {a3, a4} + str a2, [a1, #(16*5)] + + adds a2, a3, v7 + mov a2, a2, lsr #20 + it mi + orrmi a2, a2, #0xf000 + add ip, a4, fp + mov ip, ip, asr #20 + orr a2, a2, ip, lsl #16 + str a2, [a1, #(16*3)] + subs a3, a3, v7 + mov a2, a3, lsr #20 + it mi + orrmi a2, a2, #0xf000 + sub a4, a4, fp + mov a4, a4, asr #20 + orr a2, a2, a4, lsl #16 + str a2, [a1, #(16*4)] + + ldr pc, [sp], #4 +endfunc + +.macro clip dst, src:vararg + movs \dst, \src + it mi + movmi \dst, #0 + cmp \dst, #255 + it gt + movgt \dst, #255 +.endm + +.macro aclip dst, src:vararg + adds \dst, \src + it mi + movmi \dst, #0 + cmp \dst, #255 + it gt + movgt \dst, #255 +.endm + +function idct_col_put_armv5te + str lr, [sp, #-4]! + + idct_col + + ldmfd sp!, {a3, a4} + ldr lr, [sp, #32] + add a2, a3, v1 + clip a2, a2, asr #20 + add ip, a4, v2 + clip ip, ip, asr #20 + orr a2, a2, ip, lsl #8 + sub a3, a3, v1 + clip a3, a3, asr #20 + sub a4, a4, v2 + clip a4, a4, asr #20 + ldr v1, [sp, #28] + strh a2, [v1] + add a2, v1, #2 + str a2, [sp, #28] + orr a2, a3, a4, lsl #8 + rsb v2, lr, lr, lsl #3 + ldmfd sp!, {a3, a4} + strh_pre a2, v2, v1 + + sub a2, a3, v3 + clip a2, a2, asr #20 + sub ip, a4, v4 + clip ip, ip, asr #20 + orr a2, a2, ip, lsl #8 + strh_pre a2, v1, lr + add a3, a3, v3 + clip a2, a3, asr #20 + add a4, a4, v4 + clip a4, a4, asr #20 + orr a2, a2, a4, lsl #8 + ldmfd sp!, {a3, a4} + strh_dpre a2, v2, lr + + add a2, a3, v5 + clip a2, a2, asr #20 + add ip, a4, v6 + clip ip, ip, asr #20 + orr a2, a2, ip, lsl #8 + strh_pre a2, v1, lr + sub a3, a3, v5 + clip a2, a3, asr #20 + sub a4, a4, v6 + clip a4, a4, asr #20 + orr a2, a2, a4, lsl #8 + ldmfd sp!, {a3, a4} + strh_dpre a2, v2, lr + + add a2, a3, v7 + clip a2, a2, asr #20 + add ip, a4, fp + clip ip, ip, asr #20 + orr a2, a2, ip, lsl #8 + strh a2, [v1, lr] + sub a3, a3, v7 + clip a2, a3, asr #20 + sub a4, a4, fp + clip a4, a4, asr #20 + orr a2, a2, a4, lsl #8 + strh_dpre a2, v2, lr + + ldr pc, [sp], #4 +endfunc + +function idct_col_add_armv5te + str lr, [sp, #-4]! 
+ + idct_col + + ldr lr, [sp, #36] + + ldmfd sp!, {a3, a4} + ldrh ip, [lr] + add a2, a3, v1 + sub a3, a3, v1 + and v1, ip, #255 + aclip a2, v1, a2, asr #20 + add v1, a4, v2 + mov v1, v1, asr #20 + aclip v1, v1, ip, lsr #8 + orr a2, a2, v1, lsl #8 + ldr v1, [sp, #32] + sub a4, a4, v2 + rsb v2, v1, v1, lsl #3 + ldrh_pre ip, v2, lr + strh a2, [lr] + and a2, ip, #255 + aclip a3, a2, a3, asr #20 + mov a4, a4, asr #20 + aclip a4, a4, ip, lsr #8 + add a2, lr, #2 + str a2, [sp, #28] + orr a2, a3, a4, lsl #8 + strh a2, [v2] + + ldmfd sp!, {a3, a4} + ldrh_pre ip, lr, v1 + sub a2, a3, v3 + add a3, a3, v3 + and v3, ip, #255 + aclip a2, v3, a2, asr #20 + sub v3, a4, v4 + mov v3, v3, asr #20 + aclip v3, v3, ip, lsr #8 + orr a2, a2, v3, lsl #8 + add a4, a4, v4 + ldrh_dpre ip, v2, v1 + strh a2, [lr] + and a2, ip, #255 + aclip a3, a2, a3, asr #20 + mov a4, a4, asr #20 + aclip a4, a4, ip, lsr #8 + orr a2, a3, a4, lsl #8 + strh a2, [v2] + + ldmfd sp!, {a3, a4} + ldrh_pre ip, lr, v1 + add a2, a3, v5 + sub a3, a3, v5 + and v3, ip, #255 + aclip a2, v3, a2, asr #20 + add v3, a4, v6 + mov v3, v3, asr #20 + aclip v3, v3, ip, lsr #8 + orr a2, a2, v3, lsl #8 + sub a4, a4, v6 + ldrh_dpre ip, v2, v1 + strh a2, [lr] + and a2, ip, #255 + aclip a3, a2, a3, asr #20 + mov a4, a4, asr #20 + aclip a4, a4, ip, lsr #8 + orr a2, a3, a4, lsl #8 + strh a2, [v2] + + ldmfd sp!, {a3, a4} + ldrh_pre ip, lr, v1 + add a2, a3, v7 + sub a3, a3, v7 + and v3, ip, #255 + aclip a2, v3, a2, asr #20 + add v3, a4, fp + mov v3, v3, asr #20 + aclip v3, v3, ip, lsr #8 + orr a2, a2, v3, lsl #8 + sub a4, a4, fp + ldrh_dpre ip, v2, v1 + strh a2, [lr] + and a2, ip, #255 + aclip a3, a2, a3, asr #20 + mov a4, a4, asr #20 + aclip a4, a4, ip, lsr #8 + orr a2, a3, a4, lsl #8 + strh a2, [v2] + + ldr pc, [sp], #4 +endfunc + +function ff_simple_idct_armv5te, export=1 + stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr} + + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + + sub a1, a1, #(16*7) + + bl idct_col_armv5te + add a1, a1, #4 + bl idct_col_armv5te + add a1, a1, #4 + bl idct_col_armv5te + add a1, a1, #4 + bl idct_col_armv5te + + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} +endfunc + +function ff_simple_idct_add_armv5te, export=1 + stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} + + mov a1, a3 + + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + + sub a1, a1, #(16*7) + + bl idct_col_add_armv5te + add a1, a1, #4 + bl idct_col_add_armv5te + add a1, a1, #4 + bl idct_col_add_armv5te + add a1, a1, #4 + bl idct_col_add_armv5te + + add sp, sp, #8 + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} +endfunc + +function ff_simple_idct_put_armv5te, export=1 + stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} + + mov a1, a3 + + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + add a1, a1, #16 + bl idct_row_armv5te + + sub a1, a1, #(16*7) + + bl 
idct_col_put_armv5te + add a1, a1, #4 + bl idct_col_put_armv5te + add a1, a1, #4 + bl idct_col_put_armv5te + add a1, a1, #4 + bl idct_col_put_armv5te + + add sp, sp, #8 + ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} +endfunc diff --git a/media/ffvpx/libavcodec/arm/simple_idct_armv6.S b/media/ffvpx/libavcodec/arm/simple_idct_armv6.S new file mode 100644 index 0000000000..f95c20d295 --- /dev/null +++ b/media/ffvpx/libavcodec/arm/simple_idct_armv6.S @@ -0,0 +1,425 @@ +/* + * Simple IDCT + * + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> + * Copyright (c) 2007 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ +#define ROW_SHIFT 11 +#define COL_SHIFT 20 + +#define W13 (W1 | (W3 << 16)) +#define W26 (W2 | (W6 << 16)) +#define W42 (W4 | (W2 << 16)) +#define W42n (-W4&0xffff | (-W2 << 16)) +#define W46 (W4 | (W6 << 16)) +#define W57 (W5 | (W7 << 16)) + +/* + Compute partial IDCT of single row. 
+ shift = left-shift amount + r0 = source address + r2 = row[2,0] <= 2 cycles + r3 = row[3,1] + ip = w42 <= 2 cycles + + Output in registers r4--r11 +*/ + .macro idct_row shift + ldr lr, =W46 /* lr = W4 | (W6 << 16) */ + mov r1, #(1<<(\shift-1)) + smlad r4, r2, ip, r1 + smlsd r7, r2, ip, r1 + ldr ip, =W13 /* ip = W1 | (W3 << 16) */ + ldr r10,=W57 /* r10 = W5 | (W7 << 16) */ + smlad r5, r2, lr, r1 + smlsd r6, r2, lr, r1 + + smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */ + smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */ + ldr lr, [r0, #12] /* lr = row[7,5] */ + pkhtb r2, ip, r10,asr #16 /* r3 = W7 | (W3 << 16) */ + pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */ + smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */ + smlad r8, lr, r10,r8 /* B0 += W5*row[5] + W7*row[7] */ + smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */ + + ldr r3, =W42n /* r3 = -W4 | (-W2 << 16) */ + smlad r10,lr, r2, r10 /* B2 += W7*row[5] + W3*row[7] */ + ldr r2, [r0, #4] /* r2 = row[6,4] */ + smlsdx r11,lr, ip, r11 /* B3 += W3*row[5] - W1*row[7] */ + ldr ip, =W46 /* ip = W4 | (W6 << 16) */ + smlad r9, lr, r1, r9 /* B1 -= W1*row[5] + W5*row[7] */ + + smlad r5, r2, r3, r5 /* A1 += -W4*row[4] - W2*row[6] */ + smlsd r6, r2, r3, r6 /* A2 += -W4*row[4] + W2*row[6] */ + smlad r4, r2, ip, r4 /* A0 += W4*row[4] + W6*row[6] */ + smlsd r7, r2, ip, r7 /* A3 += W4*row[4] - W6*row[6] */ + .endm + +/* + Compute partial IDCT of half row. + shift = left-shift amount + r2 = row[2,0] + r3 = row[3,1] + ip = w42 + + Output in registers r4--r11 +*/ + .macro idct_row4 shift + ldr lr, =W46 /* lr = W4 | (W6 << 16) */ + ldr r10,=W57 /* r10 = W5 | (W7 << 16) */ + mov r1, #(1<<(\shift-1)) + smlad r4, r2, ip, r1 + smlsd r7, r2, ip, r1 + ldr ip, =W13 /* ip = W1 | (W3 << 16) */ + smlad r5, r2, lr, r1 + smlsd r6, r2, lr, r1 + smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */ + smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */ + pkhtb r2, ip, r10,asr #16 /* r3 = W7 | (W3 << 16) */ + pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */ + smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */ + smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */ + .endm + +/* + Compute final part of IDCT single row without shift. + Input in registers r4--r11 + Output in registers ip, r4--r6, lr, r8--r10 +*/ + .macro idct_finish + add ip, r4, r8 /* r1 = A0 + B0 */ + sub lr, r4, r8 /* r2 = A0 - B0 */ + sub r4, r5, r9 /* r2 = A1 + B1 */ + add r8, r5, r9 /* r2 = A1 - B1 */ + add r5, r6, r10 /* r1 = A2 + B2 */ + sub r9, r6, r10 /* r1 = A2 - B2 */ + add r6, r7, r11 /* r2 = A3 + B3 */ + sub r10,r7, r11 /* r2 = A3 - B3 */ + .endm + +/* + Compute final part of IDCT single row. + shift = right-shift amount + Input/output in registers r4--r11 +*/ + .macro idct_finish_shift shift + add r3, r4, r8 /* r3 = A0 + B0 */ + sub r2, r4, r8 /* r2 = A0 - B0 */ + mov r4, r3, asr #\shift + mov r8, r2, asr #\shift + + sub r3, r5, r9 /* r3 = A1 + B1 */ + add r2, r5, r9 /* r2 = A1 - B1 */ + mov r5, r3, asr #\shift + mov r9, r2, asr #\shift + + add r3, r6, r10 /* r3 = A2 + B2 */ + sub r2, r6, r10 /* r2 = A2 - B2 */ + mov r6, r3, asr #\shift + mov r10,r2, asr #\shift + + add r3, r7, r11 /* r3 = A3 + B3 */ + sub r2, r7, r11 /* r2 = A3 - B3 */ + mov r7, r3, asr #\shift + mov r11,r2, asr #\shift + .endm + +/* + Compute final part of IDCT single row, saturating results at 8 bits. 
+ shift = right-shift amount + Input/output in registers r4--r11 +*/ + .macro idct_finish_shift_sat shift + add r3, r4, r8 /* r3 = A0 + B0 */ + sub ip, r4, r8 /* ip = A0 - B0 */ + usat r4, #8, r3, asr #\shift + usat r8, #8, ip, asr #\shift + + sub r3, r5, r9 /* r3 = A1 + B1 */ + add ip, r5, r9 /* ip = A1 - B1 */ + usat r5, #8, r3, asr #\shift + usat r9, #8, ip, asr #\shift + + add r3, r6, r10 /* r3 = A2 + B2 */ + sub ip, r6, r10 /* ip = A2 - B2 */ + usat r6, #8, r3, asr #\shift + usat r10,#8, ip, asr #\shift + + add r3, r7, r11 /* r3 = A3 + B3 */ + sub ip, r7, r11 /* ip = A3 - B3 */ + usat r7, #8, r3, asr #\shift + usat r11,#8, ip, asr #\shift + .endm + +/* + Compute IDCT of single row, storing as column. + r0 = source + r1 = dest +*/ +function idct_row_armv6 + push {lr} + + ldr lr, [r0, #12] /* lr = row[7,5] */ + ldr ip, [r0, #4] /* ip = row[6,4] */ + ldr r3, [r0, #8] /* r3 = row[3,1] */ + ldr r2, [r0] /* r2 = row[2,0] */ + orrs lr, lr, ip + itt eq + cmpeq lr, r3 + cmpeq lr, r2, lsr #16 + beq 1f + push {r1} + ldr ip, =W42 /* ip = W4 | (W2 << 16) */ + cmp lr, #0 + beq 2f + + idct_row ROW_SHIFT + b 3f + +2: idct_row4 ROW_SHIFT + +3: pop {r1} + idct_finish_shift ROW_SHIFT + + strh r4, [r1] + strh r5, [r1, #(16*2)] + strh r6, [r1, #(16*4)] + strh r7, [r1, #(16*6)] + strh r11,[r1, #(16*1)] + strh r10,[r1, #(16*3)] + strh r9, [r1, #(16*5)] + strh r8, [r1, #(16*7)] + + pop {pc} + +1: mov r2, r2, lsl #3 + strh r2, [r1] + strh r2, [r1, #(16*2)] + strh r2, [r1, #(16*4)] + strh r2, [r1, #(16*6)] + strh r2, [r1, #(16*1)] + strh r2, [r1, #(16*3)] + strh r2, [r1, #(16*5)] + strh r2, [r1, #(16*7)] + pop {pc} +endfunc + +/* + Compute IDCT of single column, read as row. + r0 = source + r1 = dest +*/ +function idct_col_armv6 + push {r1, lr} + + ldr r2, [r0] /* r2 = row[2,0] */ + ldr ip, =W42 /* ip = W4 | (W2 << 16) */ + ldr r3, [r0, #8] /* r3 = row[3,1] */ + idct_row COL_SHIFT + pop {r1} + idct_finish_shift COL_SHIFT + + strh r4, [r1] + strh r5, [r1, #(16*1)] + strh r6, [r1, #(16*2)] + strh r7, [r1, #(16*3)] + strh r11,[r1, #(16*4)] + strh r10,[r1, #(16*5)] + strh r9, [r1, #(16*6)] + strh r8, [r1, #(16*7)] + + pop {pc} +endfunc + +/* + Compute IDCT of single column, read as row, store saturated 8-bit. + r0 = source + r1 = dest + r2 = line size +*/ +function idct_col_put_armv6 + push {r1, r2, lr} + + ldr r2, [r0] /* r2 = row[2,0] */ + ldr ip, =W42 /* ip = W4 | (W2 << 16) */ + ldr r3, [r0, #8] /* r3 = row[3,1] */ + idct_row COL_SHIFT + pop {r1, r2} + idct_finish_shift_sat COL_SHIFT + + strb_post r4, r1, r2 + strb_post r5, r1, r2 + strb_post r6, r1, r2 + strb_post r7, r1, r2 + strb_post r11,r1, r2 + strb_post r10,r1, r2 + strb_post r9, r1, r2 + strb_post r8, r1, r2 + + sub r1, r1, r2, lsl #3 + + pop {pc} +endfunc + +/* + Compute IDCT of single column, read as row, add/store saturated 8-bit. 
+ r0 = source + r1 = dest + r2 = line size +*/ +function idct_col_add_armv6 + push {r1, r2, lr} + + ldr r2, [r0] /* r2 = row[2,0] */ + ldr ip, =W42 /* ip = W4 | (W2 << 16) */ + ldr r3, [r0, #8] /* r3 = row[3,1] */ + idct_row COL_SHIFT + pop {r1, r2} + idct_finish + + ldrb r3, [r1] + ldrb r7, [r1, r2] + ldrb r11,[r1, r2, lsl #2] + add ip, r3, ip, asr #COL_SHIFT + usat ip, #8, ip + add r4, r7, r4, asr #COL_SHIFT + strb_post ip, r1, r2 + ldrb ip, [r1, r2] + usat r4, #8, r4 + ldrb r11,[r1, r2, lsl #2] + add r5, ip, r5, asr #COL_SHIFT + usat r5, #8, r5 + strb_post r4, r1, r2 + ldrb r3, [r1, r2] + ldrb ip, [r1, r2, lsl #2] + strb_post r5, r1, r2 + ldrb r7, [r1, r2] + ldrb r4, [r1, r2, lsl #2] + add r6, r3, r6, asr #COL_SHIFT + usat r6, #8, r6 + add r10,r7, r10,asr #COL_SHIFT + usat r10,#8, r10 + add r9, r11,r9, asr #COL_SHIFT + usat r9, #8, r9 + add r8, ip, r8, asr #COL_SHIFT + usat r8, #8, r8 + add lr, r4, lr, asr #COL_SHIFT + usat lr, #8, lr + strb_post r6, r1, r2 + strb_post r10,r1, r2 + strb_post r9, r1, r2 + strb_post r8, r1, r2 + strb_post lr, r1, r2 + + sub r1, r1, r2, lsl #3 + + pop {pc} +endfunc + +/* + Compute 8 IDCT row transforms. + func = IDCT row->col function + width = width of columns in bytes +*/ + .macro idct_rows func width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + sub r0, r0, #(16*5) + add r1, r1, #\width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + add r0, r0, #(16*2) + add r1, r1, #\width + bl \func + + sub r0, r0, #(16*7) + .endm + +/* void ff_simple_idct_armv6(int16_t *data); */ +function ff_simple_idct_armv6, export=1 + push {r4-r11, lr} + sub sp, sp, #128 + + mov r1, sp + idct_rows idct_row_armv6, 2 + mov r1, r0 + mov r0, sp + idct_rows idct_col_armv6, 2 + + add sp, sp, #128 + pop {r4-r11, pc} +endfunc + +/* ff_simple_idct_add_armv6(uint8_t *dest, ptrdiff_t line_size, int16_t *data); */ +function ff_simple_idct_add_armv6, export=1 + push {r0, r1, r4-r11, lr} + sub sp, sp, #128 + + mov r0, r2 + mov r1, sp + idct_rows idct_row_armv6, 2 + mov r0, sp + ldr r1, [sp, #128] + ldr r2, [sp, #(128+4)] + idct_rows idct_col_add_armv6, 1 + + add sp, sp, #(128+8) + pop {r4-r11, pc} +endfunc + +/* ff_simple_idct_put_armv6(uint8_t *dest, ptrdiff_t line_size, int16_t *data); */ +function ff_simple_idct_put_armv6, export=1 + push {r0, r1, r4-r11, lr} + sub sp, sp, #128 + + mov r0, r2 + mov r1, sp + idct_rows idct_row_armv6, 2 + mov r0, sp + ldr r1, [sp, #128] + ldr r2, [sp, #(128+4)] + idct_rows idct_col_put_armv6, 1 + + add sp, sp, #(128+8) + pop {r4-r11, pc} +endfunc diff --git a/media/ffvpx/libavcodec/arm/simple_idct_neon.S b/media/ffvpx/libavcodec/arm/simple_idct_neon.S new file mode 100644 index 0000000000..726d4cbefa --- /dev/null +++ b/media/ffvpx/libavcodec/arm/simple_idct_neon.S @@ -0,0 +1,375 @@ +/* + * ARM NEON IDCT + * + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * + * Based on Simple IDCT + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W4c ((1<<(COL_SHIFT-1))/W4) +#define ROW_SHIFT 11 +#define COL_SHIFT 20 + +#define w1 d0[0] +#define w2 d0[1] +#define w3 d0[2] +#define w4 d0[3] +#define w5 d1[0] +#define w6 d1[1] +#define w7 d1[2] +#define w4c d1[3] + + .macro idct_col4_top + vmull.s16 q7, d6, w2 /* q9 = W2 * col[2] */ + vmull.s16 q8, d6, w6 /* q10 = W6 * col[2] */ + vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */ + vadd.i32 q11, q15, q7 + vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */ + vadd.i32 q12, q15, q8 + vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */ + vsub.i32 q13, q15, q8 + vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */ + vsub.i32 q14, q15, q7 + + vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */ + vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */ + vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */ + vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */ + .endm + + .text + .align 6 + +function idct_row4_pld_neon + pld [r0] + add r3, r0, r1, lsl #2 + pld [r0, r1] + pld [r0, r1, lsl #1] +A pld [r3, -r1] + pld [r3] + pld [r3, r1] + add r3, r3, r1, lsl #1 + pld [r3] + pld [r3, r1] +endfunc + +function idct_row4_neon + vmov.i32 q15, #(1<<(ROW_SHIFT-1)) + vld1.64 {d2-d5}, [r2,:128]! + vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */ + vld1.64 {d6,d7}, [r2,:128]! + vorr d10, d3, d5 + vld1.64 {d8,d9}, [r2,:128]! + add r2, r2, #-64 + + vorr d11, d7, d9 + vorr d10, d10, d11 + vmov r3, r4, d10 + + idct_col4_top + + orrs r3, r3, r4 + beq 1f + + vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ + vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ + vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ + vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ + vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ + vadd.i32 q11, q11, q7 + vsub.i32 q12, q12, q7 + vsub.i32 q13, q13, q7 + vadd.i32 q14, q14, q7 + vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ + vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ + vmlal.s16 q9, d9, w7 + vmlsl.s16 q10, d9, w5 + vmlal.s16 q5, d9, w3 + vmlsl.s16 q6, d9, w1 + vadd.i32 q11, q11, q7 + vsub.i32 q12, q12, q8 + vadd.i32 q13, q13, q8 + vsub.i32 q14, q14, q7 + +1: vadd.i32 q3, q11, q9 + vadd.i32 q4, q12, q10 + vshrn.i32 d2, q3, #ROW_SHIFT + vshrn.i32 d4, q4, #ROW_SHIFT + vadd.i32 q7, q13, q5 + vadd.i32 q8, q14, q6 + vtrn.16 d2, d4 + vshrn.i32 d6, q7, #ROW_SHIFT + vshrn.i32 d8, q8, #ROW_SHIFT + vsub.i32 q14, q14, q6 + vsub.i32 q11, q11, q9 + vtrn.16 d6, d8 + vsub.i32 q13, q13, q5 + vshrn.i32 d3, q14, #ROW_SHIFT + vtrn.32 d2, d6 + vsub.i32 q12, q12, q10 + vtrn.32 d4, d8 + vshrn.i32 d5, q13, #ROW_SHIFT + vshrn.i32 d7, q12, #ROW_SHIFT + vshrn.i32 d9, q11, #ROW_SHIFT + + vtrn.16 d3, d5 + vtrn.16 d7, d9 + vtrn.32 d3, d7 + vtrn.32 d5, d9 + + vst1.64 {d2-d5}, [r2,:128]! 
+ vst1.64 {d6-d9}, [r2,:128]! + + bx lr +endfunc + +function idct_col4_neon + mov ip, #16 + vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */ + vdup.16 d30, w4c + vld1.64 {d4}, [r2,:64], ip /* d3 = col[1] */ + vadd.i16 d30, d30, d2 + vld1.64 {d6}, [r2,:64], ip /* d4 = col[2] */ + vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1<<COL_SHIFT-1)/W4)*/ + vld1.64 {d8}, [r2,:64], ip /* d5 = col[3] */ + + ldrd r4, r5, [r2] + ldrd r6, r7, [r2, #16] + orrs r4, r4, r5 + + idct_col4_top + it eq + addeq r2, r2, #16 + beq 1f + + vld1.64 {d3}, [r2,:64], ip /* d6 = col[4] */ + vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ + vadd.i32 q11, q11, q7 + vsub.i32 q12, q12, q7 + vsub.i32 q13, q13, q7 + vadd.i32 q14, q14, q7 + +1: orrs r6, r6, r7 + ldrd r4, r5, [r2, #16] + it eq + addeq r2, r2, #16 + beq 2f + + vld1.64 {d5}, [r2,:64], ip /* d7 = col[5] */ + vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ + vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ + vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ + vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ + +2: orrs r4, r4, r5 + ldrd r4, r5, [r2, #16] + it eq + addeq r2, r2, #16 + beq 3f + + vld1.64 {d7}, [r2,:64], ip /* d8 = col[6] */ + vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ + vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ + vadd.i32 q11, q11, q7 + vsub.i32 q14, q14, q7 + vsub.i32 q12, q12, q8 + vadd.i32 q13, q13, q8 + +3: orrs r4, r4, r5 + it eq + addeq r2, r2, #16 + beq 4f + + vld1.64 {d9}, [r2,:64], ip /* d9 = col[7] */ + vmlal.s16 q9, d9, w7 + vmlsl.s16 q10, d9, w5 + vmlal.s16 q5, d9, w3 + vmlsl.s16 q6, d9, w1 + +4: vaddhn.i32 d2, q11, q9 + vaddhn.i32 d3, q12, q10 + vaddhn.i32 d4, q13, q5 + vaddhn.i32 d5, q14, q6 + vsubhn.i32 d9, q11, q9 + vsubhn.i32 d8, q12, q10 + vsubhn.i32 d7, q13, q5 + vsubhn.i32 d6, q14, q6 + + bx lr +endfunc + + .align 6 + +function idct_col4_st8_neon + vqshrun.s16 d2, q1, #COL_SHIFT-16 + vqshrun.s16 d3, q2, #COL_SHIFT-16 + vqshrun.s16 d4, q3, #COL_SHIFT-16 + vqshrun.s16 d5, q4, #COL_SHIFT-16 + vst1.32 {d2[0]}, [r0,:32], r1 + vst1.32 {d2[1]}, [r0,:32], r1 + vst1.32 {d3[0]}, [r0,:32], r1 + vst1.32 {d3[1]}, [r0,:32], r1 + vst1.32 {d4[0]}, [r0,:32], r1 + vst1.32 {d4[1]}, [r0,:32], r1 + vst1.32 {d5[0]}, [r0,:32], r1 + vst1.32 {d5[1]}, [r0,:32], r1 + + bx lr +endfunc + +const idct_coeff_neon, align=4 + .short W1, W2, W3, W4, W5, W6, W7, W4c +endconst + + .macro idct_start data + push {r4-r7, lr} + pld [\data] + pld [\data, #64] + vpush {d8-d15} + movrel r3, idct_coeff_neon + vld1.64 {d0,d1}, [r3,:128] + .endm + + .macro idct_end + vpop {d8-d15} + pop {r4-r7, pc} + .endm + +/* void ff_simple_idct_put_neon(uint8_t *dst, ptrdiff_t line_size, int16_t *data); */ +function ff_simple_idct_put_neon, export=1 + idct_start r2 + + bl idct_row4_pld_neon + bl idct_row4_neon + add r2, r2, #-128 + bl idct_col4_neon + bl idct_col4_st8_neon + sub r0, r0, r1, lsl #3 + add r0, r0, #4 + add r2, r2, #-120 + bl idct_col4_neon + bl idct_col4_st8_neon + + idct_end +endfunc + + .align 6 + +function idct_col4_add8_neon + mov ip, r0 + + vld1.32 {d10[0]}, [r0,:32], r1 + vshr.s16 q1, q1, #COL_SHIFT-16 + vld1.32 {d10[1]}, [r0,:32], r1 + vshr.s16 q2, q2, #COL_SHIFT-16 + vld1.32 {d11[0]}, [r0,:32], r1 + vshr.s16 q3, q3, #COL_SHIFT-16 + vld1.32 {d11[1]}, [r0,:32], r1 + vshr.s16 q4, q4, #COL_SHIFT-16 + vld1.32 {d12[0]}, [r0,:32], r1 + vaddw.u8 q1, q1, d10 + vld1.32 {d12[1]}, [r0,:32], r1 + vaddw.u8 q2, q2, d11 + vld1.32 {d13[0]}, [r0,:32], r1 + vqmovun.s16 d2, q1 + vld1.32 {d13[1]}, [r0,:32], r1 + vaddw.u8 q3, q3, d12 + vst1.32 {d2[0]}, [ip,:32], r1 + vqmovun.s16 d3, q2 + vst1.32 {d2[1]}, 
[ip,:32], r1 + vaddw.u8 q4, q4, d13 + vst1.32 {d3[0]}, [ip,:32], r1 + vqmovun.s16 d4, q3 + vst1.32 {d3[1]}, [ip,:32], r1 + vqmovun.s16 d5, q4 + vst1.32 {d4[0]}, [ip,:32], r1 + vst1.32 {d4[1]}, [ip,:32], r1 + vst1.32 {d5[0]}, [ip,:32], r1 + vst1.32 {d5[1]}, [ip,:32], r1 + + bx lr +endfunc + +/* void ff_simple_idct_add_neon(uint8_t *dst, ptrdiff_t line_size, int16_t *data); */ +function ff_simple_idct_add_neon, export=1 + idct_start r2 + + bl idct_row4_pld_neon + bl idct_row4_neon + add r2, r2, #-128 + bl idct_col4_neon + bl idct_col4_add8_neon + sub r0, r0, r1, lsl #3 + add r0, r0, #4 + add r2, r2, #-120 + bl idct_col4_neon + bl idct_col4_add8_neon + + idct_end +endfunc + + .align 6 + +function idct_col4_st16_neon + mov ip, #16 + + vshr.s16 q1, q1, #COL_SHIFT-16 + vshr.s16 q2, q2, #COL_SHIFT-16 + vst1.64 {d2}, [r2,:64], ip + vshr.s16 q3, q3, #COL_SHIFT-16 + vst1.64 {d3}, [r2,:64], ip + vshr.s16 q4, q4, #COL_SHIFT-16 + vst1.64 {d4}, [r2,:64], ip + vst1.64 {d5}, [r2,:64], ip + vst1.64 {d6}, [r2,:64], ip + vst1.64 {d7}, [r2,:64], ip + vst1.64 {d8}, [r2,:64], ip + vst1.64 {d9}, [r2,:64], ip + + bx lr +endfunc + +/* void ff_simple_idct_neon(int16_t *data); */ +function ff_simple_idct_neon, export=1 + idct_start r0 + + mov r2, r0 + bl idct_row4_neon + bl idct_row4_neon + add r2, r2, #-128 + bl idct_col4_neon + add r2, r2, #-128 + bl idct_col4_st16_neon + add r2, r2, #-120 + bl idct_col4_neon + add r2, r2, #-128 + bl idct_col4_st16_neon + + idct_end +endfunc |
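
For reference, the arithmetic that the row routines above implement (documented in the inline comments of simple_idct_arm.S and shared by the ARMv5TE, ARMv6 and NEON variants) is, in C terms, roughly the sketch below. The constants W1..W7 and ROW_SHIFT are the #defines from the files themselves; the function name idct_row_ref is only illustrative and is not part of the diff. The column pass repeats the same butterfly with COL_SHIFT (20) in place of ROW_SHIFT before the results are narrowed or clipped to 8 bits.

/* Minimal C sketch (illustrative only) of one 8-point row pass of the
 * "simple IDCT" implemented by the assembly above. */
#include <stdint.h>

#define W1 22725
#define W2 21407
#define W3 19266
#define W4 16383
#define W5 12873
#define W6  8867
#define W7  4520
#define ROW_SHIFT 11

static void idct_row_ref(int16_t row[8])
{
    /* Even part: a0..a3 from row[0], row[2], row[4], row[6]. */
    int a0 = W4 * row[0] + (1 << (ROW_SHIFT - 1));
    int a1 = a0 + W6 * row[2];
    int a2 = a0 - W6 * row[2];
    int a3 = a0 - W2 * row[2];
    a0 += W2 * row[2];

    a0 += W4 * row[4] + W6 * row[6];
    a1 -= W4 * row[4] + W2 * row[6];
    a2 -= W4 * row[4] - W2 * row[6];
    a3 += W4 * row[4] - W6 * row[6];

    /* Odd part: b0..b3 from row[1], row[3], row[5], row[7]. */
    int b0 = W1 * row[1] + W3 * row[3] + W5 * row[5] + W7 * row[7];
    int b1 = W3 * row[1] - W7 * row[3] - W1 * row[5] - W5 * row[7];
    int b2 = W5 * row[1] - W1 * row[3] + W7 * row[5] + W3 * row[7];
    int b3 = W7 * row[1] - W5 * row[3] + W3 * row[5] - W1 * row[7];

    /* Butterfly and descale, matching the store order in the assembly. */
    row[0] = (a0 + b0) >> ROW_SHIFT;
    row[1] = (a1 + b1) >> ROW_SHIFT;
    row[2] = (a2 + b2) >> ROW_SHIFT;
    row[3] = (a3 + b3) >> ROW_SHIFT;
    row[4] = (a3 - b3) >> ROW_SHIFT;
    row[5] = (a2 - b2) >> ROW_SHIFT;
    row[6] = (a1 - b1) >> ROW_SHIFT;
    row[7] = (a0 - b0) >> ROW_SHIFT;
}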