From 086c044dc34dfc0f74fbe41f4ecb402b2cd34884 Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Fri, 19 Apr 2024 03:13:33 +0200
Subject: Merging upstream version 125.0.1.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 media/libopus/celt/arm/arm_celt_map.c    |  31 ++-
 media/libopus/celt/arm/armcpu.c          |  51 ++++-
 media/libopus/celt/arm/armcpu.h          |  13 ++
 media/libopus/celt/arm/celt_neon_intr.c  |  83 ++++++-
 media/libopus/celt/arm/pitch_neon_intr.c |   7 +
 media/libopus/celt/celt.h                |  25 +--
 media/libopus/celt/celt_decoder.c        | 360 ++++++++++++++++++++++++-------
 media/libopus/celt/celt_encoder.c        |  68 ++++--
 media/libopus/celt/celt_lpc.c            |  27 ++-
 media/libopus/celt/celt_lpc.h            |   2 +-
 media/libopus/celt/cpu_support.h         |   7 +-
 media/libopus/celt/entdec.c              |  21 ++
 media/libopus/celt/entdec.h              |  10 +
 media/libopus/celt/entenc.c              |  11 +
 media/libopus/celt/entenc.h              |   9 +
 media/libopus/celt/laplace.c             | 101 +++++++++
 media/libopus/celt/laplace.h             |   9 +
 media/libopus/celt/mathops.h             |   6 +
 media/libopus/celt/mips/celt_mipsr1.h    |   6 +-
 media/libopus/celt/mips/mdct_mipsr1.h    |   6 +-
 media/libopus/celt/mips/vq_mipsr1.h      |   6 +-
 media/libopus/celt/os_support.h          |  14 +-
 media/libopus/celt/pitch.c               |  11 +-
 media/libopus/celt/pitch.h               |  11 +
 media/libopus/celt/stack_alloc.h         |   2 +-
 media/libopus/celt/x86/celt_lpc_sse4_1.c |  13 +-
 media/libopus/celt/x86/pitch_avx.c       | 101 +++++++++
 media/libopus/celt/x86/pitch_sse.h       |  40 +++-
 media/libopus/celt/x86/vq_sse.h          |   6 +-
 media/libopus/celt/x86/vq_sse2.c         |   8 +-
 media/libopus/celt/x86/x86_arch_macros.h |  47 ++++
 media/libopus/celt/x86/x86_celt_map.c    |  20 ++
 media/libopus/celt/x86/x86cpu.c          |  16 +-
 media/libopus/celt/x86/x86cpu.h          |  49 +++--
 34 files changed, 1014 insertions(+), 183 deletions(-)
 create mode 100644 media/libopus/celt/x86/pitch_avx.c
 create mode 100644 media/libopus/celt/x86/x86_arch_macros.h

(limited to 'media/libopus/celt')

diff --git a/media/libopus/celt/arm/arm_celt_map.c b/media/libopus/celt/arm/arm_celt_map.c
index ca988b66f5..cbaea49579 100644
--- a/media/libopus/celt/arm/arm_celt_map.c
+++ b/media/libopus/celt/arm/arm_celt_map.c
@@ -40,7 +40,8 @@ opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, c
   celt_inner_prod_c,   /* ARMv4 */
   celt_inner_prod_c,   /* EDSP */
   celt_inner_prod_c,   /* Media */
-  celt_inner_prod_neon /* NEON */
+  celt_inner_prod_neon,/* NEON */
+  celt_inner_prod_neon /* DOTPROD */
 };
 
 void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
@@ -48,7 +49,8 @@ void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const o
   dual_inner_prod_c,   /* ARMv4 */
   dual_inner_prod_c,   /* EDSP */
   dual_inner_prod_c,   /* Media */
-  dual_inner_prod_neon /* NEON */
+  dual_inner_prod_neon,/* NEON */
+  dual_inner_prod_neon /* DOTPROD */
 };
 # endif
 
@@ -61,7 +63,8 @@ opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
   celt_pitch_xcorr_c,               /* ARMv4 */
   MAY_HAVE_EDSP(celt_pitch_xcorr),  /* EDSP */
   MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */
-  MAY_HAVE_NEON(celt_pitch_xcorr)   /* NEON */
+  MAY_HAVE_NEON(celt_pitch_xcorr),  /* NEON */
+  MAY_HAVE_NEON(celt_pitch_xcorr)   /* DOTPROD */
 };
 
 #  endif
@@ -72,7 +75,8 @@ void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
   celt_pitch_xcorr_c,              /* ARMv4 */
   celt_pitch_xcorr_c,              /* EDSP */
   celt_pitch_xcorr_c,              /* Media */
-  celt_pitch_xcorr_float_neon      /* Neon */
+  celt_pitch_xcorr_float_neon,     /* Neon */
+  celt_pitch_xcorr_float_neon      /* DOTPROD */
 };
 #  endif
 # endif /* FIXED_POINT */
@@ -90,6 +94,7 @@ void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
   xcorr_kernel_c,                /* EDSP */
   xcorr_kernel_c,                /* Media */
   xcorr_kernel_neon_fixed,       /* Neon */
+  xcorr_kernel_neon_fixed        /* DOTPROD */
 };
 
 #endif
@@ -101,14 +106,16 @@ int (*const OPUS_FFT_ALLOC_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
    opus_fft_alloc_arch_c,        /* ARMv4 */
    opus_fft_alloc_arch_c,        /* EDSP */
    opus_fft_alloc_arch_c,        /* Media */
-   opus_fft_alloc_arm_neon       /* Neon with NE10 library support */
+   opus_fft_alloc_arm_neon,      /* Neon with NE10 library support */
+   opus_fft_alloc_arm_neon       /* DOTPROD with NE10 library support */
 };
 
 void (*const OPUS_FFT_FREE_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
    opus_fft_free_arch_c,         /* ARMv4 */
    opus_fft_free_arch_c,         /* EDSP */
    opus_fft_free_arch_c,         /* Media */
-   opus_fft_free_arm_neon        /* Neon with NE10 */
+   opus_fft_free_arm_neon,       /* Neon with NE10 */
+   opus_fft_free_arm_neon        /* DOTPROD with NE10 */
 };
 #   endif /* CUSTOM_MODES */
 
@@ -118,7 +125,8 @@ void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
    opus_fft_c,                   /* ARMv4 */
    opus_fft_c,                   /* EDSP */
    opus_fft_c,                   /* Media */
-   opus_fft_neon                 /* Neon with NE10 */
+   opus_fft_neon,                /* Neon with NE10 */
+   opus_fft_neon                 /* DOTPROD with NE10 */
 };
 
 void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
@@ -127,7 +135,8 @@ void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
    opus_ifft_c,                   /* ARMv4 */
    opus_ifft_c,                   /* EDSP */
    opus_ifft_c,                   /* Media */
-   opus_ifft_neon                 /* Neon with NE10 */
+   opus_ifft_neon,                /* Neon with NE10 */
+   opus_ifft_neon                 /* DOTPROD with NE10 */
 };
 
 void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
@@ -139,7 +148,8 @@ void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
    clt_mdct_forward_c,           /* ARMv4 */
    clt_mdct_forward_c,           /* EDSP */
    clt_mdct_forward_c,           /* Media */
-   clt_mdct_forward_neon         /* Neon with NE10 */
+   clt_mdct_forward_neon,        /* Neon with NE10 */
+   clt_mdct_forward_neon         /* DOTPROD with NE10 */
 };
 
 void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
@@ -151,7 +161,8 @@ void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
    clt_mdct_backward_c,           /* ARMv4 */
    clt_mdct_backward_c,           /* EDSP */
    clt_mdct_backward_c,           /* Media */
-   clt_mdct_backward_neon         /* Neon with NE10 */
+   clt_mdct_backward_neon,        /* Neon with NE10 */
+   clt_mdct_backward_neon         /* DOTPROD with NE10 */
 };
 
 #  endif /* HAVE_ARM_NE10 */
diff --git a/media/libopus/celt/arm/armcpu.c b/media/libopus/celt/arm/armcpu.c
index c7d16e6d61..06a53435b8 100644
--- a/media/libopus/celt/arm/armcpu.c
+++ b/media/libopus/celt/arm/armcpu.c
@@ -43,6 +43,7 @@
 #define OPUS_CPU_ARM_EDSP_FLAG  (1<<OPUS_ARCH_ARM_EDSP)
 #define OPUS_CPU_ARM_MEDIA_FLAG (1<<OPUS_ARCH_ARM_MEDIA)
 #define OPUS_CPU_ARM_NEON_FLAG  (1<<OPUS_ARCH_ARM_NEON)
+#define OPUS_CPU_ARM_DOTPROD_FLAG  (1<<OPUS_ARCH_ARM_DOTPROD)
 
 #if defined(_MSC_VER)
 /*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
@@ -126,6 +127,14 @@ opus_uint32 opus_cpu_capabilities(void)
         p = strstr(buf, " neon");
         if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
           flags |= OPUS_CPU_ARM_NEON_FLAG;
+        p = strstr(buf, " asimd");
+        if(p != NULL && (p[6] == ' ' || p[6] == '\n'))
+          flags |= OPUS_CPU_ARM_NEON_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_EDSP_FLAG;
+#  endif
+#  if defined(OPUS_ARM_MAY_HAVE_DOTPROD)
+        p = strstr(buf, " asimddp");
+        if(p != NULL && (p[8] == ' ' || p[8] == '\n'))
+          flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
 #  endif
       }
 # endif
@@ -144,10 +153,44 @@ opus_uint32 opus_cpu_capabilities(void)
 # endif
     }
 
+#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR)
+    flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG;
+# if defined(OPUS_ARM_PRESUME_DOTPROD)
+    flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+# endif
+#endif
+
     fclose(cpuinfo);
   }
   return flags;
 }
+
+#elif defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+
+opus_uint32 opus_cpu_capabilities(void) /* Apple variant: returns an OR of OPUS_CPU_ARM_*_FLAG bits. */
+{
+  opus_uint32 flags = 0;
+
+#if defined(OPUS_ARM_MAY_HAVE_DOTPROD) /* Run-time probe for the dot-product feature. */
+  size_t size = sizeof(uint32_t);
+  uint32_t value = 0;
+  if (!sysctlbyname("hw.optional.arm.FEAT_DotProd", &value, &size, NULL, 0) && value)
+  { /* Reached only when the sysctl query returned 0 and reported a nonzero value. */
+    flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+  }
+#endif /* OPUS_ARM_MAY_HAVE_DOTPROD */
+
+#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR) /* Build-time presumption: set without any probing. */
+  flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG;
+# if defined(OPUS_ARM_PRESUME_DOTPROD)
+  flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+# endif /* OPUS_ARM_PRESUME_DOTPROD */
+#endif /* OPUS_ARM_PRESUME_AARCH64_NEON_INTR */
+  return flags; /* Stays 0 when neither of the two blocks above is compiled in. */
+}
+
 #else
 /* The feature registers which can tell us what the processor supports are
  * accessible in priveleged modes only, so we can't have a general user-space
@@ -180,7 +223,13 @@ static int opus_select_arch_impl(void)
   }
   arch++;
 
-  celt_assert(arch == OPUS_ARCH_ARM_NEON);
+  if(!(flags & OPUS_CPU_ARM_DOTPROD_FLAG)) {
+    celt_assert(arch == OPUS_ARCH_ARM_NEON);
+    return arch;
+  }
+  arch++;
+
+  celt_assert(arch == OPUS_ARCH_ARM_DOTPROD);
   return arch;
 }
 
diff --git a/media/libopus/celt/arm/armcpu.h b/media/libopus/celt/arm/armcpu.h
index 820262ff5f..6d5803d81a 100644
--- a/media/libopus/celt/arm/armcpu.h
+++ b/media/libopus/celt/arm/armcpu.h
@@ -46,6 +46,12 @@
 #  define MAY_HAVE_NEON(name) MAY_HAVE_MEDIA(name)
 # endif
 
+# if defined(OPUS_ARM_MAY_HAVE_DOTPROD)
+#  define MAY_HAVE_DOTPROD(name) name ## _dotprod
+# else
+#  define MAY_HAVE_DOTPROD(name) MAY_HAVE_NEON(name)
+# endif
+
 # if defined(OPUS_ARM_PRESUME_EDSP)
 #  define PRESUME_EDSP(name) name ## _edsp
 # else
@@ -64,6 +70,12 @@
 #  define PRESUME_NEON(name) PRESUME_MEDIA(name)
 # endif
 
+# if defined(OPUS_ARM_PRESUME_DOTPROD)
+#  define PRESUME_DOTPROD(name) name ## _dotprod
+# else
+#  define PRESUME_DOTPROD(name) PRESUME_NEON(name)
+# endif
+
 # if defined(OPUS_HAVE_RTCD)
 int opus_select_arch(void);
 
@@ -71,6 +83,7 @@ int opus_select_arch(void);
 #define OPUS_ARCH_ARM_EDSP  (1)
 #define OPUS_ARCH_ARM_MEDIA (2)
 #define OPUS_ARCH_ARM_NEON  (3)
+#define OPUS_ARCH_ARM_DOTPROD  (4)
 
 # endif
 
diff --git a/media/libopus/celt/arm/celt_neon_intr.c b/media/libopus/celt/arm/celt_neon_intr.c
index effda769d0..250f836218 100644
--- a/media/libopus/celt/arm/celt_neon_intr.c
+++ b/media/libopus/celt/arm/celt_neon_intr.c
@@ -38,6 +38,8 @@
 #include "../pitch.h"
 
 #if defined(FIXED_POINT)
+#include <string.h>
+
 void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
 {
    int j;
@@ -47,7 +49,10 @@ void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_va
    int16x4_t y0 = vld1_s16(y);
    y += 4;
 
-   for (j = 0; j + 8 <= len; j += 8)
+   /* This loop loads one y value more than we actually need.
+      Therefore we have to stop as soon as there are 8 or fewer samples left
+       (instead of 7), to avoid reading past the end of the array. */
+   for (j = 0; j + 8 < len; j += 8)
    {
       /* Load x[0...7] */
       int16x8_t xx = vld1q_s16(x);
@@ -80,23 +85,79 @@ void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_va
       x += 8;
       y += 8;
    }
-
-   for (; j < len; j++)
-   {
-      int16x4_t x0 = vld1_dup_s16(x);  /* load next x */
+   if (j + 4 < len) {
+      /* Load x[0...3] */
+      int16x4_t x0 = vld1_s16(x);
+      /* Load y[4...7] */
+      int16x4_t y4 = vld1_s16(y);
+      int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0);
+      int16x4_t y1 = vext_s16(y0, y4, 1);
+      int32x4_t a1 = vmlal_lane_s16(a0, y1, x0, 1);
+      int16x4_t y2 = vext_s16(y0, y4, 2);
+      int32x4_t a2 = vmlal_lane_s16(a1, y2, x0, 2);
+      int16x4_t y3 = vext_s16(y0, y4, 3);
+      int32x4_t a3 = vmlal_lane_s16(a2, y3, x0, 3);
+      y0 = y4;
+      a = a3;
+      x += 4;
+      y += 4;
+      j += 4;
+   }
+   if (j + 2 < len) {
+      /* Load x[0...1] */
+      int16x4x2_t xx = vld2_dup_s16(x);
+      int16x4_t x0 = xx.val[0];
+      int16x4_t x1 = xx.val[1];
+      /* Load y[4...5].
+         We would like to use vld1_dup_s32(), but casting the pointer would
+          break strict aliasing rules and potentially have alignment issues.
+         Fortunately the compiler seems capable of translating this memcpy()
+          and vdup_n_s32() into the equivalent vld1_dup_s32().*/
+      int32_t yy;
+      memcpy(&yy, y, sizeof(yy));
+      int16x4_t y4 = vreinterpret_s16_s32(vdup_n_s32(yy));
       int32x4_t a0 = vmlal_s16(a, y0, x0);
-
-      int16x4_t y4 = vld1_dup_s16(y);  /* load next y */
-      y0 = vext_s16(y0, y4, 1);
+      int16x4_t y1 = vext_s16(y0, y4, 1);
+      /* Replace bottom copy of {y[5], y[4]} in y4 with {y[3], y[2]} from y0,
+          using VSRI instead of VEXT, since it's a data-processing
+          instruction. */
+      y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4),
+       vreinterpret_s64_s16(y0), 32));
+      int32x4_t a1 = vmlal_s16(a0, y1, x1);
+      a = a1;
+      x += 2;
+      y += 2;
+      j += 2;
+   }
+   if (j + 1 < len) {
+      /* Load next x. */
+      int16x4_t x0 = vld1_dup_s16(x);
+      int32x4_t a0 = vmlal_s16(a, y0, x0);
+      /* Load last y. */
+      int16x4_t y4 = vld1_dup_s16(y);
+      y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4),
+       vreinterpret_s64_s16(y0), 16));
       a = a0;
       x++;
-      y++;
    }
-
-   vst1q_s32(sum, a);
+   /* Load last x. */
+   int16x4_t x0 = vld1_dup_s16(x);
+   int32x4_t a0 = vmlal_s16(a, y0, x0);
+   vst1q_s32(sum, a0);
 }
 
 #else
+
+#if defined(__ARM_FEATURE_FMA) && defined(__ARM_ARCH_ISA_A64)
+/* If we can, force the compiler to use an FMA instruction rather than break
+ *    vmlaq_lane_f32() into fmul/fadd. */
+#ifdef vmlaq_lane_f32
+#undef vmlaq_lane_f32
+#endif
+#define vmlaq_lane_f32(a,b,c,lane) vfmaq_lane_f32(a,b,c,lane)
+#endif
+
+
 /*
  * Function: xcorr_kernel_neon_float
  * ---------------------------------
diff --git a/media/libopus/celt/arm/pitch_neon_intr.c b/media/libopus/celt/arm/pitch_neon_intr.c
index 35cc46e2c2..43885f528c 100644
--- a/media/libopus/celt/arm/pitch_neon_intr.c
+++ b/media/libopus/celt/arm/pitch_neon_intr.c
@@ -130,6 +130,13 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus
 
 /* ========================================================================== */
 
+#ifdef __ARM_FEATURE_FMA
+/* If we can, force the compiler to use an FMA instruction rather than break
+   vmlaq_f32() into fmul/fadd. */
+#define vmlaq_f32(a,b,c) vfmaq_f32(a,b,c)
+#endif
+
+
 #ifdef OPUS_CHECK_ASM
 
 /* This part of code simulates floating-point NEON operations. */
diff --git a/media/libopus/celt/celt.h b/media/libopus/celt/celt.h
index 24b6b2b520..2f501951d5 100644
--- a/media/libopus/celt/celt.h
+++ b/media/libopus/celt/celt.h
@@ -42,6 +42,10 @@
 #include "entdec.h"
 #include "arch.h"
 
+#ifdef ENABLE_DEEP_PLC
+#include "lpcnet.h"
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -149,6 +153,13 @@ int celt_decoder_get_size(int channels);
 
 int celt_decoder_init(CELTDecoder *st, opus_int32 sampling_rate, int channels);
 
+int celt_decode_with_ec_dred(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data,
+      int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum
+#ifdef ENABLE_DEEP_PLC
+      ,LPCNetPLCState *lpcnet
+#endif
+      );
+
 int celt_decode_with_ec(OpusCustomDecoder * OPUS_RESTRICT st, const unsigned char *data,
       int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum);
 
@@ -225,23 +236,13 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
       opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
       const opus_val16 *window, int overlap, int arch);
 
-#ifdef NON_STATIC_COMB_FILTER_CONST_C
-void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
-                         opus_val16 g10, opus_val16 g11, opus_val16 g12);
-#endif
-
-#ifndef OVERRIDE_COMB_FILTER_CONST
-# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
-    ((void)(arch),comb_filter_const_c(y, x, T, N, g10, g11, g12))
-#endif
-
 void init_caps(const CELTMode *m,int *cap,int LM,int C);
 
 #ifdef RESYNTH
-void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem);
+void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem, int accum);
 void celt_synthesis(const CELTMode *mode, celt_norm *X, celt_sig * out_syn[],
       opus_val16 *oldBandE, int start, int effEnd, int C, int CC, int isTransient,
-      int LM, int downsample, int silence);
+      int LM, int downsample, int silence, int arch);
 #endif
 
 #ifdef __cplusplus
diff --git a/media/libopus/celt/celt_decoder.c b/media/libopus/celt/celt_decoder.c
index 883dae15d2..743c2031bc 100644
--- a/media/libopus/celt/celt_decoder.c
+++ b/media/libopus/celt/celt_decoder.c
@@ -51,6 +51,11 @@
 #include "celt_lpc.h"
 #include "vq.h"
 
+#ifdef ENABLE_DEEP_PLC
+#include "lpcnet.h"
+#include "lpcnet_private.h"
+#endif
+
 /* The maximum pitch lag to allow in the pitch-based PLC. It's possible to save
    CPU time in the PLC pitch search by making this smaller than MAX_PERIOD. The
    current value corresponds to a pitch of 66.67 Hz. */
@@ -59,9 +64,6 @@
    pitch of 480 Hz. */
 #define PLC_PITCH_LAG_MIN (100)
 
-#if defined(SMALL_FOOTPRINT) && defined(FIXED_POINT)
-#define NORM_ALIASING_HACK
-#endif
 /**********************************************************************/
 /*                                                                    */
 /*                             DECODER                                */
@@ -69,6 +71,9 @@
 /**********************************************************************/
 #define DECODE_BUFFER_SIZE 2048
 
+#define PLC_UPDATE_FRAMES 4
+#define PLC_UPDATE_SAMPLES (PLC_UPDATE_FRAMES*FRAME_SIZE)
+
 /** Decoder state
  @brief Decoder state
  */
@@ -82,6 +87,7 @@ struct OpusCustomDecoder {
    int start, end;
    int signalling;
    int disable_inv;
+   int complexity;
    int arch;
 
    /* Everything beyond this point gets cleared on a reset */
@@ -98,11 +104,18 @@ struct OpusCustomDecoder {
    opus_val16 postfilter_gain_old;
    int postfilter_tapset;
    int postfilter_tapset_old;
+   int prefilter_and_fold;
 
    celt_sig preemph_memD[2];
 
+#ifdef ENABLE_DEEP_PLC
+   opus_int16 plc_pcm[PLC_UPDATE_SAMPLES];
+   int plc_fill;
+   float plc_preemphasis_mem;
+#endif
+
    celt_sig _decode_mem[1]; /* Size = channels*(DECODE_BUFFER_SIZE+mode->overlap) */
-   /* opus_val16 lpc[],  Size = channels*LPC_ORDER */
+   /* opus_val16 lpc[],  Size = channels*CELT_LPC_ORDER */
    /* opus_val16 oldEBands[], Size = 2*mode->nbEBands */
    /* opus_val16 oldLogE[], Size = 2*mode->nbEBands */
    /* opus_val16 oldLogE2[], Size = 2*mode->nbEBands */
@@ -157,7 +170,7 @@ OPUS_CUSTOM_NOSTATIC int opus_custom_decoder_get_size(const CELTMode *mode, int
 {
    int size = sizeof(struct CELTDecoder)
             + (channels*(DECODE_BUFFER_SIZE+mode->overlap)-1)*sizeof(celt_sig)
-            + channels*LPC_ORDER*sizeof(opus_val16)
+            + channels*CELT_LPC_ORDER*sizeof(opus_val16)
             + 4*2*mode->nbEBands*sizeof(opus_val16);
    return size;
 }
@@ -499,7 +512,100 @@ static int celt_plc_pitch_search(celt_sig *decode_mem[2], int C, int arch)
    return pitch_index;
 }
 
-static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
+static void prefilter_and_fold(CELTDecoder * OPUS_RESTRICT st, int N) /* Per channel: comb-filter then fold the overlap region that starts at offset DECODE_BUFFER_SIZE-N. */
+{
+   int c;
+   int CC;
+   int i;
+   int overlap;
+   celt_sig *decode_mem[2];
+   const OpusCustomMode *mode;
+   VARDECL(opus_val32, etmp); /* Scratch holding `overlap` comb-filtered samples. */
+   mode = st->mode;
+   overlap = st->overlap;
+   CC = st->channels;
+   ALLOC(etmp, overlap, opus_val32); /* NOTE(review): no SAVE_STACK/RESTORE_STACK pair is visible in this function -- confirm that is intended. */
+   c=0; do {
+      decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+overlap); /* Channel c's slice of the shared buffer. */
+   } while (++c<CC);
+
+   c=0; do {
+      /* Apply the pre-filter to the MDCT overlap for the next frame because
+         the post-filter will be re-applied in the decoder after the MDCT
+         overlap. */
+      comb_filter(etmp, decode_mem[c]+DECODE_BUFFER_SIZE-N,
+         st->postfilter_period_old, st->postfilter_period, overlap,
+         -st->postfilter_gain_old, -st->postfilter_gain, /* Stored post-filter gains, negated. */
+         st->postfilter_tapset_old, st->postfilter_tapset, NULL, 0, st->arch);
+
+      /* Simulate TDAC on the concealed audio so that it blends with the
+         MDCT of the next frame. */
+      for (i=0;i<overlap/2;i++) /* Only the first overlap/2 samples of the region are rewritten. */
+      {
+         decode_mem[c][DECODE_BUFFER_SIZE-N+i] =
+            MULT16_32_Q15(mode->window[i], etmp[overlap-1-i]) /* Mirrored sample, weighted by window[i]. */
+            + MULT16_32_Q15(mode->window[overlap-i-1], etmp[i]); /* Direct sample, weighted by the mirrored window tap. */
+      }
+   } while (++c<CC);
+}
+
+#ifdef ENABLE_DEEP_PLC
+
+#define SINC_ORDER 48
+/* h=cos(pi/2*abs(sin([-24:24]/48*pi*23./24)).^2);
+   b=sinc([-24:24]/3*1.02).*h;
+   b=b/sum(b); */
+static const float sinc_filter[SINC_ORDER+1] = {
+    4.2931e-05f, -0.000190293f, -0.000816132f, -0.000637162f, 0.00141662f, 0.00354764f, 0.00184368f, -0.00428274f,
+    -0.00856105f, -0.0034003f, 0.00930201f, 0.0159616f, 0.00489785f, -0.0169649f, -0.0259484f, -0.00596856f,
+    0.0286551f, 0.0405872f, 0.00649994f, -0.0509284f, -0.0716655f, -0.00665212f,  0.134336f,  0.278927f,
+    0.339995f,  0.278927f,  0.134336f, -0.00665212f, -0.0716655f, -0.0509284f, 0.00649994f, 0.0405872f,
+    0.0286551f, -0.00596856f, -0.0259484f, -0.0169649f, 0.00489785f, 0.0159616f, 0.00930201f, -0.0034003f,
+    -0.00856105f, -0.00428274f, 0.00184368f, 0.00354764f, 0.00141662f, -0.000637162f, -0.000816132f, -0.000190293f,
+    4.2931e-05f
+};
+
+void update_plc_state(LPCNetPLCState *lpcnet, celt_sig *decode_mem[2], float *plc_preemphasis_mem, int CC) /* Feeds a filtered, 3:1 down-sampled copy of the decode buffer tail to lpcnet_plc_update(). */
+{
+   int i;
+   int tmp_read_post, tmp_fec_skip; /* Saved copies of lpcnet->fec_read_pos / lpcnet->fec_skip. */
+   int offset;
+   celt_sig buf48k[DECODE_BUFFER_SIZE];
+   opus_int16 buf16k[PLC_UPDATE_SAMPLES];
+   if (CC == 1) OPUS_COPY(buf48k, decode_mem[0], DECODE_BUFFER_SIZE); /* Single channel: plain copy. */
+   else {
+      for (i=0;i<DECODE_BUFFER_SIZE;i++) {
+         buf48k[i] = .5*(decode_mem[0][i] + decode_mem[1][i]); /* Two channels: average them into one. */
+      }
+   }
+   /* Run the first-order PREEMPHASIS recursion in place; the last 40 ms are then down-sampled below. */
+   for (i=1;i<DECODE_BUFFER_SIZE;i++) buf48k[i] += PREEMPHASIS*buf48k[i-1];
+   *plc_preemphasis_mem = buf48k[DECODE_BUFFER_SIZE-1]; /* Hand the final filtered sample back to the caller. */
+   offset = DECODE_BUFFER_SIZE-SINC_ORDER-1 - 3*(PLC_UPDATE_SAMPLES-1); /* Chosen so the last filter tap of the last output reads the last buffer sample (asserted next). */
+   celt_assert(3*(PLC_UPDATE_SAMPLES-1) + SINC_ORDER + offset == DECODE_BUFFER_SIZE-1);
+   for (i=0;i<PLC_UPDATE_SAMPLES;i++) {
+      int j;
+      float sum = 0;
+      for (j=0;j<SINC_ORDER+1;j++) {
+         sum += buf48k[3*i + j + offset]*sinc_filter[j]; /* Input advances 3 samples per output sample. */
+      }
+      buf16k[i] = float2int(MIN32(32767.f, MAX32(-32767.f, sum))); /* Clamp to [-32767, 32767] before conversion. */
+   }
+   tmp_read_post = lpcnet->fec_read_pos;
+   tmp_fec_skip = lpcnet->fec_skip;
+   for (i=0;i<PLC_UPDATE_FRAMES;i++) {
+      lpcnet_plc_update(lpcnet, &buf16k[FRAME_SIZE*i]); /* One call per FRAME_SIZE-sample chunk. */
+   }
+   lpcnet->fec_read_pos = tmp_read_post; /* Put back the values saved before the update calls. */
+   lpcnet->fec_skip = tmp_fec_skip;
+}
+#endif
+
+static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM
+#ifdef ENABLE_DEEP_PLC
+      ,LPCNetPLCState *lpcnet
+#endif
+      )
 {
    int c;
    int i;
@@ -527,22 +633,22 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
       out_syn[c] = decode_mem[c]+DECODE_BUFFER_SIZE-N;
    } while (++c<C);
    lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+overlap)*C);
-   oldBandE = lpc+C*LPC_ORDER;
+   oldBandE = lpc+C*CELT_LPC_ORDER;
    oldLogE = oldBandE + 2*nbEBands;
    oldLogE2 = oldLogE + 2*nbEBands;
    backgroundLogE = oldLogE2  + 2*nbEBands;
 
    loss_duration = st->loss_duration;
    start = st->start;
+#ifdef ENABLE_DEEP_PLC
+   noise_based = start != 0 || (lpcnet->fec_fill_pos == 0 && (st->skip_plc || loss_duration >= 80));
+#else
    noise_based = loss_duration >= 40 || start != 0 || st->skip_plc;
+#endif
    if (noise_based)
    {
       /* Noise-based PLC/CNG */
-#ifdef NORM_ALIASING_HACK
-      celt_norm *X;
-#else
       VARDECL(celt_norm, X);
-#endif
       opus_uint32 seed;
       int end;
       int effEnd;
@@ -550,18 +656,16 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
       end = st->end;
       effEnd = IMAX(start, IMIN(end, mode->effEBands));
 
-#ifdef NORM_ALIASING_HACK
-      /* This is an ugly hack that breaks aliasing rules and would be easily broken,
-         but it saves almost 4kB of stack. */
-      X = (celt_norm*)(out_syn[C-1]+overlap/2);
-#else
       ALLOC(X, C*N, celt_norm);   /**< Interleaved normalised MDCTs */
-#endif
       c=0; do {
          OPUS_MOVE(decode_mem[c], decode_mem[c]+N,
-               DECODE_BUFFER_SIZE-N+(overlap>>1));
+               DECODE_BUFFER_SIZE-N+overlap);
       } while (++c<C);
 
+      if (st->prefilter_and_fold) {
+         prefilter_and_fold(st, N);
+      }
+
       /* Energy decay */
       decay = loss_duration==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT);
       c=0; do
@@ -590,6 +694,9 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
       st->rng = seed;
 
       celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd, C, C, 0, LM, st->downsample, 0, st->arch);
+      st->prefilter_and_fold = 0;
+      /* Skip regular PLC until we get two consecutive packets. */
+      st->skip_plc = 1;
    } else {
       int exc_length;
       /* Pitch-based PLC */
@@ -597,12 +704,14 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
       opus_val16 *exc;
       opus_val16 fade = Q15ONE;
       int pitch_index;
-      VARDECL(opus_val32, etmp);
       VARDECL(opus_val16, _exc);
       VARDECL(opus_val16, fir_tmp);
 
       if (loss_duration == 0)
       {
+#ifdef ENABLE_DEEP_PLC
+        if (lpcnet->loaded) update_plc_state(lpcnet, decode_mem, &st->plc_preemphasis_mem, C);
+#endif
          st->last_pitch_index = pitch_index = celt_plc_pitch_search(decode_mem, C, st->arch);
       } else {
          pitch_index = st->last_pitch_index;
@@ -613,10 +722,9 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
          decaying signal, but we can't get more than MAX_PERIOD. */
       exc_length = IMIN(2*pitch_index, MAX_PERIOD);
 
-      ALLOC(etmp, overlap, opus_val32);
-      ALLOC(_exc, MAX_PERIOD+LPC_ORDER, opus_val16);
+      ALLOC(_exc, MAX_PERIOD+CELT_LPC_ORDER, opus_val16);
       ALLOC(fir_tmp, exc_length, opus_val16);
-      exc = _exc+LPC_ORDER;
+      exc = _exc+CELT_LPC_ORDER;
       window = mode->window;
       c=0; do {
          opus_val16 decay;
@@ -628,16 +736,16 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
          int j;
 
          buf = decode_mem[c];
-         for (i=0;i<MAX_PERIOD+LPC_ORDER;i++)
-            exc[i-LPC_ORDER] = SROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD-LPC_ORDER+i], SIG_SHIFT);
+         for (i=0;i<MAX_PERIOD+CELT_LPC_ORDER;i++)
+            exc[i-CELT_LPC_ORDER] = SROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD-CELT_LPC_ORDER+i], SIG_SHIFT);
 
          if (loss_duration == 0)
          {
-            opus_val32 ac[LPC_ORDER+1];
+            opus_val32 ac[CELT_LPC_ORDER+1];
             /* Compute LPC coefficients for the last MAX_PERIOD samples before
                the first loss so we can work in the excitation-filter domain. */
             _celt_autocorr(exc, ac, window, overlap,
-                   LPC_ORDER, MAX_PERIOD, st->arch);
+                   CELT_LPC_ORDER, MAX_PERIOD, st->arch);
             /* Add a noise floor of -40 dB. */
 #ifdef FIXED_POINT
             ac[0] += SHR32(ac[0],13);
@@ -645,7 +753,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
             ac[0] *= 1.0001f;
 #endif
             /* Use lag windowing to stabilize the Levinson-Durbin recursion. */
-            for (i=1;i<=LPC_ORDER;i++)
+            for (i=1;i<=CELT_LPC_ORDER;i++)
             {
                /*ac[i] *= exp(-.5*(2*M_PI*.002*i)*(2*M_PI*.002*i));*/
 #ifdef FIXED_POINT
@@ -654,7 +762,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
                ac[i] -= ac[i]*(0.008f*0.008f)*i*i;
 #endif
             }
-            _celt_lpc(lpc+c*LPC_ORDER, ac, LPC_ORDER);
+            _celt_lpc(lpc+c*CELT_LPC_ORDER, ac, CELT_LPC_ORDER);
 #ifdef FIXED_POINT
          /* For fixed-point, apply bandwidth expansion until we can guarantee that
             no overflow can happen in the IIR filter. This means:
@@ -662,13 +770,13 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
          while (1) {
             opus_val16 tmp=Q15ONE;
             opus_val32 sum=QCONST16(1., SIG_SHIFT);
-            for (i=0;i<LPC_ORDER;i++)
-               sum += ABS16(lpc[c*LPC_ORDER+i]);
+            for (i=0;i<CELT_LPC_ORDER;i++)
+               sum += ABS16(lpc[c*CELT_LPC_ORDER+i]);
             if (sum < 65535) break;
-            for (i=0;i<LPC_ORDER;i++)
+            for (i=0;i<CELT_LPC_ORDER;i++)
             {
                tmp = MULT16_16_Q15(QCONST16(.99f,15), tmp);
-               lpc[c*LPC_ORDER+i] = MULT16_16_Q15(lpc[c*LPC_ORDER+i], tmp);
+               lpc[c*CELT_LPC_ORDER+i] = MULT16_16_Q15(lpc[c*CELT_LPC_ORDER+i], tmp);
             }
          }
 #endif
@@ -678,8 +786,8 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
          {
             /* Compute the excitation for exc_length samples before the loss. We need the copy
                because celt_fir() cannot filter in-place. */
-            celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER,
-                  fir_tmp, exc_length, LPC_ORDER, st->arch);
+            celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*CELT_LPC_ORDER,
+                  fir_tmp, exc_length, CELT_LPC_ORDER, st->arch);
             OPUS_COPY(exc+MAX_PERIOD-exc_length, fir_tmp, exc_length);
          }
 
@@ -737,15 +845,15 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
             S1 += SHR32(MULT16_16(tmp, tmp), 10);
          }
          {
-            opus_val16 lpc_mem[LPC_ORDER];
+            opus_val16 lpc_mem[CELT_LPC_ORDER];
             /* Copy the last decoded samples (prior to the overlap region) to
                synthesis filter memory so we can have a continuous signal. */
-            for (i=0;i<LPC_ORDER;i++)
+            for (i=0;i<CELT_LPC_ORDER;i++)
                lpc_mem[i] = SROUND16(buf[DECODE_BUFFER_SIZE-N-1-i], SIG_SHIFT);
             /* Apply the synthesis filter to convert the excitation back into
                the signal domain. */
-            celt_iir(buf+DECODE_BUFFER_SIZE-N, lpc+c*LPC_ORDER,
-                  buf+DECODE_BUFFER_SIZE-N, extrapolation_len, LPC_ORDER,
+            celt_iir(buf+DECODE_BUFFER_SIZE-N, lpc+c*CELT_LPC_ORDER,
+                  buf+DECODE_BUFFER_SIZE-N, extrapolation_len, CELT_LPC_ORDER,
                   lpc_mem, st->arch);
 #ifdef FIXED_POINT
             for (i=0; i < extrapolation_len; i++)
@@ -792,23 +900,65 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
             }
          }
 
-         /* Apply the pre-filter to the MDCT overlap for the next frame because
-            the post-filter will be re-applied in the decoder after the MDCT
-            overlap. */
-         comb_filter(etmp, buf+DECODE_BUFFER_SIZE,
-              st->postfilter_period, st->postfilter_period, overlap,
-              -st->postfilter_gain, -st->postfilter_gain,
-              st->postfilter_tapset, st->postfilter_tapset, NULL, 0, st->arch);
-
-         /* Simulate TDAC on the concealed audio so that it blends with the
-            MDCT of the next frame. */
-         for (i=0;i<overlap/2;i++)
-         {
-            buf[DECODE_BUFFER_SIZE+i] =
-               MULT16_32_Q15(window[i], etmp[overlap-1-i])
-               + MULT16_32_Q15(window[overlap-i-1], etmp[i]);
-         }
       } while (++c<C);
+
+#ifdef ENABLE_DEEP_PLC
+      if (lpcnet->loaded && (st->complexity >= 5 || lpcnet->fec_fill_pos > 0)) {
+         float overlap_mem;
+         int samples_needed16k;
+         celt_sig *buf;
+         VARDECL(float, buf_copy);
+         buf = decode_mem[0];
+         ALLOC(buf_copy, C*overlap, float);
+         c=0; do {
+            OPUS_COPY(buf_copy+c*overlap, &decode_mem[c][DECODE_BUFFER_SIZE-N], overlap);
+         } while (++c<C);
+
+         /* Need enough samples from the PLC to cover the frame size, resampling delay,
+            and the overlap at the end. */
+         samples_needed16k = (N+SINC_ORDER+overlap)/3;
+         if (loss_duration == 0) {
+            st->plc_fill = 0;
+         }
+         while (st->plc_fill < samples_needed16k) {
+            lpcnet_plc_conceal(lpcnet, &st->plc_pcm[st->plc_fill]);
+            st->plc_fill += FRAME_SIZE;
+         }
+         /* Resample to 48 kHz. */
+         for (i=0;i<(N+overlap)/3;i++) {
+            int j;
+            float sum;
+            for (sum=0, j=0;j<17;j++) sum += 3*st->plc_pcm[i+j]*sinc_filter[3*j];
+            buf[DECODE_BUFFER_SIZE-N+3*i] = sum;
+            for (sum=0, j=0;j<16;j++) sum += 3*st->plc_pcm[i+j+1]*sinc_filter[3*j+2];
+            buf[DECODE_BUFFER_SIZE-N+3*i+1] = sum;
+            for (sum=0, j=0;j<16;j++) sum += 3*st->plc_pcm[i+j+1]*sinc_filter[3*j+1];
+            buf[DECODE_BUFFER_SIZE-N+3*i+2] = sum;
+         }
+         OPUS_MOVE(st->plc_pcm, &st->plc_pcm[N/3], st->plc_fill-N/3);
+         st->plc_fill -= N/3;
+         for (i=0;i<N;i++) {
+            float tmp = buf[DECODE_BUFFER_SIZE-N+i];
+            buf[DECODE_BUFFER_SIZE-N+i] -= PREEMPHASIS*st->plc_preemphasis_mem;
+            st->plc_preemphasis_mem = tmp;
+         }
+         overlap_mem = st->plc_preemphasis_mem;
+         for (i=0;i<overlap;i++) {
+            float tmp = buf[DECODE_BUFFER_SIZE+i];
+            buf[DECODE_BUFFER_SIZE+i] -= PREEMPHASIS*overlap_mem;
+            overlap_mem = tmp;
+         }
+         /* For now, we just do mono PLC. */
+         if (C==2) OPUS_COPY(decode_mem[1], decode_mem[0], DECODE_BUFFER_SIZE+overlap);
+         c=0; do {
+            /* Cross-fade with 48-kHz non-neural PLC for the first 2.5 ms to avoid a discontinuity. */
+            if (loss_duration == 0) {
+               for (i=0;i<overlap;i++) decode_mem[c][DECODE_BUFFER_SIZE-N+i] = (1-window[i])*buf_copy[c*overlap+i] + (window[i])*decode_mem[c][DECODE_BUFFER_SIZE-N+i];
+            }
+         } while (++c<C);
+      }
+#endif
+      st->prefilter_and_fold = 1;
    }
 
    /* Saturate to soemthing large to avoid wrap-around. */
@@ -817,18 +967,18 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
    RESTORE_STACK;
 }
 
-int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data,
-      int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum)
+int celt_decode_with_ec_dred(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data,
+      int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum
+#ifdef ENABLE_DEEP_PLC
+      ,LPCNetPLCState *lpcnet
+#endif
+      )
 {
    int c, i, N;
    int spread_decision;
    opus_int32 bits;
    ec_dec _dec;
-#ifdef NORM_ALIASING_HACK
-   celt_norm *X;
-#else
    VARDECL(celt_norm, X);
-#endif
    VARDECL(int, fine_quant);
    VARDECL(int, pulses);
    VARDECL(int, cap);
@@ -881,7 +1031,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
    frame_size *= st->downsample;
 
    lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+overlap)*CC);
-   oldBandE = lpc+CC*LPC_ORDER;
+   oldBandE = lpc+CC*CELT_LPC_ORDER;
    oldLogE = oldBandE + 2*nbEBands;
    oldLogE2 = oldLogE + 2*nbEBands;
    backgroundLogE = oldLogE2  + 2*nbEBands;
@@ -935,15 +1085,25 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
 
    if (data == NULL || len<=1)
    {
-      celt_decode_lost(st, N, LM);
+      celt_decode_lost(st, N, LM
+#ifdef ENABLE_DEEP_PLC
+      , lpcnet
+#endif
+                      );
       deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum);
       RESTORE_STACK;
       return frame_size/st->downsample;
    }
+#ifdef ENABLE_DEEP_PLC
+   else {
+      /* FIXME: This is a bit of a hack just to make sure opus_decode_native() knows we're no longer in PLC. */
+      if (lpcnet) lpcnet->blend = 0;
+   }
+#endif
 
    /* Check if there are at least two packets received consecutively before
     * turning on the pitch-based PLC */
-   st->skip_plc = st->loss_duration != 0;
+   if (st->loss_duration == 0) st->skip_plc = 0;
 
    if (dec == NULL)
    {
@@ -1006,6 +1166,36 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
 
    /* Decode the global flags (first symbols in the stream) */
    intra_ener = tell+3<=total_bits ? ec_dec_bit_logp(dec, 3) : 0;
+   /* If recovering from packet loss, make sure we make the energy prediction safe to reduce the
+      risk of getting loud artifacts. */
+   if (!intra_ener && st->loss_duration != 0) {
+      c=0; do
+      {
+         opus_val16 safety = 0;
+         int missing = IMIN(10, st->loss_duration>>LM);
+         if (LM==0) safety = QCONST16(1.5f,DB_SHIFT);
+         else if (LM==1) safety = QCONST16(.5f,DB_SHIFT);
+         for (i=start;i<end;i++)
+         {
+            if (oldBandE[c*nbEBands+i] < MAX16(oldLogE[c*nbEBands+i], oldLogE2[c*nbEBands+i])) {
+               /* If energy is going down already, continue the trend. */
+               opus_val32 slope;
+               opus_val32 E0, E1, E2;
+               E0 = oldBandE[c*nbEBands+i];
+               E1 = oldLogE[c*nbEBands+i];
+               E2 = oldLogE2[c*nbEBands+i];
+               slope = MAX32(E1 - E0, HALF32(E2 - E0));
+               E0 -= MAX32(0, (1+missing)*slope);
+               oldBandE[c*nbEBands+i] = MAX32(-QCONST16(20.f,DB_SHIFT), E0);
+            } else {
+               /* Otherwise take the min of the last frames. */
+               oldBandE[c*nbEBands+i] = MIN16(MIN16(oldBandE[c*nbEBands+i], oldLogE[c*nbEBands+i]), oldLogE2[c*nbEBands+i]);
+            }
+            /* Shorter frames have more natural fluctuations -- play it safe. */
+            oldBandE[c*nbEBands+i] -= safety;
+         }
+      } while (++c<2);
+   }
    /* Get band energies */
    unquant_coarse_energy(mode, start, end, oldBandE,
          intra_ener, dec, C, LM);
@@ -1073,19 +1263,13 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
    unquant_fine_energy(mode, start, end, oldBandE, fine_quant, dec, C);
 
    c=0; do {
-      OPUS_MOVE(decode_mem[c], decode_mem[c]+N, DECODE_BUFFER_SIZE-N+overlap/2);
+      OPUS_MOVE(decode_mem[c], decode_mem[c]+N, DECODE_BUFFER_SIZE-N+overlap);
    } while (++c<CC);
 
    /* Decode fixed codebook */
    ALLOC(collapse_masks, C*nbEBands, unsigned char);
 
-#ifdef NORM_ALIASING_HACK
-   /* This is an ugly hack that breaks aliasing rules and would be easily broken,
-      but it saves almost 4kB of stack. */
-   X = (celt_norm*)(out_syn[CC-1]+overlap/2);
-#else
    ALLOC(X, C*N, celt_norm);   /**< Interleaved normalised MDCTs */
-#endif
 
    quant_all_bands(0, mode, start, end, X, C==2 ? X+N : NULL, collapse_masks,
          NULL, pulses, shortBlocks, spread_decision, dual_stereo, intensity, tf_res,
@@ -1109,7 +1293,9 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
       for (i=0;i<C*nbEBands;i++)
          oldBandE[i] = -QCONST16(28.f,DB_SHIFT);
    }
-
+   if (st->prefilter_and_fold) {
+      prefilter_and_fold(st, N);
+   }
    celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd,
                   C, CC, isTransient, LM, st->downsample, silence, st->arch);
 
@@ -1173,6 +1359,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
 
    deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum);
    st->loss_duration = 0;
+   st->prefilter_and_fold = 0;
    RESTORE_STACK;
    if (ec_tell(dec) > 8*len)
       return OPUS_INTERNAL_ERROR;
@@ -1181,6 +1368,15 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
    return frame_size/st->downsample;
 }
 
+int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data,
+      int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum)
+{ /* Wrapper keeping the previous signature: forwards all arguments unchanged to celt_decode_with_ec_dred(). */
+   return celt_decode_with_ec_dred(st, data, len, pcm, frame_size, dec, accum
+#ifdef ENABLE_DEEP_PLC
+       , NULL /* no LPCNetPLCState is passed through this entry point */
+#endif
+       );
+}
 
 #ifdef CUSTOM_MODES
 
@@ -1254,6 +1450,26 @@ int opus_custom_decoder_ctl(CELTDecoder * OPUS_RESTRICT st, int request, ...)
    va_start(ap, request);
    switch (request)
    {
+      case OPUS_SET_COMPLEXITY_REQUEST:
+      {
+          opus_int32 value = va_arg(ap, opus_int32);
+          if(value<0 || value>10)
+          {
+             goto bad_arg;
+          }
+          st->complexity = value;
+      }
+      break;
+      case OPUS_GET_COMPLEXITY_REQUEST:
+      {
+          opus_int32 *value = va_arg(ap, opus_int32*);
+          if (!value)
+          {
+             goto bad_arg;
+          }
+          *value = st->complexity;
+      }
+      break;
       case CELT_SET_START_BAND_REQUEST:
       {
          opus_int32 value = va_arg(ap, opus_int32);
@@ -1300,7 +1516,7 @@ int opus_custom_decoder_ctl(CELTDecoder * OPUS_RESTRICT st, int request, ...)
          int i;
          opus_val16 *lpc, *oldBandE, *oldLogE, *oldLogE2;
          lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+st->overlap)*st->channels);
-         oldBandE = lpc+st->channels*LPC_ORDER;
+         oldBandE = lpc+st->channels*CELT_LPC_ORDER;
          oldLogE = oldBandE + 2*st->mode->nbEBands;
          oldLogE2 = oldLogE + 2*st->mode->nbEBands;
          OPUS_CLEAR((char*)&st->DECODER_RESET_START,
diff --git a/media/libopus/celt/celt_encoder.c b/media/libopus/celt/celt_encoder.c
index 637d442cf7..7f32a801c6 100644
--- a/media/libopus/celt/celt_encoder.c
+++ b/media/libopus/celt/celt_encoder.c
@@ -281,6 +281,9 @@ static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int
       /* High-pass filter: (1 - 2*z^-1 + z^-2) / (1 - z^-1 + .5*z^-2) */
       for (i=0;i<len;i++)
       {
+#ifndef FIXED_POINT
+         float mem00;
+#endif
          opus_val32 x,y;
          x = SHR32(in[i+c*len],SIG_SHIFT);
          y = ADD32(mem0, x);
@@ -288,8 +291,13 @@ static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int
          mem0 = mem1 + y - SHL32(x,1);
          mem1 = x - SHR32(y,1);
 #else
+         /* Original code:
          mem0 = mem1 + y - 2*x;
          mem1 = x - .5f*y;
+         Modified code to shorten dependency chains: */
+         mem00=mem0;
+         mem0 = mem0 - x + .5f*mem1;
+         mem1 =  x - mem00;
 #endif
          tmp[i] = SROUND16(y, 2);
          /*printf("%f ", tmp[i]);*/
@@ -322,10 +330,11 @@ static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int
 #ifdef FIXED_POINT
          /* FIXME: Use PSHR16() instead */
          tmp[i] = mem0 + PSHR32(x2-mem0,forward_shift);
+         mem0 = tmp[i];
 #else
-         tmp[i] = mem0 + MULT16_16_P15(forward_decay,x2-mem0);
+         mem0 = x2 + (1.f-forward_decay)*mem0;
+         tmp[i] = forward_decay*mem0;
 #endif
-         mem0 = tmp[i];
       }
 
       mem0=0;
@@ -337,11 +346,13 @@ static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int
 #ifdef FIXED_POINT
          /* FIXME: Use PSHR16() instead */
          tmp[i] = mem0 + PSHR32(tmp[i]-mem0,3);
-#else
-         tmp[i] = mem0 + MULT16_16_P15(QCONST16(0.125f,15),tmp[i]-mem0);
-#endif
          mem0 = tmp[i];
          maxE = MAX16(maxE, mem0);
+#else
+         mem0 = tmp[i] + 0.875f*mem0;
+         tmp[i] = 0.125f*mem0;
+         maxE = MAX16(maxE, 0.125f*mem0);
+#endif
       }
       /*for (i=0;i<len2;i++)printf("%f ", tmp[i]/mean);printf("\n");*/
 
@@ -967,7 +978,7 @@ static opus_val16 median_of_3(const opus_val16 *x)
       return t0;
 }
 
-static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16 *bandLogE2,
+static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16 *bandLogE2, const opus_val16 *oldBandE,
       int nbEBands, int start, int end, int C, int *offsets, int lsb_depth, const opus_int16 *logN,
       int isTransient, int vbr, int constrained_vbr, const opus_int16 *eBands, int LM,
       int effectiveBytes, opus_int32 *tot_boost_, int lfe, opus_val16 *surround_dynalloc,
@@ -978,9 +989,11 @@ static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16
    opus_val16 maxDepth;
    VARDECL(opus_val16, follower);
    VARDECL(opus_val16, noise_floor);
+   VARDECL(opus_val16, bandLogE3);
    SAVE_STACK;
    ALLOC(follower, C*nbEBands, opus_val16);
    ALLOC(noise_floor, C*nbEBands, opus_val16);
+   ALLOC(bandLogE3, nbEBands, opus_val16);
    OPUS_CLEAR(offsets, nbEBands);
    /* Dynamic allocation code */
    maxDepth=-QCONST16(31.9f, DB_SHIFT);
@@ -1033,8 +1046,10 @@ static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16
          printf("%d ", spread_weight[i]);
       printf("\n");*/
    }
-   /* Make sure that dynamic allocation can't make us bust the budget */
-   if (effectiveBytes > 50 && LM>=1 && !lfe)
+   /* Make sure that dynamic allocation can't make us bust the budget.
+      We enable the feature starting at 24 kb/s for 20-ms frames
+      and 96 kb/s for 2.5 ms frames.  */
+   if (effectiveBytes >= (30 + 5*LM) && !lfe)
    {
       int last=0;
       c=0;do
@@ -1042,30 +1057,38 @@ static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16
          opus_val16 offset;
          opus_val16 tmp;
          opus_val16 *f;
+         OPUS_COPY(bandLogE3, &bandLogE2[c*nbEBands], end);
+         if (LM==0) {
+            /* For 2.5 ms frames, the first 8 bands have just one bin, so the
+               energy is highly unreliable (high variance). For that reason,
+               we take the max with the previous energy so that at least 2 bins
+               are getting used. */
+            for (i=0;i<IMIN(8,end);i++) bandLogE3[i] = MAX16(bandLogE2[c*nbEBands+i], oldBandE[c*nbEBands+i]);
+         }
          f = &follower[c*nbEBands];
-         f[0] = bandLogE2[c*nbEBands];
+         f[0] = bandLogE3[0];
          for (i=1;i<end;i++)
          {
             /* The last band to be at least 3 dB higher than the previous one
                is the last we'll consider. Otherwise, we run into problems on
                bandlimited signals. */
-            if (bandLogE2[c*nbEBands+i] > bandLogE2[c*nbEBands+i-1]+QCONST16(.5f,DB_SHIFT))
+            if (bandLogE3[i] > bandLogE3[i-1]+QCONST16(.5f,DB_SHIFT))
                last=i;
-            f[i] = MIN16(f[i-1]+QCONST16(1.5f,DB_SHIFT), bandLogE2[c*nbEBands+i]);
+            f[i] = MIN16(f[i-1]+QCONST16(1.5f,DB_SHIFT), bandLogE3[i]);
          }
          for (i=last-1;i>=0;i--)
-            f[i] = MIN16(f[i], MIN16(f[i+1]+QCONST16(2.f,DB_SHIFT), bandLogE2[c*nbEBands+i]));
+            f[i] = MIN16(f[i], MIN16(f[i+1]+QCONST16(2.f,DB_SHIFT), bandLogE3[i]));
 
          /* Combine with a median filter to avoid dynalloc triggering unnecessarily.
             The "offset" value controls how conservative we are -- a higher offset
             reduces the impact of the median filter and makes dynalloc use more bits. */
          offset = QCONST16(1.f, DB_SHIFT);
          for (i=2;i<end-2;i++)
-            f[i] = MAX16(f[i], median_of_5(&bandLogE2[c*nbEBands+i-2])-offset);
-         tmp = median_of_3(&bandLogE2[c*nbEBands])-offset;
+            f[i] = MAX16(f[i], median_of_5(&bandLogE3[i-2])-offset);
+         tmp = median_of_3(&bandLogE3[0])-offset;
          f[0] = MAX16(f[0], tmp);
          f[1] = MAX16(f[1], tmp);
-         tmp = median_of_3(&bandLogE2[c*nbEBands+end-3])-offset;
+         tmp = median_of_3(&bandLogE3[end-3])-offset;
          f[end-2] = MAX16(f[end-2], tmp);
          f[end-1] = MAX16(f[end-1], tmp);
 
@@ -1565,10 +1588,13 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
       vbr_rate = 0;
       tmp = st->bitrate*frame_size;
       if (tell>1)
-         tmp += tell;
+         tmp += tell*mode->Fs;
       if (st->bitrate!=OPUS_BITRATE_MAX)
+      {
          nbCompressedBytes = IMAX(2, IMIN(nbCompressedBytes,
                (tmp+4*mode->Fs)/(8*mode->Fs)-!!st->signalling));
+         ec_enc_shrink(enc, nbCompressedBytes);
+      }
       effectiveBytes = nbCompressedBytes - nbFilledBytes;
    }
    equiv_rate = ((opus_int32)nbCompressedBytes*8*50 << (3-LM)) - (40*C+20)*((400>>LM) - 50);
@@ -1882,7 +1908,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
    ALLOC(importance, nbEBands, int);
    ALLOC(spread_weight, nbEBands, int);
 
-   maxDepth = dynalloc_analysis(bandLogE, bandLogE2, nbEBands, start, end, C, offsets,
+   maxDepth = dynalloc_analysis(bandLogE, bandLogE2, oldBandE, nbEBands, start, end, C, offsets,
          st->lsb_depth, mode->logN, isTransient, st->vbr, st->constrained_vbr,
          eBands, LM, effectiveBytes, &tot_boost, st->lfe, surround_dynalloc, &st->analysis, importance, spread_weight);
 
@@ -2246,7 +2272,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
       if (anti_collapse_on)
       {
          anti_collapse(mode, X, collapse_masks, LM, C, N,
-               start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng);
+               start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng, st->arch);
       }
 
       c=0; do {
@@ -2265,15 +2291,15 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
          st->prefilter_period_old=IMAX(st->prefilter_period_old, COMBFILTER_MINPERIOD);
          comb_filter(out_mem[c], out_mem[c], st->prefilter_period_old, st->prefilter_period, mode->shortMdctSize,
                st->prefilter_gain_old, st->prefilter_gain, st->prefilter_tapset_old, st->prefilter_tapset,
-               mode->window, overlap);
+               mode->window, overlap, st->arch);
          if (LM!=0)
             comb_filter(out_mem[c]+mode->shortMdctSize, out_mem[c]+mode->shortMdctSize, st->prefilter_period, pitch_index, N-mode->shortMdctSize,
                   st->prefilter_gain, gain1, st->prefilter_tapset, prefilter_tapset,
-                  mode->window, overlap);
+                  mode->window, overlap, st->arch);
       } while (++c<CC);
 
       /* We reuse freq[] as scratch space for the de-emphasis */
-      deemphasis(out_mem, (opus_val16*)pcm, N, CC, st->upsample, mode->preemph, st->preemph_memD);
+      deemphasis(out_mem, (opus_val16*)pcm, N, CC, st->upsample, mode->preemph, st->preemph_memD, 0);
       st->prefilter_period_old = st->prefilter_period;
       st->prefilter_gain_old = st->prefilter_gain;
       st->prefilter_tapset_old = st->prefilter_tapset;
diff --git a/media/libopus/celt/celt_lpc.c b/media/libopus/celt/celt_lpc.c
index f91721bcab..fabca65cb3 100644
--- a/media/libopus/celt/celt_lpc.c
+++ b/media/libopus/celt/celt_lpc.c
@@ -44,7 +44,7 @@ int          p
    opus_val32 r;
    opus_val32 error = ac[0];
 #ifdef FIXED_POINT
-   opus_val32 lpc[LPC_ORDER];
+   opus_val32 lpc[CELT_LPC_ORDER];
 #else
    float *lpc = _lpc;
 #endif
@@ -158,7 +158,17 @@ void celt_fir_c(
       sum[1] = SHL32(EXTEND32(x[i+1]), SIG_SHIFT);
       sum[2] = SHL32(EXTEND32(x[i+2]), SIG_SHIFT);
       sum[3] = SHL32(EXTEND32(x[i+3]), SIG_SHIFT);
-      xcorr_kernel(rnum, x+i-ord, sum, ord, arch);
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+      {
+         opus_val32 sum_c[4];
+         memcpy(sum_c, sum, sizeof(sum_c));
+         xcorr_kernel_c(rnum, x+i-ord, sum_c, ord);
+#endif
+         xcorr_kernel(rnum, x+i-ord, sum, ord, arch);
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+         celt_assert(memcmp(sum, sum_c, sizeof(sum)) == 0);
+      }
+#endif
       y[i  ] = SROUND16(sum[0], SIG_SHIFT);
       y[i+1] = SROUND16(sum[1], SIG_SHIFT);
       y[i+2] = SROUND16(sum[2], SIG_SHIFT);
@@ -222,8 +232,17 @@ void celt_iir(const opus_val32 *_x,
       sum[1]=_x[i+1];
       sum[2]=_x[i+2];
       sum[3]=_x[i+3];
-      xcorr_kernel(rden, y+i, sum, ord, arch);
-
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+      {
+         opus_val32 sum_c[4];
+         memcpy(sum_c, sum, sizeof(sum_c));
+         xcorr_kernel_c(rden, y+i, sum_c, ord);
+#endif
+         xcorr_kernel(rden, y+i, sum, ord, arch);
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+         celt_assert(memcmp(sum, sum_c, sizeof(sum)) == 0);
+      }
+#endif
       /* Patch up the result to compensate for the fact that this is an IIR */
       y[i+ord  ] = -SROUND16(sum[0],SIG_SHIFT);
       _y[i  ] = sum[0];
diff --git a/media/libopus/celt/celt_lpc.h b/media/libopus/celt/celt_lpc.h
index a4c5fd6ea5..97dee82f02 100644
--- a/media/libopus/celt/celt_lpc.h
+++ b/media/libopus/celt/celt_lpc.h
@@ -35,7 +35,7 @@
 #include "x86/celt_lpc_sse.h"
 #endif
 
-#define LPC_ORDER 24
+#define CELT_LPC_ORDER 24
 
 void _celt_lpc(opus_val16 *_lpc, const opus_val32 *ac, int p);
 
diff --git a/media/libopus/celt/cpu_support.h b/media/libopus/celt/cpu_support.h
index 7b5c56ca90..9f13d8aecf 100644
--- a/media/libopus/celt/cpu_support.h
+++ b/media/libopus/celt/cpu_support.h
@@ -35,19 +35,20 @@
   (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
 #include "arm/armcpu.h"
 
-/* We currently support 4 ARM variants:
+/* We currently support 5 ARM variants:
  * arch[0] -> ARMv4
  * arch[1] -> ARMv5E
  * arch[2] -> ARMv6
  * arch[3] -> NEON
+ * arch[4] -> NEON+DOTPROD
  */
-#define OPUS_ARCHMASK 3
+#define OPUS_ARCHMASK 7
 
 #elif defined(OPUS_HAVE_RTCD) && \
   ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
-  (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)))
+  (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)))
 
 #include "x86/x86cpu.h"
 /* We currently support 5 x86 variants:
diff --git a/media/libopus/celt/entdec.c b/media/libopus/celt/entdec.c
index 0b3433ed8b..027aa24bca 100644
--- a/media/libopus/celt/entdec.c
+++ b/media/libopus/celt/entdec.c
@@ -195,6 +195,27 @@ int ec_dec_icdf(ec_dec *_this,const unsigned char *_icdf,unsigned _ftb){
   return ret;
 }
 
+int ec_dec_icdf16(ec_dec *_this,const opus_uint16 *_icdf,unsigned _ftb){ /* Decodes one symbol from an "inverse" CDF table of opus_uint16 entries with _ftb bits of precision. */
+  opus_uint32 r;
+  opus_uint32 d;
+  opus_uint32 s;
+  opus_uint32 t;
+  int         ret;
+  s=_this->rng; /* s starts as the full current range */
+  d=_this->val;
+  r=s>>_ftb; /* range scaled down by the table precision */
+  ret=-1;
+  do{ /* walk the table until the scaled entry r*_icdf[ret] is no longer above d */
+    t=s; /* t keeps the previous bound, s becomes the bound for the candidate symbol */
+    s=IMUL32(r,_icdf[++ret]);
+  }
+  while(d<s);
+  _this->val=d-s; /* rebase the value onto the selected interval */
+  _this->rng=t-s; /* new range is the width between the two consecutive bounds */
+  ec_dec_normalize(_this);
+  return ret; /* index of the decoded symbol */
+}
+
 opus_uint32 ec_dec_uint(ec_dec *_this,opus_uint32 _ft){
   unsigned ft;
   unsigned s;
diff --git a/media/libopus/celt/entdec.h b/media/libopus/celt/entdec.h
index 025fc1870d..c81f26fdb2 100644
--- a/media/libopus/celt/entdec.h
+++ b/media/libopus/celt/entdec.h
@@ -81,6 +81,16 @@ int ec_dec_bit_logp(ec_dec *_this,unsigned _logp);
   Return: The decoded symbol s.*/
 int ec_dec_icdf(ec_dec *_this,const unsigned char *_icdf,unsigned _ftb);
 
+/*Decodes a symbol given an "inverse" CDF table.
+  No call to ec_dec_update() is necessary after this call.
+  _icdf: The "inverse" CDF, such that symbol s falls in the range
+          [s>0?ft-_icdf[s-1]:0,ft-_icdf[s]), where ft=1<<_ftb.
+         The values must be monotonically non-increasing, and the last value
+          must be 0.
+  _ftb: The number of bits of precision in the cumulative distribution.
+  Return: The decoded symbol s.*/
+int ec_dec_icdf16(ec_dec *_this,const opus_uint16 *_icdf,unsigned _ftb);
+
 /*Extracts a raw unsigned integer with a non-power-of-2 range from the stream.
   The bits must have been encoded with ec_enc_uint().
   No call to ec_dec_update() is necessary after this call.
diff --git a/media/libopus/celt/entenc.c b/media/libopus/celt/entenc.c
index f1750d25b8..69c6f835d0 100644
--- a/media/libopus/celt/entenc.c
+++ b/media/libopus/celt/entenc.c
@@ -172,6 +172,17 @@ void ec_enc_icdf(ec_enc *_this,int _s,const unsigned char *_icdf,unsigned _ftb){
   ec_enc_normalize(_this);
 }
 
+void ec_enc_icdf16(ec_enc *_this,int _s,const opus_uint16 *_icdf,unsigned _ftb){ /* Encodes symbol _s using an "inverse" CDF table of opus_uint16 entries with _ftb bits of precision. */
+  opus_uint32 r;
+  r=_this->rng>>_ftb; /* range scaled down by the table precision */
+  if(_s>0){ /* symbols after the first: advance val past the preceding symbols, then shrink rng to this symbol's share */
+    _this->val+=_this->rng-IMUL32(r,_icdf[_s-1]);
+    _this->rng=IMUL32(r,_icdf[_s-1]-_icdf[_s]);
+  }
+  else _this->rng-=IMUL32(r,_icdf[_s]); /* first symbol: val is unchanged, only the range shrinks */
+  ec_enc_normalize(_this);
+}
+
 void ec_enc_uint(ec_enc *_this,opus_uint32 _fl,opus_uint32 _ft){
   unsigned  ft;
   unsigned  fl;
diff --git a/media/libopus/celt/entenc.h b/media/libopus/celt/entenc.h
index f502eaf662..010874bbc1 100644
--- a/media/libopus/celt/entenc.h
+++ b/media/libopus/celt/entenc.h
@@ -64,6 +64,15 @@ void ec_enc_bit_logp(ec_enc *_this,int _val,unsigned _logp);
   _ftb: The number of bits of precision in the cumulative distribution.*/
 void ec_enc_icdf(ec_enc *_this,int _s,const unsigned char *_icdf,unsigned _ftb);
 
+/*Encodes a symbol given an "inverse" CDF table.
+  _s:    The index of the symbol to encode.
+  _icdf: The "inverse" CDF, such that symbol _s falls in the range
+          [_s>0?ft-_icdf[_s-1]:0,ft-_icdf[_s]), where ft=1<<_ftb.
+         The values must be monotonically non-increasing, and the last value
+          must be 0.
+  _ftb: The number of bits of precision in the cumulative distribution.*/
+void ec_enc_icdf16(ec_enc *_this,int _s,const opus_uint16 *_icdf,unsigned _ftb);
+
 /*Encodes a raw unsigned integer in the stream.
   _fl: The integer to encode.
   _ft: The number of integers that can be encoded (one more than the max).
diff --git a/media/libopus/celt/laplace.c b/media/libopus/celt/laplace.c
index a7bca874b6..2180966662 100644
--- a/media/libopus/celt/laplace.c
+++ b/media/libopus/celt/laplace.c
@@ -132,3 +132,104 @@ int ec_laplace_decode(ec_dec *dec, unsigned fs, int decay)
    ec_dec_update(dec, fl, IMIN(fl+fs,32768), 32768);
    return val;
 }
+
+void ec_laplace_encode_p0(ec_enc *enc, int value, opus_uint16 p0, opus_uint16 decay)
+{ /* Encodes value as a zero/positive/negative symbol followed, when non-zero, by its magnitude in escape-coded chunks. */
+   int s;
+   opus_uint16 sign_icdf[3];
+   sign_icdf[0] = 32768-p0; /* with 15-bit precision (ft=32768) symbol 0 covers [0,p0) */
+   sign_icdf[1] = sign_icdf[0]/2; /* the remainder is split in half between symbols 1 and 2 */
+   sign_icdf[2] = 0;
+   s = value == 0 ? 0 : (value > 0 ? 1 : 2); /* 0: zero, 1: positive, 2: negative */
+   ec_enc_icdf16(enc, s, sign_icdf, 15);
+   value = abs(value);
+   if (value)
+   {
+      int i;
+      opus_uint16 icdf[8]; /* 8-symbol magnitude table built from decay */
+      icdf[0] = IMAX(7, decay);
+      for (i=1;i<7;i++)
+      {
+         icdf[i] = IMAX(7-i, (icdf[i-1] * (opus_int32)decay) >> 15); /* scale the previous entry by decay>>15, but never below 7-i */
+      }
+      icdf[7] = 0;
+      value--; /* magnitude is at least 1 here, so code magnitude-1 */
+      do { /* emit symbols capped at 7; a 7 is followed by another symbol, mirroring the `while (v == 7)` loop in ec_laplace_decode_p0() */
+         ec_enc_icdf16(enc, IMIN(value, 7), icdf, 15);
+         value -= 7;
+      } while (value >= 0);
+   }
+}
+
+int ec_laplace_decode_p0(ec_dec *dec, opus_uint16 p0, opus_uint16 decay)
+{ /* Decodes a value written by ec_laplace_encode_p0(): a sign symbol, then magnitude chunks while the chunk equals 7. */
+   int s;
+   int value;
+   opus_uint16 sign_icdf[3]; /* same three-entry sign table as the encoder builds from p0 */
+   sign_icdf[0] = 32768-p0;
+   sign_icdf[1] = sign_icdf[0]/2;
+   sign_icdf[2] = 0;
+   s = ec_dec_icdf16(dec, sign_icdf, 15);
+   if (s==2) s = -1; /* symbol 2 is the negative sign; s becomes the multiplier -1 */
+   if (s != 0)
+   {
+      int i;
+      int v;
+      opus_uint16 icdf[8]; /* magnitude table, computed with the same expressions as in the encoder */
+      icdf[0] = IMAX(7, decay);
+      for (i=1;i<7;i++)
+      {
+         icdf[i] = IMAX(7-i, (icdf[i-1] * (opus_int32)decay) >> 15);
+      }
+      icdf[7] = 0;
+      value = 1; /* a non-zero sign implies a magnitude of at least 1 */
+      do { /* accumulate chunks; a chunk of 7 means another chunk follows */
+         v = ec_dec_icdf16(dec, icdf, 15);
+         value += v;
+      } while (v == 7);
+      return s*value;
+   } else return 0; /* sign symbol 0: the value is zero and nothing more is read */
+}
+
+#if 0 /* Disabled self-test: round-trips NB_VALS values through ec_laplace_encode_p0()/ec_laplace_decode_p0() and prints them before and after. */
+
+#include <stdio.h>
+#define NB_VALS 10
+#define DATA_SIZE 10000
+int main() {
+   ec_enc enc;
+   ec_dec dec;
+   unsigned char *ptr;
+   int i;
+   int decay, p0;
+   int val[NB_VALS] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+   /*for (i=0;i<NB_VALS;i++) {
+      val[i] = -log(rand()/(float)RAND_MAX);
+      if (rand()%2) val[i] = -val[i];
+   }*/
+   p0 = 16000;
+   decay = 16000;
+   ptr = (unsigned char *)malloc(DATA_SIZE); /* NOTE(review): never freed and not NULL-checked; only matters if this test is re-enabled */
+   ec_enc_init(&enc,ptr,DATA_SIZE);
+   for (i=0;i<NB_VALS;i++) { /* print the input values */
+      printf("%d ", val[i]);
+   }
+   printf("\n");
+   for (i=0;i<NB_VALS;i++) {
+      ec_laplace_encode_p0(&enc, val[i], p0, decay);
+   }
+
+   ec_enc_done(&enc);
+
+   ec_dec_init(&dec,ec_get_buffer(&enc),ec_range_bytes(&enc)); /* decode from the bytes the encoder just produced */
+
+   for (i=0;i<NB_VALS;i++) { /* overwrite val[] with the decoded values, using the same p0 and decay */
+      val[i] = ec_laplace_decode_p0(&dec, p0, decay);
+   }
+   for (i=0;i<NB_VALS;i++) { /* print the decoded values for visual comparison with the first line */
+      printf("%d ", val[i]);
+   }
+   printf("\n");
+}
+
+#endif
diff --git a/media/libopus/celt/laplace.h b/media/libopus/celt/laplace.h
index 46c14b5da5..8010ad9755 100644
--- a/media/libopus/celt/laplace.h
+++ b/media/libopus/celt/laplace.h
@@ -26,6 +26,9 @@
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
+#ifndef LAPLACE_H
+#define LAPLACE_H
+
 #include "entenc.h"
 #include "entdec.h"
 
@@ -46,3 +49,9 @@ void ec_laplace_encode(ec_enc *enc, int *value, unsigned fs, int decay);
  @return Value decoded
  */
 int ec_laplace_decode(ec_dec *dec, unsigned fs, int decay);
+
+
+int ec_laplace_decode_p0(ec_dec *dec, opus_uint16 p0, opus_uint16 decay);
+void ec_laplace_encode_p0(ec_enc *enc, int value, opus_uint16 p0, opus_uint16 decay);
+
+#endif
diff --git a/media/libopus/celt/mathops.h b/media/libopus/celt/mathops.h
index 478ac9187c..e2eece2937 100644
--- a/media/libopus/celt/mathops.h
+++ b/media/libopus/celt/mathops.h
@@ -230,6 +230,12 @@ static OPUS_INLINE opus_val32 celt_exp2_frac(opus_val16 x)
    frac = SHL16(x, 4);
    return ADD16(D0, MULT16_16_Q15(frac, ADD16(D1, MULT16_16_Q15(frac, ADD16(D2 , MULT16_16_Q15(D3,frac))))));
 }
+
+#undef D0
+#undef D1
+#undef D2
+#undef D3
+
 /** Base-2 exponential approximation (2^x). (Q10 input, Q16 output) */
 static OPUS_INLINE opus_val32 celt_exp2(opus_val16 x)
 {
diff --git a/media/libopus/celt/mips/celt_mipsr1.h b/media/libopus/celt/mips/celt_mipsr1.h
index c332fe0471..d1b25c204d 100644
--- a/media/libopus/celt/mips/celt_mipsr1.h
+++ b/media/libopus/celt/mips/celt_mipsr1.h
@@ -27,8 +27,8 @@
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#ifndef __CELT_MIPSR1_H__
-#define __CELT_MIPSR1_H__
+#ifndef CELT_MIPSR1_H__
+#define CELT_MIPSR1_H__
 
 #ifdef HAVE_CONFIG_H
 #include "config.h"
@@ -149,4 +149,4 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
    }
 }
 
-#endif /* __CELT_MIPSR1_H__ */
+#endif /* CELT_MIPSR1_H__ */
diff --git a/media/libopus/celt/mips/mdct_mipsr1.h b/media/libopus/celt/mips/mdct_mipsr1.h
index 2934dab776..7456c181a5 100644
--- a/media/libopus/celt/mips/mdct_mipsr1.h
+++ b/media/libopus/celt/mips/mdct_mipsr1.h
@@ -38,8 +38,8 @@
    MDCT implementation in FFMPEG, but has differences in signs, ordering
    and scaling in many places.
 */
-#ifndef __MDCT_MIPSR1_H__
-#define __MDCT_MIPSR1_H__
+#ifndef MDCT_MIPSR1_H__
+#define MDCT_MIPSR1_H__
 
 #ifndef SKIP_CONFIG_H
 #ifdef HAVE_CONFIG_H
@@ -285,4 +285,4 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
       }
    }
 }
-#endif /* __MDCT_MIPSR1_H__ */
+#endif /* MDCT_MIPSR1_H__ */
diff --git a/media/libopus/celt/mips/vq_mipsr1.h b/media/libopus/celt/mips/vq_mipsr1.h
index f26a33e755..1621c5624f 100644
--- a/media/libopus/celt/mips/vq_mipsr1.h
+++ b/media/libopus/celt/mips/vq_mipsr1.h
@@ -26,8 +26,8 @@
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#ifndef __VQ_MIPSR1_H__
-#define __VQ_MIPSR1_H__
+#ifndef VQ_MIPSR1_H__
+#define VQ_MIPSR1_H__
 
 #ifdef HAVE_CONFIG_H
 #include "config.h"
@@ -113,4 +113,4 @@ void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch)
    /*return celt_sqrt(E);*/
 }
 
-#endif /* __VQ_MIPSR1_H__ */
+#endif /* VQ_MIPSR1_H__ */
diff --git a/media/libopus/celt/os_support.h b/media/libopus/celt/os_support.h
index 009bf861da..7d2d378116 100644
--- a/media/libopus/celt/os_support.h
+++ b/media/libopus/celt/os_support.h
@@ -41,7 +41,7 @@
 #include <string.h>
 #include <stdlib.h>
 
-/** Opus wrapper for malloc(). To do your own dynamic allocation, all you need to do is replace this function and opus_free */
+/** Opus wrapper for malloc(). To do your own dynamic allocation replace this function, opus_realloc, and opus_free */
 #ifndef OVERRIDE_OPUS_ALLOC
 static OPUS_INLINE void *opus_alloc (size_t size)
 {
@@ -49,7 +49,15 @@ static OPUS_INLINE void *opus_alloc (size_t size)
 }
 #endif
 
-/** Same as celt_alloc(), except that the area is only needed inside a CELT call (might cause problem with wideband though) */
+#ifndef OVERRIDE_OPUS_REALLOC /* define OVERRIDE_OPUS_REALLOC to skip this default definition */
+static OPUS_INLINE void *opus_realloc (void *ptr, size_t size)
+{
+   return realloc(ptr, size); /* forwards directly to the C library realloc() */
+}
+#endif
+
+/** Used only for non-threadsafe pseudostack.
+    If desired, this can always return the same area of memory rather than allocating a new one every time. */
 #ifndef OVERRIDE_OPUS_ALLOC_SCRATCH
 static OPUS_INLINE void *opus_alloc_scratch (size_t size)
 {
@@ -58,7 +66,7 @@ static OPUS_INLINE void *opus_alloc_scratch (size_t size)
 }
 #endif
 
-/** Opus wrapper for free(). To do your own dynamic allocation, all you need to do is replace this function and opus_alloc */
+/** Opus wrapper for free(). To do your own dynamic allocation replace this function, opus_alloc, and opus_realloc */
 #ifndef OVERRIDE_OPUS_FREE
 static OPUS_INLINE void opus_free (void *ptr)
 {
diff --git a/media/libopus/celt/pitch.c b/media/libopus/celt/pitch.c
index 7998db4164..e33c60a3bf 100644
--- a/media/libopus/celt/pitch.c
+++ b/media/libopus/celt/pitch.c
@@ -262,7 +262,16 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
    for (i=0;i<max_pitch-3;i+=4)
    {
       opus_val32 sum[4]={0,0,0,0};
-      xcorr_kernel(_x, _y+i, sum, len, arch);
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+      {
+         opus_val32 sum_c[4]={0,0,0,0};
+         xcorr_kernel_c(_x, _y+i, sum_c, len);
+#endif
+         xcorr_kernel(_x, _y+i, sum, len, arch);
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+         celt_assert(memcmp(sum, sum_c, sizeof(sum)) == 0);
+      }
+#endif
       xcorr[i]=sum[0];
       xcorr[i+1]=sum[1];
       xcorr[i+2]=sum[2];
diff --git a/media/libopus/celt/pitch.h b/media/libopus/celt/pitch.h
index e425f56aea..dd0e2bebd2 100644
--- a/media/libopus/celt/pitch.h
+++ b/media/libopus/celt/pitch.h
@@ -189,4 +189,15 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
 # define celt_pitch_xcorr celt_pitch_xcorr_c
 #endif
 
+#ifdef NON_STATIC_COMB_FILTER_CONST_C
+void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
+                         opus_val16 g10, opus_val16 g11, opus_val16 g12);
+#endif
+
+#ifndef OVERRIDE_COMB_FILTER_CONST
+# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
+    ((void)(arch),comb_filter_const_c(y, x, T, N, g10, g11, g12))
+#endif
+
+
 #endif
diff --git a/media/libopus/celt/stack_alloc.h b/media/libopus/celt/stack_alloc.h
index ae40e2a165..e2739bdf66 100644
--- a/media/libopus/celt/stack_alloc.h
+++ b/media/libopus/celt/stack_alloc.h
@@ -141,7 +141,7 @@ extern char *global_stack_top;
 #else
 
 #define ALIGN(stack, size) ((stack) += ((size) - (long)(stack)) & ((size) - 1))
-#define PUSH(stack, size, type) (ALIGN((stack),sizeof(type)/sizeof(char)),(stack)+=(size)*(sizeof(type)/sizeof(char)),(type*)((stack)-(size)*(sizeof(type)/sizeof(char))))
+#define PUSH(stack, size, type) (ALIGN((stack),sizeof(type)/(sizeof(char))),(stack)+=(size)*(sizeof(type)/(sizeof(char))),(type*)((stack)-(size)*(sizeof(type)/(sizeof(char)))))
 #if 0 /* Set this to 1 to instrument pseudostack usage */
 #define RESTORE_STACK (printf("%ld %s:%d\n", global_stack-scratch_ptr, __FILE__, __LINE__),global_stack = _saved_stack)
 #else
diff --git a/media/libopus/celt/x86/celt_lpc_sse4_1.c b/media/libopus/celt/x86/celt_lpc_sse4_1.c
index 5478568849..daf59d245a 100644
--- a/media/libopus/celt/x86/celt_lpc_sse4_1.c
+++ b/media/libopus/celt/x86/celt_lpc_sse4_1.c
@@ -64,9 +64,16 @@ void celt_fir_sse4_1(const opus_val16 *x,
    {
       opus_val32 sums[4] = {0};
       __m128i vecSum, vecX;
-
-      xcorr_kernel(rnum, x+i-ord, sums, ord, arch);
-
+#if defined(OPUS_CHECK_ASM)
+      {
+         opus_val32 sums_c[4] = {0};
+         xcorr_kernel_c(rnum, x+i-ord, sums_c, ord);
+#endif
+         xcorr_kernel(rnum, x+i-ord, sums, ord, arch);
+#if defined(OPUS_CHECK_ASM)
+         celt_assert(memcmp(sums, sums_c, sizeof(sums)) == 0);
+      }
+#endif
       vecSum = _mm_loadu_si128((__m128i *)sums);
       vecSum = _mm_add_epi32(vecSum, vecNoA);
       vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT);
diff --git a/media/libopus/celt/x86/pitch_avx.c b/media/libopus/celt/x86/pitch_avx.c
new file mode 100644
index 0000000000..f731762d84
--- /dev/null
+++ b/media/libopus/celt/x86/pitch_avx.c
@@ -0,0 +1,101 @@
+/* Copyright (c) 2023 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#include <immintrin.h>
+#include "x86cpu.h"
+#include "pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(FIXED_POINT)
+
+/* Like the "regular" xcorr_kernel(), but computes 8 results at a time: sum[k] = sum over j<len of x[j]*y[j+k] for k=0..7 (reads y[0..len+6]). */
+static void xcorr_kernel_avx(const float *x, const float *y, float sum[8], int len)
+{
+    __m256 xsum0, xsum1, xsum2, xsum3, xsum4, xsum5, xsum6, xsum7;
+    __m256 x0;
+    int i;
+    xsum7 = xsum6 = xsum5 = xsum4 = xsum3 = xsum2 = xsum1 = xsum0 = _mm256_setzero_ps();
+    /* Compute 8 inner products using partial sums. */
+    for (i=0;i<len-7;i+=8)
+    {
+        x0 = _mm256_loadu_ps(x+i);
+        xsum0 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i  ), xsum0);
+        xsum1 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+1), xsum1);
+        xsum2 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+2), xsum2);
+        xsum3 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+3), xsum3);
+        xsum4 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+4), xsum4);
+        xsum5 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+5), xsum5);
+        xsum6 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+6), xsum6);
+        xsum7 = _mm256_fmadd_ps(x0, _mm256_loadu_ps(y+i+7), xsum7);
+    }
+    if (i != len) { /* 1..7 samples left over after the 8-wide loop */
+        static const int mask[15] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
+        __m256i m;
+        m = _mm256_loadu_si256((__m256i*)(void*)(mask + 7+i-len)); /* enables only the first (len-i) lanes */
+        x0 = _mm256_maskload_ps(x+i, m);
+        xsum0 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i  , m), xsum0);
+        xsum1 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+1, m), xsum1);
+        xsum2 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+2, m), xsum2);
+        xsum3 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+3, m), xsum3);
+        xsum4 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+4, m), xsum4);
+        xsum5 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+5, m), xsum5);
+        xsum6 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+6, m), xsum6);
+        xsum7 = _mm256_fmadd_ps(x0, _mm256_maskload_ps(y+i+7, m), xsum7);
+    }
+    /* 8 horizontal adds. */
+    /* Compute [0 4] [1 5] [2 6] [3 7] */
+    xsum0 = _mm256_add_ps(_mm256_permute2f128_ps(xsum0, xsum4, 2<<4), _mm256_permute2f128_ps(xsum0, xsum4, 1 | (3<<4)));
+    xsum1 = _mm256_add_ps(_mm256_permute2f128_ps(xsum1, xsum5, 2<<4), _mm256_permute2f128_ps(xsum1, xsum5, 1 | (3<<4)));
+    xsum2 = _mm256_add_ps(_mm256_permute2f128_ps(xsum2, xsum6, 2<<4), _mm256_permute2f128_ps(xsum2, xsum6, 1 | (3<<4)));
+    xsum3 = _mm256_add_ps(_mm256_permute2f128_ps(xsum3, xsum7, 2<<4), _mm256_permute2f128_ps(xsum3, xsum7, 1 | (3<<4)));
+    /* Compute [0 1 4 5] [2 3 6 7] */
+    xsum0 = _mm256_hadd_ps(xsum0, xsum1);
+    xsum1 = _mm256_hadd_ps(xsum2, xsum3);
+    /* Compute [0 1 2 3 4 5 6 7] */
+    xsum0 = _mm256_hadd_ps(xsum0, xsum1);
+    _mm256_storeu_ps(sum, xsum0);
+}
+
+void celt_pitch_xcorr_avx2(const float *_x, const float *_y, float *xcorr, int len, int max_pitch, int arch)
+{ /* Fills xcorr[i] for every lag i in [0, max_pitch), each computed from _x and _y+i over len samples. */
+   int i;
+   celt_assert(max_pitch>0);
+   (void)arch; /* presumably guards against celt_inner_prod() being a macro that drops arch - TODO confirm */
+   for (i=0;i<max_pitch-7;i+=8) /* blocks of 8 lags: the kernel writes xcorr[i..i+7] */
+   {
+      xcorr_kernel_avx(_x, _y+i, &xcorr[i], len);
+   }
+   for (;i<max_pitch;i++) /* remaining (max_pitch % 8) lags, one at a time */
+   {
+      xcorr[i] = celt_inner_prod(_x, _y+i, len, arch);
+   }
+}
+
+#endif
diff --git a/media/libopus/celt/x86/pitch_sse.h b/media/libopus/celt/x86/pitch_sse.h
index 964aef50db..127581f3e1 100644
--- a/media/libopus/celt/x86/pitch_sse.h
+++ b/media/libopus/celt/x86/pitch_sse.h
@@ -131,12 +131,6 @@ extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
 
 #if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
 
-#define OVERRIDE_DUAL_INNER_PROD
-#define OVERRIDE_COMB_FILTER_CONST
-
-#undef dual_inner_prod
-#undef comb_filter_const
-
 void dual_inner_prod_sse(const opus_val16 *x,
     const opus_val16 *y01,
     const opus_val16 *y02,
@@ -154,13 +148,17 @@ void comb_filter_const_sse(opus_val32 *y,
 
 
 #if defined(OPUS_X86_PRESUME_SSE)
+#define OVERRIDE_DUAL_INNER_PROD
+#define OVERRIDE_COMB_FILTER_CONST
 # define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \
     ((void)(arch),dual_inner_prod_sse(x, y01, y02, N, xy1, xy2))
 
 # define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
     ((void)(arch),comb_filter_const_sse(y, x, T, N, g10, g11, g12))
-#else
+#elif defined(OPUS_HAVE_RTCD)
 
+#define OVERRIDE_DUAL_INNER_PROD
+#define OVERRIDE_COMB_FILTER_CONST
 extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
               const opus_val16 *x,
               const opus_val16 *y01,
@@ -187,6 +185,32 @@ extern void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(
 #define NON_STATIC_COMB_FILTER_CONST_C
 
 #endif
-#endif
+
+void celt_pitch_xcorr_avx2(const float *_x, const float *_y, float *xcorr, int len, int max_pitch, int arch);
+
+#if defined(OPUS_X86_PRESUME_AVX2) /* AVX2 presumed at build time: bind the name directly */
+
+#define OVERRIDE_PITCH_XCORR
+# define celt_pitch_xcorr celt_pitch_xcorr_avx2
+
+#elif defined(OPUS_HAVE_RTCD) && defined(OPUS_X86_MAY_HAVE_AVX2) /* otherwise dispatch through the per-arch table */
+
+#define OVERRIDE_PITCH_XCORR
+extern void (*const PITCH_XCORR_IMPL[OPUS_ARCHMASK + 1])(
+              const float *_x,
+              const float *_y,
+              float *xcorr,
+              int len,
+              int max_pitch,
+              int arch
+              );
+
+#define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+    ((*PITCH_XCORR_IMPL[(arch) & OPUS_ARCHMASK])(_x, _y, xcorr, len, max_pitch, arch))
+
+
+#endif /* OPUS_X86_PRESUME_AVX2, else OPUS_HAVE_RTCD && OPUS_X86_MAY_HAVE_AVX2 */
+
+#endif /* OPUS_X86_MAY_HAVE_SSE && !FIXED_POINT */
 
 #endif
diff --git a/media/libopus/celt/x86/vq_sse.h b/media/libopus/celt/x86/vq_sse.h
index b4efe8f249..444503b630 100644
--- a/media/libopus/celt/x86/vq_sse.h
+++ b/media/libopus/celt/x86/vq_sse.h
@@ -28,16 +28,18 @@
 #define VQ_SSE_H
 
 #if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)
-#define OVERRIDE_OP_PVQ_SEARCH
 
 opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch);
 
 #if defined(OPUS_X86_PRESUME_SSE2)
+
+#define OVERRIDE_OP_PVQ_SEARCH
 #define op_pvq_search(x, iy, K, N, arch) \
     (op_pvq_search_sse2(x, iy, K, N, arch))
 
-#else
+#elif defined(OPUS_HAVE_RTCD)
 
+#define OVERRIDE_OP_PVQ_SEARCH
 extern opus_val16 (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])(
       celt_norm *_X, int *iy, int K, int N, int arch);
 
diff --git a/media/libopus/celt/x86/vq_sse2.c b/media/libopus/celt/x86/vq_sse2.c
index 775042860d..4c4ebf8e2d 100644
--- a/media/libopus/celt/x86/vq_sse2.c
+++ b/media/libopus/celt/x86/vq_sse2.c
@@ -75,7 +75,7 @@ opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
       sums = _mm_add_ps(sums, x4);
       /* Clear y and iy in case we don't do the projection. */
       _mm_storeu_ps(&y[j], _mm_setzero_ps());
-      _mm_storeu_si128((__m128i*)&iy[j], _mm_setzero_si128());
+      _mm_storeu_si128((__m128i*)(void*)&iy[j], _mm_setzero_si128());
       _mm_storeu_ps(&X[j], x4);
       _mm_storeu_ps(&signy[j], s4);
    }
@@ -116,7 +116,7 @@ opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
          rx4 = _mm_mul_ps(x4, rcp4);
          iy4 = _mm_cvttps_epi32(rx4);
          pulses_sum = _mm_add_epi32(pulses_sum, iy4);
-         _mm_storeu_si128((__m128i*)&iy[j], iy4);
+         _mm_storeu_si128((__m128i*)(void*)&iy[j], iy4);
          y4 = _mm_cvtepi32_ps(iy4);
          xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
          yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
@@ -205,10 +205,10 @@ opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
    {
       __m128i y4;
       __m128i s4;
-      y4 = _mm_loadu_si128((__m128i*)&iy[j]);
+      y4 = _mm_loadu_si128((__m128i*)(void*)&iy[j]);
       s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j]));
       y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
-      _mm_storeu_si128((__m128i*)&iy[j], y4);
+      _mm_storeu_si128((__m128i*)(void*)&iy[j], y4);
    }
    RESTORE_STACK;
    return yy;
diff --git a/media/libopus/celt/x86/x86_arch_macros.h b/media/libopus/celt/x86/x86_arch_macros.h
new file mode 100644
index 0000000000..975b443e93
--- /dev/null
+++ b/media/libopus/celt/x86/x86_arch_macros.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2023 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef _MSC_VER
+
+# ifdef OPUS_X86_MAY_HAVE_SSE
+#  ifndef __SSE__
+#   define __SSE__
+#  endif
+# endif
+
+# ifdef OPUS_X86_MAY_HAVE_SSE2
+#  ifndef __SSE2__
+#   define __SSE2__
+#  endif
+# endif
+
+# ifdef OPUS_X86_MAY_HAVE_SSE4_1
+#  ifndef __SSE4_1__
+#   define __SSE4_1__
+#  endif
+# endif
+
+#endif
diff --git a/media/libopus/celt/x86/x86_celt_map.c b/media/libopus/celt/x86/x86_celt_map.c
index d39d88edec..ba8eafe6ad 100644
--- a/media/libopus/celt/x86/x86_celt_map.c
+++ b/media/libopus/celt/x86/x86_celt_map.c
@@ -90,6 +90,26 @@ opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
 
 # else
 
+#if defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)
+
+void (*const PITCH_XCORR_IMPL[OPUS_ARCHMASK + 1])(
+         const float *_x,
+         const float *_y,
+         float *xcorr,
+         int len,
+         int max_pitch,
+         int arch
+) = { /* indexed by (arch) & OPUS_ARCHMASK via the celt_pitch_xcorr() macro */
+  celt_pitch_xcorr_c,                /* non-sse */
+  celt_pitch_xcorr_c,                /* C fallback */
+  celt_pitch_xcorr_c,                /* C fallback */
+  celt_pitch_xcorr_c,                /* C fallback */
+  MAY_HAVE_AVX2(celt_pitch_xcorr)    /* celt_pitch_xcorr_avx2 here, since OPUS_X86_MAY_HAVE_AVX2 is defined */
+};
+
+#endif
+
+
 #if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)
 
 void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
diff --git a/media/libopus/celt/x86/x86cpu.c b/media/libopus/celt/x86/x86cpu.c
index 6a1914dee7..2e7c32aeec 100644
--- a/media/libopus/celt/x86/x86cpu.c
+++ b/media/libopus/celt/x86/x86cpu.c
@@ -39,7 +39,7 @@
   ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
-  (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)))
+  (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)))
 
 #if defined(_MSC_VER)
 
@@ -105,7 +105,7 @@ typedef struct CPU_Feature{
     int HW_SSE2;
     int HW_SSE41;
     /*  SIMD: 256-bit */
-    int HW_AVX;
+    int HW_AVX2;
 } CPU_Feature;
 
 static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
@@ -121,13 +121,19 @@ static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
         cpu_feature->HW_SSE = (info[3] & (1 << 25)) != 0;
         cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0;
         cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0;
-        cpu_feature->HW_AVX = (info[2] & (1 << 28)) != 0;
+        cpu_feature->HW_AVX2 = (info[2] & (1 << 28)) != 0 && (info[2] & (1 << 12)) != 0;
+        if (cpu_feature->HW_AVX2 && nIds >= 7) {
+            cpuid(info, 7);
+            cpu_feature->HW_AVX2 = cpu_feature->HW_AVX2 && (info[1] & (1 << 5)) != 0;
+        } else {
+            cpu_feature->HW_AVX2 = 0;
+        }
     }
     else {
         cpu_feature->HW_SSE = 0;
         cpu_feature->HW_SSE2 = 0;
         cpu_feature->HW_SSE41 = 0;
-        cpu_feature->HW_AVX = 0;
+        cpu_feature->HW_AVX2 = 0;
     }
 }
 
@@ -157,7 +163,7 @@ static int opus_select_arch_impl(void)
     }
     arch++;
 
-    if (!cpu_feature.HW_AVX)
+    if (!cpu_feature.HW_AVX2)
     {
         return arch;
     }
diff --git a/media/libopus/celt/x86/x86cpu.h b/media/libopus/celt/x86/x86cpu.h
index 04e80489b1..8ae9be8d8f 100644
--- a/media/libopus/celt/x86/x86cpu.h
+++ b/media/libopus/celt/x86/x86cpu.h
@@ -46,28 +46,53 @@
 #  define MAY_HAVE_SSE4_1(name) name ## _c
 # endif
 
-# if defined(OPUS_X86_MAY_HAVE_AVX)
-#  define MAY_HAVE_AVX(name) name ## _avx
+# if defined(OPUS_X86_MAY_HAVE_AVX2)
+#  define MAY_HAVE_AVX2(name) name ## _avx2
 # else
-#  define MAY_HAVE_AVX(name) name ## _c
+#  define MAY_HAVE_AVX2(name) name ## _c
 # endif
 
-# if defined(OPUS_HAVE_RTCD)
+# if defined(OPUS_HAVE_RTCD) && \
+  ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+  (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
+  (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
+  (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)))
 int opus_select_arch(void);
 # endif
 
+# if defined(OPUS_X86_MAY_HAVE_SSE2)
+#  include "opus_defines.h"
+
 /*MOVD should not impose any alignment restrictions, but the C standard does,
    and UBSan will report errors if we actually make unaligned accesses.
   Use this to work around those restrictions (which should hopefully all get
-   optimized to a single MOVD instruction).*/
-#define OP_LOADU_EPI32(x) \
-  (int)((*(unsigned char *)(x) | *((unsigned char *)(x) + 1) << 8U |\
-   *((unsigned char *)(x) + 2) << 16U | (opus_uint32)*((unsigned char *)(x) + 3) << 24U))
+   optimized to a single MOVD instruction).
+  GCC implemented _mm_loadu_si32() since GCC 11; HOWEVER, there is a bug!
+  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99754 */
+#  if !defined(_MSC_VER) && !OPUS_GNUC_PREREQ(11,3) && !(defined(__clang__) && (__clang_major__ >= 8))
+#   include <string.h>
+#   include <emmintrin.h>
+
+#   ifdef _mm_loadu_si32
+#    undef _mm_loadu_si32
+#   endif
+#   define _mm_loadu_si32 WORKAROUND_mm_loadu_si32
+static OPUS_INLINE __m128i WORKAROUND_mm_loadu_si32(void const* mem_addr) { /* stand-in for _mm_loadu_si32() on compilers without a working one */
+  int val;
+  memcpy(&val, mem_addr, sizeof(val)); /* byte copy: places no alignment requirement on mem_addr */
+  return _mm_cvtsi32_si128(val); /* val lands in the low 32 bits, upper lanes are zeroed */
+}
+#  elif defined(_MSC_VER)
+    /* MSVC needs this for _mm_loadu_si32 */
+#   include <immintrin.h>
+#  endif
 
-#define OP_CVTEPI8_EPI32_M32(x) \
- (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(OP_LOADU_EPI32(x))))
+#  define OP_CVTEPI8_EPI32_M32(x) \
+ (_mm_cvtepi8_epi32(_mm_loadu_si32(x)))
 
-#define OP_CVTEPI16_EPI32_M64(x) \
- (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
+#  define OP_CVTEPI16_EPI32_M64(x) \
+ (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(void*)(x))))
+
+# endif
 
 #endif
-- 
cgit v1.2.3