summaryrefslogtreecommitdiffstats
path: root/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h72
1 files changed, 39 insertions, 33 deletions
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
index 74f85a6bb6..c989a6721b 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
+++ b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
@@ -524,12 +524,20 @@ static INLINE void transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1,
*a7 = vreinterpretq_s32_s64(c3.val[1]);
}
-// Note: Using 'd' registers or 'q' registers has almost identical speed. We use
-// 'q' registers here to save some instructions.
static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
uint8x8_t *a6, uint8x8_t *a7) {
- // Swap 8 bit elements. Goes from:
+  // Widen to 128-bit registers (usually a no-op once inlined).
+ const uint8x16_t a0q = vcombine_u8(*a0, vdup_n_u8(0));
+ const uint8x16_t a1q = vcombine_u8(*a1, vdup_n_u8(0));
+ const uint8x16_t a2q = vcombine_u8(*a2, vdup_n_u8(0));
+ const uint8x16_t a3q = vcombine_u8(*a3, vdup_n_u8(0));
+ const uint8x16_t a4q = vcombine_u8(*a4, vdup_n_u8(0));
+ const uint8x16_t a5q = vcombine_u8(*a5, vdup_n_u8(0));
+ const uint8x16_t a6q = vcombine_u8(*a6, vdup_n_u8(0));
+ const uint8x16_t a7q = vcombine_u8(*a7, vdup_n_u8(0));
+
+ // Zip 8 bit elements. Goes from:
// a0: 00 01 02 03 04 05 06 07
// a1: 10 11 12 13 14 15 16 17
// a2: 20 21 22 23 24 25 26 27
@@ -539,43 +547,41 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
// a6: 60 61 62 63 64 65 66 67
// a7: 70 71 72 73 74 75 76 77
// to:
- // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
- // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
- // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
- // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77
-
- const uint8x16x2_t b0 =
- vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
- const uint8x16x2_t b1 =
- vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));
-
- // Swap 16 bit elements resulting in:
- // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
- // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
- // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
- // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77
-
- const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
- vreinterpretq_u16_u8(b1.val[0]));
- const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
- vreinterpretq_u16_u8(b1.val[1]));
-
- // Unzip 32 bit elements resulting in:
+ // b0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // b1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // b2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // b3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ const uint8x16_t b0 = vzipq_u8(a0q, a1q).val[0];
+ const uint8x16_t b1 = vzipq_u8(a2q, a3q).val[0];
+ const uint8x16_t b2 = vzipq_u8(a4q, a5q).val[0];
+ const uint8x16_t b3 = vzipq_u8(a6q, a7q).val[0];
+
+ // Zip 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ // c0.val[1]: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ // c1.val[0]: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+  // c1.val[1]: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ const uint16x8x2_t c0 =
+ vzipq_u16(vreinterpretq_u16_u8(b0), vreinterpretq_u16_u8(b1));
+ const uint16x8x2_t c1 =
+ vzipq_u16(vreinterpretq_u16_u8(b2), vreinterpretq_u16_u8(b3));
+
+ // Zip 32 bit elements resulting in:
// d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
- // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // d0.val[1]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // d1.val[0]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
// d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
- const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ const uint32x4x2_t d0 = vzipq_u32(vreinterpretq_u32_u16(c0.val[0]),
vreinterpretq_u32_u16(c1.val[0]));
- const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ const uint32x4x2_t d1 = vzipq_u32(vreinterpretq_u32_u16(c0.val[1]),
vreinterpretq_u32_u16(c1.val[1]));
*a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
*a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
- *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
- *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
- *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
- *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+ *a2 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
+ *a3 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+ *a4 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
+ *a5 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
*a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
*a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}