summaryrefslogtreecommitdiffstats
path: root/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
blob: 3ca7921b66c1425b8a9542de538c4ca204431106 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_64:    times 8 dw 64
even_byte_mask: times 8 dw 0x00ff

; %define USE_PMULHRSW
; NOTE: pmulhrsw has a latency of 5 cycles.  Tests showed a performance loss
; when using this instruction.
;
; The add order below (based on ffav1) must be followed to prevent outranges.
; x = k0k1 + k4k5
; y = k2k3 + k6k7
; z = signed SAT(x + y)

SECTION .text
%define LOCAL_VARS_SIZE 16*6

%macro SETUP_LOCAL_VARS 0
    ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
    ; pmaddubsw has a higher latency on some platforms, this might be eased by
    ; interleaving the instructions.
    %define    k0k1  [rsp + 16*0]
    %define    k2k3  [rsp + 16*1]
    %define    k4k5  [rsp + 16*2]
    %define    k6k7  [rsp + 16*3]
    packsswb     m4, m4
    ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
    ; some platforms.
    pshuflw      m0, m4, 0b              ;k0_k1
    pshuflw      m1, m4, 01010101b       ;k2_k3
    pshuflw      m2, m4, 10101010b       ;k4_k5
    pshuflw      m3, m4, 11111111b       ;k6_k7
    punpcklqdq   m0, m0
    punpcklqdq   m1, m1
    punpcklqdq   m2, m2
    punpcklqdq   m3, m3
    mova       k0k1, m0
    mova       k2k3, m1
    mova       k4k5, m2
    mova       k6k7, m3
%if ARCH_X86_64
    %define     krd  m12
    %define    tmp0  [rsp + 16*4]
    %define    tmp1  [rsp + 16*5]
    mova        krd, [GLOBAL(pw_64)]
%else
    %define     krd  [rsp + 16*4]
%if CONFIG_PIC=0
    mova         m6, [GLOBAL(pw_64)]
%else
    ; build constants without accessing global memory
    pcmpeqb      m6, m6                  ;all ones
    psrlw        m6, 15
    psllw        m6, 6                   ;aka pw_64
%endif
    mova        krd, m6
%endif
%endm

;-------------------------------------------------------------------------------
%if ARCH_X86_64
  %define LOCAL_VARS_SIZE_H4 0
%else
  %define LOCAL_VARS_SIZE_H4 16*4
%endif

%macro SUBPIX_HFILTER4 1
cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
                            src, sstride, dst, dstride, height, filter
    mova                m4, [filterq]
    packsswb            m4, m4
%if ARCH_X86_64
    %define       k0k1k4k5  m8
    %define       k2k3k6k7  m9
    %define            krd  m10
    mova               krd, [GLOBAL(pw_64)]
    pshuflw       k0k1k4k5, m4, 0b              ;k0_k1
    pshufhw       k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
    pshuflw       k2k3k6k7, m4, 01010101b       ;k2_k3
    pshufhw       k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
%else
    %define       k0k1k4k5  [rsp + 16*0]
    %define       k2k3k6k7  [rsp + 16*1]
    %define            krd  [rsp + 16*2]
    pshuflw             m6, m4, 0b              ;k0_k1
    pshufhw             m6, m6, 10101010b       ;k0_k1_k4_k5
    pshuflw             m7, m4, 01010101b       ;k2_k3
    pshufhw             m7, m7, 11111111b       ;k2_k3_k6_k7
%if CONFIG_PIC=0
    mova                m1, [GLOBAL(pw_64)]
%else
    ; build constants without accessing global memory
    pcmpeqb             m1, m1                  ;all ones
    psrlw               m1, 15
    psllw               m1, 6                   ;aka pw_64
%endif
    mova          k0k1k4k5, m6
    mova          k2k3k6k7, m7
    mova               krd, m1
%endif
    dec            heightd

.loop:
    ;Do two rows at once
    movu                m4, [srcq - 3]
    movu                m5, [srcq + sstrideq - 3]
    punpckhbw           m1, m4, m4
    punpcklbw           m4, m4
    punpckhbw           m3, m5, m5
    punpcklbw           m5, m5
    palignr             m0, m1, m4, 1
    pmaddubsw           m0, k0k1k4k5
    palignr             m1, m4, 5
    pmaddubsw           m1, k2k3k6k7
    palignr             m2, m3, m5, 1
    pmaddubsw           m2, k0k1k4k5
    palignr             m3, m5, 5
    pmaddubsw           m3, k2k3k6k7
    punpckhqdq          m4, m0, m2
    punpcklqdq          m0, m2
    punpckhqdq          m5, m1, m3
    punpcklqdq          m1, m3
    paddsw              m0, m4
    paddsw              m1, m5
%ifidn %1, h8_avg
    movd                m4, [dstq]
    movd                m5, [dstq + dstrideq]
%endif
    paddsw              m0, m1
    paddsw              m0, krd
    psraw               m0, 7
%ifidn %1, h8_add_src
    pxor                 m3, m3
    movu                 m4, [srcq]
    movu                 m5, [srcq + sstrideq]
    punpckldq            m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 2
    punpcklbw            m4, m3
    paddsw               m0, m4
%endif
    packuswb            m0, m0
    psrldq              m1, m0, 4

%ifidn %1, h8_avg
    pavgb               m0, m4
    pavgb               m1, m5
%endif
    movd            [dstq], m0
    movd [dstq + dstrideq], m1

    lea               srcq, [srcq + sstrideq        ]
    prefetcht0              [srcq + 4 * sstrideq - 3]
    lea               srcq, [srcq + sstrideq        ]
    lea               dstq, [dstq + 2 * dstrideq    ]
    prefetcht0              [srcq + 2 * sstrideq - 3]

    sub            heightd, 2
    jg               .loop

    ; Do last row if output_height is odd
    jne              .done

    movu                m4, [srcq - 3]
    punpckhbw           m1, m4, m4
    punpcklbw           m4, m4
    palignr             m0, m1, m4, 1
    palignr             m1, m4, 5
    pmaddubsw           m0, k0k1k4k5
    pmaddubsw           m1, k2k3k6k7
    psrldq              m2, m0, 8
    psrldq              m3, m1, 8
    paddsw              m0, m2
    paddsw              m1, m3
    paddsw              m0, m1
    paddsw              m0, krd
    psraw               m0, 7
%ifidn %1, h8_add_src
    pxor                m3, m3
    movu                m4, [srcq]
    punpcklbw           m4, m3
    paddsw              m0, m4
%endif
    packuswb            m0, m0
%ifidn %1, h8_avg
    movd                m4, [dstq]
    pavgb               m0, m4
%endif
    movd            [dstq], m0
.done:
    REP_RET
%endm

;-------------------------------------------------------------------------------
%macro SUBPIX_HFILTER8 1
cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
                            src, sstride, dst, dstride, height, filter
    mova                 m4, [filterq]
    SETUP_LOCAL_VARS
    dec             heightd

.loop:
    ;Do two rows at once
    movu                 m0, [srcq - 3]
    movu                 m4, [srcq + sstrideq - 3]
    punpckhbw            m1, m0, m0
    punpcklbw            m0, m0
    palignr              m5, m1, m0, 13
    pmaddubsw            m5, k6k7
    palignr              m2, m1, m0, 5
    palignr              m3, m1, m0, 9
    palignr              m1, m0, 1
    pmaddubsw            m1, k0k1
    punpckhbw            m6, m4, m4
    punpcklbw            m4, m4
    pmaddubsw            m2, k2k3
    pmaddubsw            m3, k4k5

    palignr              m7, m6, m4, 13
    palignr              m0, m6, m4, 5
    pmaddubsw            m7, k6k7
    paddsw               m1, m3
    paddsw               m2, m5
    paddsw               m1, m2
%ifidn %1, h8_avg
    movh                 m2, [dstq]
    movhps               m2, [dstq + dstrideq]
%endif
    palignr              m5, m6, m4, 9
    palignr              m6, m4, 1
    pmaddubsw            m0, k2k3
    pmaddubsw            m6, k0k1
    paddsw               m1, krd
    pmaddubsw            m5, k4k5
    psraw                m1, 7
    paddsw               m0, m7
    paddsw               m6, m5
    paddsw               m6, m0
    paddsw               m6, krd
    psraw                m6, 7
%ifidn %1, h8_add_src
    pxor                 m3, m3
    movu                 m4, [srcq]
    movu                 m5, [srcq + sstrideq]
    punpcklbw            m4, m3
    punpcklbw            m5, m3
    paddsw               m1, m4
    paddsw               m6, m5
%endif
    packuswb             m1, m6
%ifidn %1, h8_avg
    pavgb                m1, m2
%endif
    movh              [dstq], m1
    movhps [dstq + dstrideq], m1

    lea                srcq, [srcq + sstrideq        ]
    prefetcht0               [srcq + 4 * sstrideq - 3]
    lea                srcq, [srcq + sstrideq        ]
    lea                dstq, [dstq + 2 * dstrideq    ]
    prefetcht0               [srcq + 2 * sstrideq - 3]
    sub             heightd, 2
    jg                .loop

    ; Do last row if output_height is odd
    jne               .done

    movu                 m0, [srcq - 3]
    punpckhbw            m3, m0, m0
    punpcklbw            m0, m0
    palignr              m1, m3, m0, 1
    palignr              m2, m3, m0, 5
    palignr              m4, m3, m0, 13
    palignr              m3, m0, 9
    pmaddubsw            m1, k0k1
    pmaddubsw            m2, k2k3
    pmaddubsw            m3, k4k5
    pmaddubsw            m4, k6k7
    paddsw               m1, m3
    paddsw               m4, m2
    paddsw               m1, m4
    paddsw               m1, krd
    psraw                m1, 7
%ifidn %1, h8_add_src
    pxor                 m6, m6
    movu                 m5, [srcq]
    punpcklbw            m5, m6
    paddsw               m1, m5
%endif
    packuswb             m1, m1
%ifidn %1, h8_avg
    movh                 m0, [dstq]
    pavgb                m1, m0
%endif
    movh             [dstq], m1
.done:
    REP_RET
%endm

;-------------------------------------------------------------------------------
%macro SUBPIX_HFILTER16 1
cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
                             src, sstride, dst, dstride, height, filter
    mova          m4, [filterq]
    SETUP_LOCAL_VARS

.loop:
    prefetcht0        [srcq + 2 * sstrideq -3]

    movu          m0, [srcq - 3]
    movu          m4, [srcq - 2]
    pmaddubsw     m0, k0k1
    pmaddubsw     m4, k0k1
    movu          m1, [srcq - 1]
    movu          m5, [srcq + 0]
    pmaddubsw     m1, k2k3
    pmaddubsw     m5, k2k3
    movu          m2, [srcq + 1]
    movu          m6, [srcq + 2]
    pmaddubsw     m2, k4k5
    pmaddubsw     m6, k4k5
    movu          m3, [srcq + 3]
    movu          m7, [srcq + 4]
    pmaddubsw     m3, k6k7
    pmaddubsw     m7, k6k7
    paddsw        m0, m2
    paddsw        m1, m3
    paddsw        m0, m1
    paddsw        m4, m6
    paddsw        m5, m7
    paddsw        m4, m5
    paddsw        m0, krd
    paddsw        m4, krd
    psraw         m0, 7
    psraw         m4, 7
%ifidn %1, h8_add_src
%if ARCH_X86=1 && CONFIG_PIC=1
    pcmpeqb       m2, m2                  ;all ones
    psrlw         m2, 8                   ;even_byte_mask
%else
    mova          m2, [GLOBAL(even_byte_mask)]
%endif
    movu          m5, [srcq]
    mova          m7, m5
    pand          m5, m2
    psrlw         m7, 8
    paddsw        m0, m5
    paddsw        m4, m7
%endif
    packuswb      m0, m0
    packuswb      m4, m4
    punpcklbw     m0, m4
%ifidn %1, h8_avg
    pavgb         m0, [dstq]
%endif
    lea         srcq, [srcq + sstrideq]
    mova      [dstq], m0
    lea         dstq, [dstq + dstrideq]
    dec      heightd
    jnz        .loop
    REP_RET
%endm

INIT_XMM ssse3
SUBPIX_HFILTER16 h8
SUBPIX_HFILTER8  h8
SUBPIX_HFILTER4  h8

;-------------------------------------------------------------------------------

; TODO(Linfeng): Detect cpu type and choose the code with better performance.
%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1

%if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
    %define NUM_GENERAL_REG_USED 9
%else
    %define NUM_GENERAL_REG_USED 6
%endif

%macro SUBPIX_VFILTER 2
cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
                             src, sstride, dst, dstride, height, filter
    mova          m4, [filterq]
    SETUP_LOCAL_VARS

%ifidn %2, 8
    %define                movx  movh
%else
    %define                movx  movd
%endif

    dec                 heightd

%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON

%if ARCH_X86_64
    %define               src1q  r7
    %define           sstride6q  r8
    %define          dst_stride  dstrideq
%else
    %define               src1q  filterq
    %define           sstride6q  dstrideq
    %define          dst_stride  dstridemp
%endif
    mov                   src1q, srcq
    add                   src1q, sstrideq
    lea               sstride6q, [sstrideq + sstrideq * 4]
    add               sstride6q, sstrideq                   ;pitch * 6

.loop:
    ;Do two rows at once
    movx                     m0, [srcq                ]     ;A
    movx                     m1, [src1q               ]     ;B
    punpcklbw                m0, m1                         ;A B
    movx                     m2, [srcq + sstrideq * 2 ]     ;C
    pmaddubsw                m0, k0k1
    mova                     m6, m2
    movx                     m3, [src1q + sstrideq * 2]     ;D
    punpcklbw                m2, m3                         ;C D
    pmaddubsw                m2, k2k3
    movx                     m4, [srcq + sstrideq * 4 ]     ;E
    mova                     m7, m4
    movx                     m5, [src1q + sstrideq * 4]     ;F
    punpcklbw                m4, m5                         ;E F
    pmaddubsw                m4, k4k5
    punpcklbw                m1, m6                         ;A B next iter
    movx                     m6, [srcq + sstride6q    ]     ;G
    punpcklbw                m5, m6                         ;E F next iter
    punpcklbw                m3, m7                         ;C D next iter
    pmaddubsw                m5, k4k5
    movx                     m7, [src1q + sstride6q   ]     ;H
    punpcklbw                m6, m7                         ;G H
    pmaddubsw                m6, k6k7
    pmaddubsw                m3, k2k3
    pmaddubsw                m1, k0k1
    paddsw                   m0, m4
    paddsw                   m2, m6
    movx                     m6, [srcq + sstrideq * 8 ]     ;H next iter
    punpcklbw                m7, m6
    pmaddubsw                m7, k6k7
    paddsw                   m0, m2
    paddsw                   m0, krd
    psraw                    m0, 7
    paddsw                   m1, m5
%ifidn %1, v8_add_src
    pxor                     m6, m6
    movu                     m4, [srcq]
    punpcklbw                m4, m6
    paddsw                   m0, m4
%endif
    packuswb                 m0, m0

    paddsw                   m3, m7
    paddsw                   m1, m3
    paddsw                   m1, krd
    psraw                    m1, 7
%ifidn %1, v8_add_src
    movu                     m4, [src1q]
    punpcklbw                m4, m6
    paddsw                   m1, m4
%endif
    lea                    srcq, [srcq + sstrideq * 2 ]
    lea                   src1q, [src1q + sstrideq * 2]
    packuswb                 m1, m1

%ifidn %1, v8_avg
    movx                     m2, [dstq]
    pavgb                    m0, m2
%endif
    movx                 [dstq], m0
    add                    dstq, dst_stride
%ifidn %1, v8_avg
    movx                     m3, [dstq]
    pavgb                    m1, m3
%endif
    movx                 [dstq], m1
    add                    dstq, dst_stride
    sub                 heightd, 2
    jg                    .loop

    ; Do last row if output_height is odd
    jne                   .done

    movx                     m0, [srcq                ]     ;A
    movx                     m1, [srcq + sstrideq     ]     ;B
    movx                     m6, [srcq + sstride6q    ]     ;G
    punpcklbw                m0, m1                         ;A B
    movx                     m7, [src1q + sstride6q   ]     ;H
    pmaddubsw                m0, k0k1
    movx                     m2, [srcq + sstrideq * 2 ]     ;C
    punpcklbw                m6, m7                         ;G H
    movx                     m3, [src1q + sstrideq * 2]     ;D
    pmaddubsw                m6, k6k7
    movx                     m4, [srcq + sstrideq * 4 ]     ;E
    punpcklbw                m2, m3                         ;C D
    movx                     m5, [src1q + sstrideq * 4]     ;F
    punpcklbw                m4, m5                         ;E F
    pmaddubsw                m2, k2k3
    pmaddubsw                m4, k4k5
    paddsw                   m2, m6
    paddsw                   m0, m4
    paddsw                   m0, m2
    paddsw                   m0, krd
    psraw                    m0, 7
%ifidn %1, v8_add_src
    pxor                     m6, m6
    movu                     m4, [srcq]
    punpcklbw                m4, m6
    paddsw                   m0, m4
%endif
    packuswb                 m0, m0
%ifidn %1, v8_avg
    movx                     m1, [dstq]
    pavgb                    m0, m1
%endif
    movx                 [dstq], m0

%else
    ; ARCH_X86_64

    movx                     m0, [srcq                ]     ;A
    movx                     m1, [srcq + sstrideq     ]     ;B
    lea                    srcq, [srcq + sstrideq * 2 ]
    movx                     m2, [srcq]                     ;C
    movx                     m3, [srcq + sstrideq]          ;D
    lea                    srcq, [srcq + sstrideq * 2 ]
    movx                     m4, [srcq]                     ;E
    movx                     m5, [srcq + sstrideq]          ;F
    lea                    srcq, [srcq + sstrideq * 2 ]
    movx                     m6, [srcq]                     ;G
    punpcklbw                m0, m1                         ;A B
    punpcklbw                m1, m2                         ;A B next iter
    punpcklbw                m2, m3                         ;C D
    punpcklbw                m3, m4                         ;C D next iter
    punpcklbw                m4, m5                         ;E F
    punpcklbw                m5, m6                         ;E F next iter

.loop:
    ;Do two rows at once
    movx                     m7, [srcq + sstrideq]          ;H
    lea                    srcq, [srcq + sstrideq * 2 ]
    movx                    m14, [srcq]                     ;H next iter
    punpcklbw                m6, m7                         ;G H
    punpcklbw                m7, m14                        ;G H next iter
    pmaddubsw                m8, m0, k0k1
    pmaddubsw                m9, m1, k0k1
    mova                     m0, m2
    mova                     m1, m3
    pmaddubsw               m10, m2, k2k3
    pmaddubsw               m11, m3, k2k3
    mova                     m2, m4
    mova                     m3, m5
    pmaddubsw                m4, k4k5
    pmaddubsw                m5, k4k5
    paddsw                   m8, m4
    paddsw                   m9, m5
    mova                     m4, m6
    mova                     m5, m7
    pmaddubsw                m6, k6k7
    pmaddubsw                m7, k6k7
    paddsw                  m10, m6
    paddsw                  m11, m7
    paddsw                   m8, m10
    paddsw                   m9, m11
    mova                     m6, m14
    paddsw                   m8, krd
    paddsw                   m9, krd
    psraw                    m8, 7
    psraw                    m9, 7
%ifidn %2, 4
    packuswb                 m8, m8
    packuswb                 m9, m9
%else
    packuswb                 m8, m9
%endif

%ifidn %1, v8_avg
    movx                     m7, [dstq]
%ifidn %2, 4
    movx                    m10, [dstq + dstrideq]
    pavgb                    m9, m10
%else
    movhpd                   m7, [dstq + dstrideq]
%endif
    pavgb                    m8, m7
%endif
    movx                 [dstq], m8
%ifidn %2, 4
    movx      [dstq + dstrideq], m9
%else
    movhpd    [dstq + dstrideq], m8
%endif

    lea                    dstq, [dstq + dstrideq * 2 ]
    sub                 heightd, 2
    jg                    .loop

    ; Do last row if output_height is odd
    jne                   .done

    movx                     m7, [srcq + sstrideq]          ;H
    punpcklbw                m6, m7                         ;G H
    pmaddubsw                m0, k0k1
    pmaddubsw                m2, k2k3
    pmaddubsw                m4, k4k5
    pmaddubsw                m6, k6k7
    paddsw                   m0, m4
    paddsw                   m2, m6
    paddsw                   m0, m2
    paddsw                   m0, krd
    psraw                    m0, 7
    packuswb                 m0, m0
%ifidn %1, v8_avg
    movx                     m1, [dstq]
    pavgb                    m0, m1
%endif
    movx                 [dstq], m0

%endif ; ARCH_X86_64

.done:
    REP_RET

%endm

;-------------------------------------------------------------------------------
%macro SUBPIX_VFILTER16 1
cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
                             src, sstride, dst, dstride, height, filter
    mova                     m4, [filterq]
    SETUP_LOCAL_VARS

%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON

%if ARCH_X86_64
    %define               src1q  r7
    %define           sstride6q  r8
    %define          dst_stride  dstrideq
%else
    %define               src1q  filterq
    %define           sstride6q  dstrideq
    %define          dst_stride  dstridemp
%endif
    lea                   src1q, [srcq + sstrideq]
    lea               sstride6q, [sstrideq + sstrideq * 4]
    add               sstride6q, sstrideq                   ;pitch * 6

.loop:
    movh                     m0, [srcq                ]     ;A
    movh                     m1, [src1q               ]     ;B
    movh                     m2, [srcq + sstrideq * 2 ]     ;C
    movh                     m3, [src1q + sstrideq * 2]     ;D
    movh                     m4, [srcq + sstrideq * 4 ]     ;E
    movh                     m5, [src1q + sstrideq * 4]     ;F

    punpcklbw                m0, m1                         ;A B
    movh                     m6, [srcq + sstride6q]         ;G
    punpcklbw                m2, m3                         ;C D
    movh                     m7, [src1q + sstride6q]        ;H
    punpcklbw                m4, m5                         ;E F
    pmaddubsw                m0, k0k1
    movh                     m3, [srcq + 8]                 ;A
    pmaddubsw                m2, k2k3
    punpcklbw                m6, m7                         ;G H
    movh                     m5, [srcq + sstrideq + 8]      ;B
    pmaddubsw                m4, k4k5
    punpcklbw                m3, m5                         ;A B
    movh                     m7, [srcq + sstrideq * 2 + 8]  ;C
    pmaddubsw                m6, k6k7
    movh                     m5, [src1q + sstrideq * 2 + 8] ;D
    punpcklbw                m7, m5                         ;C D
    paddsw                   m2, m6
    pmaddubsw                m3, k0k1
    movh                     m1, [srcq + sstrideq * 4 + 8]  ;E
    paddsw                   m0, m4
    pmaddubsw                m7, k2k3
    movh                     m6, [src1q + sstrideq * 4 + 8] ;F
    punpcklbw                m1, m6                         ;E F
    paddsw                   m0, m2
    paddsw                   m0, krd
    movh                     m2, [srcq + sstride6q + 8]     ;G
    pmaddubsw                m1, k4k5
    movh                     m5, [src1q + sstride6q + 8]    ;H
    psraw                    m0, 7
    punpcklbw                m2, m5                         ;G H
    pmaddubsw                m2, k6k7
    paddsw                   m7, m2
    paddsw                   m3, m1
    paddsw                   m3, m7
    paddsw                   m3, krd
    psraw                    m3, 7
%ifidn %1, v8_add_src
    pxor                     m6, m6
    movu                     m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down
    mova                     m5, m4
    punpcklbw                m4, m6
    punpckhbw                m5, m6
    paddsw                   m0, m4
    paddsw                   m3, m5
%endif
    packuswb                 m0, m3

    add                    srcq, sstrideq
    add                   src1q, sstrideq
%ifidn %1, v8_avg
    pavgb                    m0, [dstq]
%endif
    mova                 [dstq], m0
    add                    dstq, dst_stride
    dec                 heightd
    jnz                   .loop
    REP_RET

%else
    ; ARCH_X86_64
    dec                 heightd

    movu                     m1, [srcq                ]     ;A
    movu                     m3, [srcq + sstrideq     ]     ;B
    lea                    srcq, [srcq + sstrideq * 2]
    punpcklbw                m0, m1, m3                     ;A B
    punpckhbw                m1, m3                         ;A B
    movu                     m5, [srcq]                     ;C
    punpcklbw                m2, m3, m5                     ;A B next iter
    punpckhbw                m3, m5                         ;A B next iter
    mova                   tmp0, m2                         ;store to stack
    mova                   tmp1, m3                         ;store to stack
    movu                     m7, [srcq + sstrideq]          ;D
    lea                    srcq, [srcq + sstrideq * 2]
    punpcklbw                m4, m5, m7                     ;C D
    punpckhbw                m5, m7                         ;C D
    movu                     m9, [srcq]                     ;E
    punpcklbw                m6, m7, m9                     ;C D next iter
    punpckhbw                m7, m9                         ;C D next iter
    movu                    m11, [srcq + sstrideq]          ;F
    lea                    srcq, [srcq + sstrideq * 2]
    punpcklbw                m8, m9, m11                    ;E F
    punpckhbw                m9, m11                        ;E F
    movu                     m2, [srcq]                     ;G
    punpcklbw               m10, m11, m2                    ;E F next iter
    punpckhbw               m11, m2                         ;E F next iter

.loop:
    ;Do two rows at once
    pmaddubsw               m13, m0, k0k1
    mova                     m0, m4
    pmaddubsw               m14, m8, k4k5
    pmaddubsw               m15, m4, k2k3
    mova                     m4, m8
    paddsw                  m13, m14
    movu                     m3, [srcq + sstrideq]          ;H
    lea                    srcq, [srcq + sstrideq * 2]
    punpcklbw               m14, m2, m3                     ;G H
    mova                     m8, m14
    pmaddubsw               m14, k6k7
    paddsw                  m15, m14
    paddsw                  m13, m15
    paddsw                  m13, krd
    psraw                   m13, 7

    pmaddubsw               m14, m1, k0k1
    pmaddubsw                m1, m9, k4k5
    pmaddubsw               m15, m5, k2k3
    paddsw                  m14, m1
    mova                     m1, m5
    mova                     m5, m9
    punpckhbw                m2, m3                         ;G H
    mova                     m9, m2
    pmaddubsw                m2, k6k7
    paddsw                  m15, m2
    paddsw                  m14, m15
    paddsw                  m14, krd
    psraw                   m14, 7
    packuswb                m13, m14
%ifidn %1, v8_avg
    pavgb                   m13, [dstq]
%endif
    mova                 [dstq], m13

    ; next iter
    pmaddubsw               m15, tmp0, k0k1
    pmaddubsw               m14, m10, k4k5
    pmaddubsw               m13, m6, k2k3
    paddsw                  m15, m14
    mova                   tmp0, m6
    mova                     m6, m10
    movu                     m2, [srcq]                     ;G next iter
    punpcklbw               m14, m3, m2                     ;G H next iter
    mova                    m10, m14
    pmaddubsw               m14, k6k7
    paddsw                  m13, m14
    paddsw                  m15, m13
    paddsw                  m15, krd
    psraw                   m15, 7

    pmaddubsw               m14, tmp1, k0k1
    mova                   tmp1, m7
    pmaddubsw               m13, m7, k2k3
    mova                     m7, m11
    pmaddubsw               m11, k4k5
    paddsw                  m14, m11
    punpckhbw                m3, m2                         ;G H next iter
    mova                    m11, m3
    pmaddubsw                m3, k6k7
    paddsw                  m13, m3
    paddsw                  m14, m13
    paddsw                  m14, krd
    psraw                   m14, 7
    packuswb                m15, m14
%ifidn %1, v8_avg
    pavgb                   m15, [dstq + dstrideq]
%endif
    mova      [dstq + dstrideq], m15
    lea                    dstq, [dstq + dstrideq * 2]
    sub                 heightd, 2
    jg                    .loop

    ; Do last row if output_height is odd
    jne                   .done

    movu                     m3, [srcq + sstrideq]          ;H
    punpcklbw                m6, m2, m3                     ;G H
    punpckhbw                m2, m3                         ;G H
    pmaddubsw                m0, k0k1
    pmaddubsw                m1, k0k1
    pmaddubsw                m4, k2k3
    pmaddubsw                m5, k2k3
    pmaddubsw                m8, k4k5
    pmaddubsw                m9, k4k5
    pmaddubsw                m6, k6k7
    pmaddubsw                m2, k6k7
    paddsw                   m0, m8
    paddsw                   m1, m9
    paddsw                   m4, m6
    paddsw                   m5, m2
    paddsw                   m0, m4
    paddsw                   m1, m5
    paddsw                   m0, krd
    paddsw                   m1, krd
    psraw                    m0, 7
    psraw                    m1, 7
    packuswb                 m0, m1
%ifidn %1, v8_avg
    pavgb                    m0, [dstq]
%endif
    mova                 [dstq], m0

.done:
    REP_RET

%endif ; ARCH_X86_64

%endm

INIT_XMM ssse3
SUBPIX_VFILTER16     v8
SUBPIX_VFILTER       v8, 8
SUBPIX_VFILTER       v8, 4