summaryrefslogtreecommitdiffstats
path: root/third_party/jpeg-xl/lib/jxl/fast_dct32-inl.h
blob: 0f3b31cfea1bda006af9a7ed839a5a2f3c0227df (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

/* This file is automatically generated. Do not modify it directly. */
// Everything below is written directly with Arm NEON intrinsics (vld1q_s16,
// vqrdmulhq_n_s16, ...), so compilation is rejected for any other Highway
// target.
#if HWY_TARGET != HWY_NEON
#error "only include this file from fast_dct-inl.h"
#endif

// Compile-time constant associated with the 32-point tag: always 1.
// NOTE(review): presumably the number of integer (headroom) bits the caller
// must reserve for the fixed-point FastIDCT below -- confirm against the
// generic code in fast_dct-inl.h.
constexpr size_t FastIDCTIntegerBits(FastDCTTag<32> /*tag*/) {
  return static_cast<size_t>(1);
}

// Fixed-point (int16) NEON kernel for the 32-point case of FastIDCT.
//
// Each loop iteration handles 8 adjacent columns at once (one int16x8_t per
// row): it reads the 32 input rows `in[in_stride * k + i]`, k = 0..31, runs a
// fixed network of adds, subtracts and constant multiplies, and writes the 32
// output rows `out[out_stride * k + i]`.
//
// Parameters:
//   in         - base of the input rows (read only).
//   in_stride  - distance between consecutive input rows, in int16_t elements.
//   out        - base of the output rows.
//   out_stride - distance between consecutive output rows, in int16_t
//                elements.
//   count      - number of columns to process; must be a multiple of 8.
//
// Arithmetic notes:
//   * vaddq_s16 / vsubq_s16 are the plain (non-saturating) lane-wise ops.
//   * vqrdmulhq_n_s16(x, c) is a saturating rounding doubling multiply-high,
//     i.e. approximately x * c / 32768, so on its own it can only scale by a
//     factor below 1.
//   * `vN_tmp = vqrdmulhq_n_s16(x, c); vN = vaddq_s16(vN_tmp, x)` therefore
//     scales x by (1 + c / 32768), and
//     `vN = vmlaq_n_s16(vN_tmp, x, k)` (tmp + x * k) scales by
//     (k + c / 32768).
//
// NOTE(review): the overall output scaling/normalisation and whether `in` and
// `out` may alias are not evident from this file -- check the generator and
// the callers in fast_dct-inl.h.
void FastIDCT(FastDCTTag<32>, const int16_t* in, size_t in_stride, int16_t* out,
              size_t out_stride, size_t count) {
  // Every load/store below moves exactly 8 lanes; there is no tail handling.
  JXL_ASSERT(count % 8 == 0);
  for (size_t i = 0; i < count; i += 8) {
    // v0..v61: terms built only from the even-indexed input rows
    // (0, 2, 4, ..., 30).
    int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i);
    int16x8_t v1 = vld1q_s16(in + in_stride * 16 + i);
    int16x8_t v2 = vaddq_s16(v0, v1);
    int16x8_t v3 = vld1q_s16(in + in_stride * 8 + i);
    // Scale by (1 + 13573 / 32768); see the arithmetic notes above.
    int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573);
    int16x8_t v4 = vaddq_s16(v4_tmp, v3);
    int16x8_t v5 = vld1q_s16(in + in_stride * 24 + i);
    int16x8_t v6 = vaddq_s16(v5, v3);
    int16x8_t v7 = vaddq_s16(v4, v6);
    int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734);
    int16x8_t v9 = vaddq_s16(v2, v8);
    int16x8_t v10 = vld1q_s16(in + in_stride * 4 + i);
    int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573);
    int16x8_t v11 = vaddq_s16(v11_tmp, v10);
    int16x8_t v12 = vld1q_s16(in + in_stride * 20 + i);
    int16x8_t v13 = vld1q_s16(in + in_stride * 12 + i);
    int16x8_t v14 = vaddq_s16(v12, v13);
    int16x8_t v15 = vaddq_s16(v11, v14);
    int16x8_t v16 = vld1q_s16(in + in_stride * 28 + i);
    int16x8_t v17 = vaddq_s16(v16, v12);
    int16x8_t v18 = vaddq_s16(v13, v10);
    int16x8_t v19 = vaddq_s16(v17, v18);
    int16x8_t v20 = vqrdmulhq_n_s16(v19, 17734);
    int16x8_t v21 = vqrdmulhq_n_s16(v18, 25080);
    int16x8_t v22 = vaddq_s16(v20, v21);
    int16x8_t v23 = vaddq_s16(v15, v22);
    int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705);
    int16x8_t v25 = vaddq_s16(v9, v24);
    int16x8_t v26 = vld1q_s16(in + in_stride * 2 + i);
    int16x8_t v27_tmp = vqrdmulhq_n_s16(v26, 13573);
    int16x8_t v27 = vaddq_s16(v27_tmp, v26);
    int16x8_t v28 = vld1q_s16(in + in_stride * 18 + i);
    int16x8_t v29 = vld1q_s16(in + in_stride * 14 + i);
    int16x8_t v30 = vaddq_s16(v28, v29);
    int16x8_t v31 = vaddq_s16(v27, v30);
    int16x8_t v32 = vld1q_s16(in + in_stride * 10 + i);
    int16x8_t v33 = vld1q_s16(in + in_stride * 6 + i);
    int16x8_t v34 = vaddq_s16(v32, v33);
    int16x8_t v35 = vqrdmulhq_n_s16(v34, 25080);
    int16x8_t v36 = vld1q_s16(in + in_stride * 26 + i);
    int16x8_t v37 = vld1q_s16(in + in_stride * 22 + i);
    int16x8_t v38 = vaddq_s16(v36, v37);
    int16x8_t v39 = vaddq_s16(v38, v34);
    int16x8_t v40 = vqrdmulhq_n_s16(v39, 17734);
    int16x8_t v41 = vaddq_s16(v35, v40);
    int16x8_t v42 = vaddq_s16(v31, v41);
    int16x8_t v43 = vaddq_s16(v33, v26);
    int16x8_t v44_tmp = vqrdmulhq_n_s16(v43, 13573);
    int16x8_t v44 = vaddq_s16(v44_tmp, v43);
    int16x8_t v45 = vaddq_s16(v29, v32);
    int16x8_t v46 = vaddq_s16(v37, v28);
    int16x8_t v47 = vaddq_s16(v45, v46);
    int16x8_t v48 = vaddq_s16(v44, v47);
    int16x8_t v49 = vqrdmulhq_n_s16(v48, 16705);
    int16x8_t v50 = vld1q_s16(in + in_stride * 30 + i);
    int16x8_t v51 = vaddq_s16(v50, v36);
    int16x8_t v52 = vaddq_s16(v51, v46);
    int16x8_t v53 = vqrdmulhq_n_s16(v52, 17734);
    int16x8_t v54 = vaddq_s16(v45, v43);
    int16x8_t v55_tmp = vqrdmulhq_n_s16(v54, 10045);
    int16x8_t v55 = vaddq_s16(v55_tmp, v54);
    int16x8_t v56 = vaddq_s16(v53, v55);
    int16x8_t v57 = vqrdmulhq_n_s16(v56, 16705);
    int16x8_t v58 = vaddq_s16(v49, v57);
    int16x8_t v59 = vaddq_s16(v42, v58);
    int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463);
    int16x8_t v61 = vaddq_s16(v25, v60);
    // v62..v142: terms built only from the odd-indexed input rows
    // (1, 3, 5, ..., 31).
    int16x8_t v62 = vld1q_s16(in + in_stride * 13 + i);
    int16x8_t v63 = vld1q_s16(in + in_stride * 11 + i);
    int16x8_t v64 = vaddq_s16(v62, v63);
    int16x8_t v65 = vld1q_s16(in + in_stride * 5 + i);
    int16x8_t v66 = vld1q_s16(in + in_stride * 3 + i);
    int16x8_t v67 = vaddq_s16(v65, v66);
    int16x8_t v68 = vaddq_s16(v64, v67);
    int16x8_t v69_tmp = vqrdmulhq_n_s16(v68, 10045);
    int16x8_t v69 = vaddq_s16(v69_tmp, v68);
    int16x8_t v70 = vld1q_s16(in + in_stride * 21 + i);
    int16x8_t v71 = vld1q_s16(in + in_stride * 19 + i);
    int16x8_t v72 = vaddq_s16(v70, v71);
    int16x8_t v73 = vld1q_s16(in + in_stride * 29 + i);
    int16x8_t v74 = vld1q_s16(in + in_stride * 27 + i);
    int16x8_t v75 = vaddq_s16(v73, v74);
    int16x8_t v76 = vaddq_s16(v72, v75);
    int16x8_t v77 = vqrdmulhq_n_s16(v76, 17734);
    int16x8_t v78 = vaddq_s16(v69, v77);
    int16x8_t v79 = vqrdmulhq_n_s16(v78, 16705);
    int16x8_t v80_tmp = vqrdmulhq_n_s16(v67, 13573);
    int16x8_t v80 = vaddq_s16(v80_tmp, v67);
    int16x8_t v81 = vaddq_s16(v64, v72);
    int16x8_t v82 = vaddq_s16(v80, v81);
    int16x8_t v83 = vqrdmulhq_n_s16(v82, 16705);
    int16x8_t v84 = vaddq_s16(v79, v83);
    int16x8_t v85 = vld1q_s16(in + in_stride * 1 + i);
    int16x8_t v86_tmp = vqrdmulhq_n_s16(v85, 13573);
    int16x8_t v86 = vaddq_s16(v86_tmp, v85);
    int16x8_t v87 = vld1q_s16(in + in_stride * 17 + i);
    int16x8_t v88 = vld1q_s16(in + in_stride * 15 + i);
    int16x8_t v89 = vaddq_s16(v87, v88);
    int16x8_t v90 = vaddq_s16(v86, v89);
    int16x8_t v91 = vld1q_s16(in + in_stride * 9 + i);
    int16x8_t v92 = vld1q_s16(in + in_stride * 7 + i);
    int16x8_t v93 = vaddq_s16(v91, v92);
    int16x8_t v94 = vqrdmulhq_n_s16(v93, 25080);
    int16x8_t v95 = vld1q_s16(in + in_stride * 25 + i);
    int16x8_t v96 = vld1q_s16(in + in_stride * 23 + i);
    int16x8_t v97 = vaddq_s16(v95, v96);
    int16x8_t v98 = vaddq_s16(v97, v93);
    int16x8_t v99 = vqrdmulhq_n_s16(v98, 17734);
    int16x8_t v100 = vaddq_s16(v94, v99);
    int16x8_t v101 = vaddq_s16(v90, v100);
    int16x8_t v102 = vaddq_s16(v84, v101);
    int16x8_t v103 = vaddq_s16(v92, v65);
    int16x8_t v104 = vaddq_s16(v66, v85);
    int16x8_t v105 = vaddq_s16(v103, v104);
    int16x8_t v106_tmp = vqrdmulhq_n_s16(v105, 13573);
    int16x8_t v106 = vaddq_s16(v106_tmp, v105);
    int16x8_t v107 = vaddq_s16(v96, v70);
    int16x8_t v108 = vaddq_s16(v71, v87);
    int16x8_t v109 = vaddq_s16(v107, v108);
    int16x8_t v110 = vaddq_s16(v63, v91);
    int16x8_t v111 = vaddq_s16(v88, v62);
    int16x8_t v112 = vaddq_s16(v110, v111);
    int16x8_t v113 = vaddq_s16(v109, v112);
    int16x8_t v114 = vaddq_s16(v106, v113);
    int16x8_t v115 = vqrdmulhq_n_s16(v114, 16705);
    int16x8_t v116 = vaddq_s16(v112, v105);
    int16x8_t v117 = vqrdmulhq_n_s16(v116, 25080);
    int16x8_t v118 = vqrdmulhq_n_s16(v116, 17734);
    int16x8_t v119 = vaddq_s16(v74, v95);
    int16x8_t v120 = vld1q_s16(in + in_stride * 31 + i);
    int16x8_t v121 = vaddq_s16(v120, v73);
    int16x8_t v122 = vaddq_s16(v119, v121);
    int16x8_t v123 = vaddq_s16(v122, v109);
    int16x8_t v124 = vqrdmulhq_n_s16(v123, 17734);
    int16x8_t v125 = vaddq_s16(v118, v124);
    int16x8_t v126 = vaddq_s16(v117, v125);
    int16x8_t v127 = vqrdmulhq_n_s16(v126, 16705);
    int16x8_t v128 = vaddq_s16(v115, v127);
    int16x8_t v129 = vqrdmulhq_n_s16(v128, 16463);
    int16x8_t v130_tmp = vqrdmulhq_n_s16(v104, 13573);
    int16x8_t v130 = vaddq_s16(v130_tmp, v104);
    int16x8_t v131 = vaddq_s16(v108, v111);
    int16x8_t v132 = vaddq_s16(v130, v131);
    int16x8_t v133 = vaddq_s16(v119, v107);
    int16x8_t v134 = vqrdmulhq_n_s16(v133, 17734);
    int16x8_t v135 = vaddq_s16(v110, v103);
    int16x8_t v136_tmp = vqrdmulhq_n_s16(v135, 10045);
    int16x8_t v136 = vaddq_s16(v136_tmp, v135);
    int16x8_t v137 = vaddq_s16(v134, v136);
    int16x8_t v138 = vaddq_s16(v132, v137);
    int16x8_t v139 = vqrdmulhq_n_s16(v138, 16463);
    int16x8_t v140 = vaddq_s16(v129, v139);
    int16x8_t v141 = vaddq_s16(v102, v140);
    int16x8_t v142 = vqrdmulhq_n_s16(v141, 16404);
    // Output row 0: even-row term plus odd-row term.
    int16x8_t v143 = vaddq_s16(v61, v142);
    // From here on no further input is loaded: the intermediates above are
    // recombined (largely through differences) to form output rows 1..15.
    int16x8_t v144 = vsubq_s16(v0, v1);
    int16x8_t v145 = vsubq_s16(v4, v6);
    int16x8_t v146_tmp = vqrdmulhq_n_s16(v145, 10045);
    int16x8_t v146 = vaddq_s16(v146_tmp, v145);
    int16x8_t v147 = vaddq_s16(v144, v146);
    int16x8_t v148 = vsubq_s16(v11, v14);
    int16x8_t v149 = vqrdmulhq_n_s16(v18, 17734);
    int16x8_t v150_tmp = vqrdmulhq_n_s16(v17, 10045);
    int16x8_t v150 = vaddq_s16(v150_tmp, v17);
    int16x8_t v151 = vsubq_s16(v149, v150);
    int16x8_t v152 = vaddq_s16(v148, v151);
    int16x8_t v153 = vqrdmulhq_n_s16(v152, 19705);
    int16x8_t v154 = vaddq_s16(v147, v153);
    int16x8_t v155 = vsubq_s16(v27, v30);
    int16x8_t v156 = vqrdmulhq_n_s16(v34, 17734);
    int16x8_t v157_tmp = vqrdmulhq_n_s16(v38, 10045);
    int16x8_t v157 = vaddq_s16(v157_tmp, v38);
    int16x8_t v158 = vsubq_s16(v156, v157);
    int16x8_t v159 = vaddq_s16(v155, v158);
    int16x8_t v160 = vqrdmulhq_n_s16(v54, 13573);
    int16x8_t v161 = vsubq_s16(v160, v52);
    int16x8_t v162 = vqrdmulhq_n_s16(v161, 25746);
    int16x8_t v163 = vsubq_s16(v44, v47);
    int16x8_t v164 = vqrdmulhq_n_s16(v163, 19705);
    int16x8_t v165 = vaddq_s16(v162, v164);
    int16x8_t v166 = vaddq_s16(v159, v165);
    int16x8_t v167 = vqrdmulhq_n_s16(v166, 17121);
    int16x8_t v168 = vaddq_s16(v154, v167);
    int16x8_t v169 = vsubq_s16(v86, v89);
    int16x8_t v170 = vqrdmulhq_n_s16(v93, 17734);
    int16x8_t v171_tmp = vqrdmulhq_n_s16(v97, 10045);
    int16x8_t v171 = vaddq_s16(v171_tmp, v97);
    int16x8_t v172 = vsubq_s16(v170, v171);
    int16x8_t v173 = vaddq_s16(v169, v172);
    int16x8_t v174 = vsubq_s16(v80, v81);
    int16x8_t v175 = vqrdmulhq_n_s16(v174, 19705);
    int16x8_t v176 = vqrdmulhq_n_s16(v68, 13573);
    int16x8_t v177 = vsubq_s16(v176, v76);
    int16x8_t v178 = vqrdmulhq_n_s16(v177, 25746);
    int16x8_t v179 = vaddq_s16(v175, v178);
    int16x8_t v180 = vaddq_s16(v173, v179);
    int16x8_t v181 = vsubq_s16(v130, v131);
    int16x8_t v182 = vqrdmulhq_n_s16(v135, 13573);
    int16x8_t v183 = vsubq_s16(v182, v133);
    int16x8_t v184_tmp = vqrdmulhq_n_s16(v183, 10045);
    int16x8_t v184 = vaddq_s16(v184_tmp, v183);
    int16x8_t v185 = vaddq_s16(v181, v184);
    int16x8_t v186 = vqrdmulhq_n_s16(v185, 17121);
    int16x8_t v187 = vqrdmulhq_n_s16(v105, 27867);
    int16x8_t v188 = vqrdmulhq_n_s16(v113, 19705);
    int16x8_t v189 = vsubq_s16(v187, v188);
    int16x8_t v190 = vqrdmulhq_n_s16(v116, 13573);
    int16x8_t v191 = vsubq_s16(v190, v123);
    int16x8_t v192 = vqrdmulhq_n_s16(v191, 25746);
    int16x8_t v193 = vaddq_s16(v189, v192);
    int16x8_t v194 = vqrdmulhq_n_s16(v193, 17121);
    int16x8_t v195 = vaddq_s16(v186, v194);
    int16x8_t v196 = vaddq_s16(v180, v195);
    int16x8_t v197 = vqrdmulhq_n_s16(v196, 16563);
    int16x8_t v198 = vaddq_s16(v168, v197);
    int16x8_t v199 = vsubq_s16(v144, v146);
    int16x8_t v200 = vsubq_s16(v148, v151);
    int16x8_t v201 = vqrdmulhq_n_s16(v200, 29490);
    int16x8_t v202 = vaddq_s16(v199, v201);
    int16x8_t v203 = vsubq_s16(v155, v158);
    int16x8_t v204 = vqrdmulhq_n_s16(v163, 29490);
    int16x8_t v205_tmp = vqrdmulhq_n_s16(v161, 5763);
    int16x8_t v205 = vaddq_s16(v205_tmp, v161);
    int16x8_t v206 = vsubq_s16(v204, v205);
    int16x8_t v207 = vaddq_s16(v203, v206);
    int16x8_t v208 = vqrdmulhq_n_s16(v207, 18578);
    int16x8_t v209 = vaddq_s16(v202, v208);
    int16x8_t v210 = vsubq_s16(v169, v172);
    int16x8_t v211 = vqrdmulhq_n_s16(v174, 29490);
    int16x8_t v212_tmp = vqrdmulhq_n_s16(v177, 5763);
    int16x8_t v212 = vaddq_s16(v212_tmp, v177);
    int16x8_t v213 = vsubq_s16(v211, v212);
    int16x8_t v214 = vaddq_s16(v210, v213);
    int16x8_t v215 = vsubq_s16(v181, v184);
    int16x8_t v216 = vqrdmulhq_n_s16(v215, 18578);
    int16x8_t v217 = vqrdmulhq_n_s16(v189, 27803);
    int16x8_t v218 = vqrdmulhq_n_s16(v191, 21845);
    int16x8_t v219 = vsubq_s16(v217, v218);
    int16x8_t v220 = vaddq_s16(v216, v219);
    int16x8_t v221 = vaddq_s16(v214, v220);
    int16x8_t v222 = vqrdmulhq_n_s16(v221, 16890);
    int16x8_t v223 = vaddq_s16(v209, v222);
    int16x8_t v224 = vsubq_s16(v2, v8);
    int16x8_t v225 = vsubq_s16(v15, v22);
    // Scale by (2 + 18446 / 32768): vmlaq_n_s16(tmp, x, 2) = tmp + x * 2.
    int16x8_t v226_tmp = vqrdmulhq_n_s16(v225, 18446);
    int16x8_t v226 = vmlaq_n_s16(v226_tmp, v225, 2);
    int16x8_t v227 = vaddq_s16(v224, v226);
    int16x8_t v228 = vsubq_s16(v31, v41);
    int16x8_t v229 = vsubq_s16(v48, v56);
    int16x8_t v230_tmp = vqrdmulhq_n_s16(v229, 18446);
    int16x8_t v230 = vmlaq_n_s16(v230_tmp, v229, 2);
    int16x8_t v231 = vaddq_s16(v228, v230);
    int16x8_t v232 = vqrdmulhq_n_s16(v231, 21195);
    int16x8_t v233 = vaddq_s16(v227, v232);
    int16x8_t v234 = vsubq_s16(v82, v78);
    int16x8_t v235_tmp = vqrdmulhq_n_s16(v234, 18446);
    int16x8_t v235 = vmlaq_n_s16(v235_tmp, v234, 2);
    int16x8_t v236 = vsubq_s16(v90, v100);
    int16x8_t v237 = vaddq_s16(v235, v236);
    int16x8_t v238 = vsubq_s16(v132, v137);
    int16x8_t v239 = vsubq_s16(v114, v126);
    int16x8_t v240_tmp = vqrdmulhq_n_s16(v239, 18446);
    int16x8_t v240 = vmlaq_n_s16(v240_tmp, v239, 2);
    int16x8_t v241 = vaddq_s16(v238, v240);
    int16x8_t v242 = vqrdmulhq_n_s16(v241, 21195);
    int16x8_t v243 = vaddq_s16(v237, v242);
    int16x8_t v244 = vqrdmulhq_n_s16(v243, 17401);
    int16x8_t v245 = vaddq_s16(v233, v244);
    int16x8_t v246 = vsubq_s16(v228, v230);
    int16x8_t v247 = vqrdmulhq_n_s16(v246, 25826);
    int16x8_t v248 = vsubq_s16(v224, v226);
    int16x8_t v249 = vaddq_s16(v247, v248);
    int16x8_t v250 = vsubq_s16(v238, v240);
    int16x8_t v251 = vqrdmulhq_n_s16(v250, 25826);
    int16x8_t v252 = vsubq_s16(v236, v235);
    int16x8_t v253 = vaddq_s16(v251, v252);
    int16x8_t v254 = vqrdmulhq_n_s16(v253, 18124);
    int16x8_t v255 = vaddq_s16(v249, v254);
    int16x8_t v256 = vsubq_s16(v199, v201);
    int16x8_t v257 = vsubq_s16(v203, v206);
    int16x8_t v258_tmp = vqrdmulhq_n_s16(v257, 1988);
    int16x8_t v258 = vaddq_s16(v258_tmp, v257);
    int16x8_t v259 = vaddq_s16(v256, v258);
    int16x8_t v260 = vsubq_s16(v210, v213);
    int16x8_t v261_tmp = vqrdmulhq_n_s16(v219, 25030);
    int16x8_t v261 = vaddq_s16(v261_tmp, v219);
    int16x8_t v262 = vsubq_s16(v215, v261);
    int16x8_t v263_tmp = vqrdmulhq_n_s16(v262, 1988);
    int16x8_t v263 = vaddq_s16(v263_tmp, v262);
    int16x8_t v264 = vaddq_s16(v260, v263);
    int16x8_t v265 = vqrdmulhq_n_s16(v264, 19102);
    int16x8_t v266 = vaddq_s16(v259, v265);
    int16x8_t v267 = vsubq_s16(v147, v153);
    int16x8_t v268 = vsubq_s16(v159, v165);
    int16x8_t v269_tmp = vqrdmulhq_n_s16(v268, 23673);
    int16x8_t v269 = vaddq_s16(v269_tmp, v268);
    int16x8_t v270 = vaddq_s16(v267, v269);
    int16x8_t v271 = vsubq_s16(v173, v179);
    int16x8_t v272 = vsubq_s16(v185, v193);
    int16x8_t v273_tmp = vqrdmulhq_n_s16(v272, 23673);
    int16x8_t v273 = vaddq_s16(v273_tmp, v272);
    int16x8_t v274 = vaddq_s16(v271, v273);
    int16x8_t v275 = vqrdmulhq_n_s16(v274, 20398);
    int16x8_t v276 = vaddq_s16(v270, v275);
    int16x8_t v277 = vsubq_s16(v9, v24);
    int16x8_t v278 = vsubq_s16(v42, v58);
    int16x8_t v279_tmp = vqrdmulhq_n_s16(v278, 3314);
    int16x8_t v279 = vmlaq_n_s16(v279_tmp, v278, 5);
    int16x8_t v280 = vaddq_s16(v277, v279);
    int16x8_t v281 = vsubq_s16(v138, v128);
    int16x8_t v282_tmp = vqrdmulhq_n_s16(v281, 3314);
    int16x8_t v282 = vmlaq_n_s16(v282_tmp, v281, 5);
    int16x8_t v283 = vsubq_s16(v101, v84);
    int16x8_t v284 = vaddq_s16(v282, v283);
    int16x8_t v285 = vqrdmulhq_n_s16(v284, 22112);
    int16x8_t v286 = vaddq_s16(v280, v285);
    int16x8_t v287 = vsubq_s16(v277, v279);
    int16x8_t v288 = vsubq_s16(v283, v282);
    int16x8_t v289 = vqrdmulhq_n_s16(v288, 24397);
    int16x8_t v290 = vaddq_s16(v287, v289);
    int16x8_t v291 = vsubq_s16(v267, v269);
    int16x8_t v292 = vsubq_s16(v271, v273);
    int16x8_t v293 = vqrdmulhq_n_s16(v292, 27504);
    int16x8_t v294 = vaddq_s16(v291, v293);
    int16x8_t v295 = vsubq_s16(v260, v263);
    int16x8_t v296 = vqrdmulhq_n_s16(v295, 31869);
    int16x8_t v297 = vsubq_s16(v256, v258);
    int16x8_t v298 = vaddq_s16(v296, v297);
    int16x8_t v299 = vsubq_s16(v248, v247);
    int16x8_t v300 = vsubq_s16(v252, v251);
    int16x8_t v301_tmp = vqrdmulhq_n_s16(v300, 5552);
    int16x8_t v301 = vaddq_s16(v301_tmp, v300);
    int16x8_t v302 = vaddq_s16(v299, v301);
    int16x8_t v303 = vsubq_s16(v227, v232);
    int16x8_t v304 = vsubq_s16(v237, v242);
    int16x8_t v305_tmp = vqrdmulhq_n_s16(v304, 15865);
    int16x8_t v305 = vaddq_s16(v305_tmp, v304);
    int16x8_t v306 = vaddq_s16(v303, v305);
    int16x8_t v307 = vsubq_s16(v202, v208);
    int16x8_t v308 = vsubq_s16(v214, v220);
    int16x8_t v309_tmp = vqrdmulhq_n_s16(v308, 1893);
    int16x8_t v309 = vmlaq_n_s16(v309_tmp, v308, 2);
    int16x8_t v310 = vaddq_s16(v307, v309);
    int16x8_t v311 = vsubq_s16(v154, v167);
    int16x8_t v312 = vsubq_s16(v180, v195);
    int16x8_t v313_tmp = vqrdmulhq_n_s16(v312, 13357);
    int16x8_t v313 = vmlaq_n_s16(v313_tmp, v312, 3);
    int16x8_t v314 = vaddq_s16(v311, v313);
    int16x8_t v315 = vsubq_s16(v102, v140);
    int16x8_t v316_tmp = vqrdmulhq_n_s16(v315, 6226);
    int16x8_t v316 = vmlaq_n_s16(v316_tmp, v315, 10);
    int16x8_t v317 = vsubq_s16(v25, v60);
    int16x8_t v318 = vaddq_s16(v316, v317);
    // v319..v334: differences of the same operand pairs whose sums form output
    // rows 15 down to 0; they become output rows 16..31, so rows k and 31 - k
    // are the sum and difference of one pair.
    int16x8_t v319 = vsubq_s16(v317, v316);
    int16x8_t v320 = vsubq_s16(v311, v313);
    int16x8_t v321 = vsubq_s16(v307, v309);
    int16x8_t v322 = vsubq_s16(v303, v305);
    int16x8_t v323 = vsubq_s16(v299, v301);
    int16x8_t v324 = vsubq_s16(v297, v296);
    int16x8_t v325 = vsubq_s16(v291, v293);
    int16x8_t v326 = vsubq_s16(v287, v289);
    int16x8_t v327 = vsubq_s16(v280, v285);
    int16x8_t v328 = vsubq_s16(v270, v275);
    int16x8_t v329 = vsubq_s16(v259, v265);
    int16x8_t v330 = vsubq_s16(v249, v254);
    int16x8_t v331 = vsubq_s16(v233, v244);
    int16x8_t v332 = vsubq_s16(v209, v222);
    int16x8_t v333 = vsubq_s16(v168, v197);
    int16x8_t v334 = vsubq_s16(v61, v142);
    // Store the 32 output rows for this group of 8 columns.
    vst1q_s16(out + out_stride * 0 + i, v143);
    vst1q_s16(out + out_stride * 1 + i, v198);
    vst1q_s16(out + out_stride * 2 + i, v223);
    vst1q_s16(out + out_stride * 3 + i, v245);
    vst1q_s16(out + out_stride * 4 + i, v255);
    vst1q_s16(out + out_stride * 5 + i, v266);
    vst1q_s16(out + out_stride * 6 + i, v276);
    vst1q_s16(out + out_stride * 7 + i, v286);
    vst1q_s16(out + out_stride * 8 + i, v290);
    vst1q_s16(out + out_stride * 9 + i, v294);
    vst1q_s16(out + out_stride * 10 + i, v298);
    vst1q_s16(out + out_stride * 11 + i, v302);
    vst1q_s16(out + out_stride * 12 + i, v306);
    vst1q_s16(out + out_stride * 13 + i, v310);
    vst1q_s16(out + out_stride * 14 + i, v314);
    vst1q_s16(out + out_stride * 15 + i, v318);
    vst1q_s16(out + out_stride * 16 + i, v319);
    vst1q_s16(out + out_stride * 17 + i, v320);
    vst1q_s16(out + out_stride * 18 + i, v321);
    vst1q_s16(out + out_stride * 19 + i, v322);
    vst1q_s16(out + out_stride * 20 + i, v323);
    vst1q_s16(out + out_stride * 21 + i, v324);
    vst1q_s16(out + out_stride * 22 + i, v325);
    vst1q_s16(out + out_stride * 23 + i, v326);
    vst1q_s16(out + out_stride * 24 + i, v327);
    vst1q_s16(out + out_stride * 25 + i, v328);
    vst1q_s16(out + out_stride * 26 + i, v329);
    vst1q_s16(out + out_stride * 27 + i, v330);
    vst1q_s16(out + out_stride * 28 + i, v331);
    vst1q_s16(out + out_stride * 29 + i, v332);
    vst1q_s16(out + out_stride * 30 + i, v333);
    vst1q_s16(out + out_stride * 31 + i, v334);
  }
}