diff options
Diffstat (limited to 'third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h')
-rw-r--r-- | third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h | 2137 |
1 files changed, 2137 insertions, 0 deletions
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h new file mode 100644 index 0000000000..1a94d3ee92 --- /dev/null +++ b/third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h @@ -0,0 +1,2137 @@ +// Copyright (c) the JPEG XL Project Authors. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* This file is automatically generated. Do not modify it directly. */ +#if HWY_TARGET != HWY_NEON +#error "only include this file from fast_dct-inl.h" +#endif + +constexpr size_t FastIDCTIntegerBits(FastDCTTag<128>) { return 2; } + +void FastIDCT(FastDCTTag<128>, const int16_t* in, size_t in_stride, + int16_t* out, size_t out_stride, size_t count) { + JXL_ASSERT(count % 8 == 0); + for (size_t i = 0; i < count; i += 8) { + int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i); + int16x8_t v1 = vld1q_s16(in + in_stride * 64 + i); + int16x8_t v2 = vaddq_s16(v0, v1); + int16x8_t v3 = vld1q_s16(in + in_stride * 32 + i); + int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573); + int16x8_t v4 = vaddq_s16(v4_tmp, v3); + int16x8_t v5 = vld1q_s16(in + in_stride * 96 + i); + int16x8_t v6 = vaddq_s16(v5, v3); + int16x8_t v7 = vaddq_s16(v4, v6); + int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734); + int16x8_t v9 = vaddq_s16(v2, v8); + int16x8_t v10 = vld1q_s16(in + in_stride * 16 + i); + int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573); + int16x8_t v11 = vaddq_s16(v11_tmp, v10); + int16x8_t v12 = vld1q_s16(in + in_stride * 80 + i); + int16x8_t v13 = vld1q_s16(in + in_stride * 48 + i); + int16x8_t v14 = vaddq_s16(v12, v13); + int16x8_t v15 = vaddq_s16(v11, v14); + int16x8_t v16 = vaddq_s16(v13, v10); + int16x8_t v17_tmp = vqrdmulhq_n_s16(v16, 13573); + int16x8_t v17 = vaddq_s16(v17_tmp, v16); + int16x8_t v18 = vld1q_s16(in + in_stride * 112 + i); + int16x8_t v19 = vaddq_s16(v18, v12); + int16x8_t v20 = vaddq_s16(v19, v16); + int16x8_t v21 = vaddq_s16(v17, v20); + int16x8_t v22 = vqrdmulhq_n_s16(v21, 17734); + int16x8_t v23 = vaddq_s16(v15, v22); + int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705); + int16x8_t v25 = vaddq_s16(v9, v24); + int16x8_t v26 = vld1q_s16(in + in_stride * 8 + i); + int16x8_t v27_tmp = vqrdmulhq_n_s16(v26, 13573); + int16x8_t v27 = vaddq_s16(v27_tmp, v26); + int16x8_t v28 = vld1q_s16(in + in_stride * 72 + i); + int16x8_t v29 = vld1q_s16(in + in_stride * 56 + i); + int16x8_t v30 = vaddq_s16(v28, v29); + int16x8_t v31 = vaddq_s16(v27, v30); + int16x8_t v32 = vld1q_s16(in + in_stride * 40 + i); + int16x8_t v33 = vld1q_s16(in + in_stride * 24 + i); + int16x8_t v34 = vaddq_s16(v32, v33); + int16x8_t v35_tmp = vqrdmulhq_n_s16(v34, 13573); + int16x8_t v35 = vaddq_s16(v35_tmp, v34); + int16x8_t v36 = vld1q_s16(in + in_stride * 104 + i); + int16x8_t v37 = vld1q_s16(in + in_stride * 88 + i); + int16x8_t v38 = vaddq_s16(v36, v37); + int16x8_t v39 = vaddq_s16(v38, v34); + int16x8_t v40 = vaddq_s16(v35, v39); + int16x8_t v41 = vqrdmulhq_n_s16(v40, 17734); + int16x8_t v42 = vaddq_s16(v31, v41); + int16x8_t v43 = vaddq_s16(v33, v26); + int16x8_t v44_tmp = vqrdmulhq_n_s16(v43, 13573); + int16x8_t v44 = vaddq_s16(v44_tmp, v43); + int16x8_t v45 = vaddq_s16(v37, v28); + int16x8_t v46 = vaddq_s16(v29, v32); + int16x8_t v47 = vaddq_s16(v45, v46); + int16x8_t v48 = vaddq_s16(v44, v47); + int16x8_t v49 = vaddq_s16(v46, v43); + int16x8_t v50_tmp = vqrdmulhq_n_s16(v49, 13573); + int16x8_t v50 = vaddq_s16(v50_tmp, v49); + int16x8_t v51 = vld1q_s16(in + in_stride * 120 + i); + int16x8_t v52 = vaddq_s16(v51, v36); + int16x8_t v53 = vaddq_s16(v52, v45); + int16x8_t v54 = vaddq_s16(v53, v49); + int16x8_t v55 = vaddq_s16(v50, v54); + int16x8_t v56 = vqrdmulhq_n_s16(v55, 17734); + int16x8_t v57 = vaddq_s16(v48, v56); + int16x8_t v58 = vqrdmulhq_n_s16(v57, 16705); + int16x8_t v59 = vaddq_s16(v42, v58); + int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463); + int16x8_t v61 = vaddq_s16(v25, v60); + int16x8_t v62 = vld1q_s16(in + in_stride * 4 + i); + int16x8_t v63_tmp = vqrdmulhq_n_s16(v62, 13573); + int16x8_t v63 = vaddq_s16(v63_tmp, v62); + int16x8_t v64 = vld1q_s16(in + in_stride * 68 + i); + int16x8_t v65 = vld1q_s16(in + in_stride * 60 + i); + int16x8_t v66 = vaddq_s16(v64, v65); + int16x8_t v67 = vaddq_s16(v63, v66); + int16x8_t v68 = vld1q_s16(in + in_stride * 36 + i); + int16x8_t v69 = vld1q_s16(in + in_stride * 28 + i); + int16x8_t v70 = vaddq_s16(v68, v69); + int16x8_t v71_tmp = vqrdmulhq_n_s16(v70, 13573); + int16x8_t v71 = vaddq_s16(v71_tmp, v70); + int16x8_t v72 = vld1q_s16(in + in_stride * 100 + i); + int16x8_t v73 = vld1q_s16(in + in_stride * 92 + i); + int16x8_t v74 = vaddq_s16(v72, v73); + int16x8_t v75 = vaddq_s16(v74, v70); + int16x8_t v76 = vaddq_s16(v71, v75); + int16x8_t v77 = vqrdmulhq_n_s16(v76, 17734); + int16x8_t v78 = vaddq_s16(v67, v77); + int16x8_t v79 = vld1q_s16(in + in_stride * 20 + i); + int16x8_t v80 = vld1q_s16(in + in_stride * 12 + i); + int16x8_t v81 = vaddq_s16(v79, v80); + int16x8_t v82_tmp = vqrdmulhq_n_s16(v81, 13573); + int16x8_t v82 = vaddq_s16(v82_tmp, v81); + int16x8_t v83 = vld1q_s16(in + in_stride * 84 + i); + int16x8_t v84 = vld1q_s16(in + in_stride * 76 + i); + int16x8_t v85 = vaddq_s16(v83, v84); + int16x8_t v86 = vld1q_s16(in + in_stride * 52 + i); + int16x8_t v87 = vld1q_s16(in + in_stride * 44 + i); + int16x8_t v88 = vaddq_s16(v86, v87); + int16x8_t v89 = vaddq_s16(v85, v88); + int16x8_t v90 = vaddq_s16(v82, v89); + int16x8_t v91 = vaddq_s16(v88, v81); + int16x8_t v92_tmp = vqrdmulhq_n_s16(v91, 13573); + int16x8_t v92 = vaddq_s16(v92_tmp, v91); + int16x8_t v93 = vld1q_s16(in + in_stride * 116 + i); + int16x8_t v94 = vld1q_s16(in + in_stride * 108 + i); + int16x8_t v95 = vaddq_s16(v93, v94); + int16x8_t v96 = vaddq_s16(v95, v85); + int16x8_t v97 = vaddq_s16(v96, v91); + int16x8_t v98 = vaddq_s16(v92, v97); + int16x8_t v99 = vqrdmulhq_n_s16(v98, 17734); + int16x8_t v100 = vaddq_s16(v90, v99); + int16x8_t v101 = vqrdmulhq_n_s16(v100, 16705); + int16x8_t v102 = vaddq_s16(v78, v101); + int16x8_t v103 = vaddq_s16(v80, v62); + int16x8_t v104_tmp = vqrdmulhq_n_s16(v103, 13573); + int16x8_t v104 = vaddq_s16(v104_tmp, v103); + int16x8_t v105 = vaddq_s16(v84, v64); + int16x8_t v106 = vaddq_s16(v65, v86); + int16x8_t v107 = vaddq_s16(v105, v106); + int16x8_t v108 = vaddq_s16(v104, v107); + int16x8_t v109 = vaddq_s16(v87, v68); + int16x8_t v110 = vaddq_s16(v69, v79); + int16x8_t v111 = vaddq_s16(v109, v110); + int16x8_t v112_tmp = vqrdmulhq_n_s16(v111, 13573); + int16x8_t v112 = vaddq_s16(v112_tmp, v111); + int16x8_t v113 = vaddq_s16(v94, v72); + int16x8_t v114 = vaddq_s16(v73, v83); + int16x8_t v115 = vaddq_s16(v113, v114); + int16x8_t v116 = vaddq_s16(v115, v111); + int16x8_t v117 = vaddq_s16(v112, v116); + int16x8_t v118 = vqrdmulhq_n_s16(v117, 17734); + int16x8_t v119 = vaddq_s16(v108, v118); + int16x8_t v120 = vaddq_s16(v110, v103); + int16x8_t v121_tmp = vqrdmulhq_n_s16(v120, 13573); + int16x8_t v121 = vaddq_s16(v121_tmp, v120); + int16x8_t v122 = vaddq_s16(v114, v105); + int16x8_t v123 = vaddq_s16(v106, v109); + int16x8_t v124 = vaddq_s16(v122, v123); + int16x8_t v125 = vaddq_s16(v121, v124); + int16x8_t v126 = vaddq_s16(v123, v120); + int16x8_t v127_tmp = vqrdmulhq_n_s16(v126, 13573); + int16x8_t v127 = vaddq_s16(v127_tmp, v126); + int16x8_t v128 = vld1q_s16(in + in_stride * 124 + i); + int16x8_t v129 = vaddq_s16(v128, v93); + int16x8_t v130 = vaddq_s16(v129, v113); + int16x8_t v131 = vaddq_s16(v130, v122); + int16x8_t v132 = vaddq_s16(v131, v126); + int16x8_t v133 = vaddq_s16(v127, v132); + int16x8_t v134 = vqrdmulhq_n_s16(v133, 17734); + int16x8_t v135 = vaddq_s16(v125, v134); + int16x8_t v136 = vqrdmulhq_n_s16(v135, 16705); + int16x8_t v137 = vaddq_s16(v119, v136); + int16x8_t v138 = vqrdmulhq_n_s16(v137, 16463); + int16x8_t v139 = vaddq_s16(v102, v138); + int16x8_t v140 = vqrdmulhq_n_s16(v139, 16404); + int16x8_t v141 = vaddq_s16(v61, v140); + int16x8_t v142 = vld1q_s16(in + in_stride * 2 + i); + int16x8_t v143_tmp = vqrdmulhq_n_s16(v142, 13573); + int16x8_t v143 = vaddq_s16(v143_tmp, v142); + int16x8_t v144 = vld1q_s16(in + in_stride * 66 + i); + int16x8_t v145 = vld1q_s16(in + in_stride * 62 + i); + int16x8_t v146 = vaddq_s16(v144, v145); + int16x8_t v147 = vaddq_s16(v143, v146); + int16x8_t v148 = vld1q_s16(in + in_stride * 34 + i); + int16x8_t v149 = vld1q_s16(in + in_stride * 30 + i); + int16x8_t v150 = vaddq_s16(v148, v149); + int16x8_t v151_tmp = vqrdmulhq_n_s16(v150, 13573); + int16x8_t v151 = vaddq_s16(v151_tmp, v150); + int16x8_t v152 = vld1q_s16(in + in_stride * 98 + i); + int16x8_t v153 = vld1q_s16(in + in_stride * 94 + i); + int16x8_t v154 = vaddq_s16(v152, v153); + int16x8_t v155 = vaddq_s16(v154, v150); + int16x8_t v156 = vaddq_s16(v151, v155); + int16x8_t v157 = vqrdmulhq_n_s16(v156, 17734); + int16x8_t v158 = vaddq_s16(v147, v157); + int16x8_t v159 = vld1q_s16(in + in_stride * 18 + i); + int16x8_t v160 = vld1q_s16(in + in_stride * 14 + i); + int16x8_t v161 = vaddq_s16(v159, v160); + int16x8_t v162_tmp = vqrdmulhq_n_s16(v161, 13573); + int16x8_t v162 = vaddq_s16(v162_tmp, v161); + int16x8_t v163 = vld1q_s16(in + in_stride * 82 + i); + int16x8_t v164 = vld1q_s16(in + in_stride * 78 + i); + int16x8_t v165 = vaddq_s16(v163, v164); + int16x8_t v166 = vld1q_s16(in + in_stride * 50 + i); + int16x8_t v167 = vld1q_s16(in + in_stride * 46 + i); + int16x8_t v168 = vaddq_s16(v166, v167); + int16x8_t v169 = vaddq_s16(v165, v168); + int16x8_t v170 = vaddq_s16(v162, v169); + int16x8_t v171 = vaddq_s16(v168, v161); + int16x8_t v172_tmp = vqrdmulhq_n_s16(v171, 13573); + int16x8_t v172 = vaddq_s16(v172_tmp, v171); + int16x8_t v173 = vld1q_s16(in + in_stride * 114 + i); + int16x8_t v174 = vld1q_s16(in + in_stride * 110 + i); + int16x8_t v175 = vaddq_s16(v173, v174); + int16x8_t v176 = vaddq_s16(v175, v165); + int16x8_t v177 = vaddq_s16(v176, v171); + int16x8_t v178 = vaddq_s16(v172, v177); + int16x8_t v179 = vqrdmulhq_n_s16(v178, 17734); + int16x8_t v180 = vaddq_s16(v170, v179); + int16x8_t v181 = vqrdmulhq_n_s16(v180, 16705); + int16x8_t v182 = vaddq_s16(v158, v181); + int16x8_t v183 = vld1q_s16(in + in_stride * 10 + i); + int16x8_t v184 = vld1q_s16(in + in_stride * 6 + i); + int16x8_t v185 = vaddq_s16(v183, v184); + int16x8_t v186_tmp = vqrdmulhq_n_s16(v185, 13573); + int16x8_t v186 = vaddq_s16(v186_tmp, v185); + int16x8_t v187 = vld1q_s16(in + in_stride * 74 + i); + int16x8_t v188 = vld1q_s16(in + in_stride * 70 + i); + int16x8_t v189 = vaddq_s16(v187, v188); + int16x8_t v190 = vld1q_s16(in + in_stride * 58 + i); + int16x8_t v191 = vld1q_s16(in + in_stride * 54 + i); + int16x8_t v192 = vaddq_s16(v190, v191); + int16x8_t v193 = vaddq_s16(v189, v192); + int16x8_t v194 = vaddq_s16(v186, v193); + int16x8_t v195 = vld1q_s16(in + in_stride * 42 + i); + int16x8_t v196 = vld1q_s16(in + in_stride * 38 + i); + int16x8_t v197 = vaddq_s16(v195, v196); + int16x8_t v198 = vld1q_s16(in + in_stride * 26 + i); + int16x8_t v199 = vld1q_s16(in + in_stride * 22 + i); + int16x8_t v200 = vaddq_s16(v198, v199); + int16x8_t v201 = vaddq_s16(v197, v200); + int16x8_t v202_tmp = vqrdmulhq_n_s16(v201, 13573); + int16x8_t v202 = vaddq_s16(v202_tmp, v201); + int16x8_t v203 = vld1q_s16(in + in_stride * 106 + i); + int16x8_t v204 = vld1q_s16(in + in_stride * 102 + i); + int16x8_t v205 = vaddq_s16(v203, v204); + int16x8_t v206 = vld1q_s16(in + in_stride * 90 + i); + int16x8_t v207 = vld1q_s16(in + in_stride * 86 + i); + int16x8_t v208 = vaddq_s16(v206, v207); + int16x8_t v209 = vaddq_s16(v205, v208); + int16x8_t v210 = vaddq_s16(v209, v201); + int16x8_t v211 = vaddq_s16(v202, v210); + int16x8_t v212 = vqrdmulhq_n_s16(v211, 17734); + int16x8_t v213 = vaddq_s16(v194, v212); + int16x8_t v214 = vaddq_s16(v200, v185); + int16x8_t v215_tmp = vqrdmulhq_n_s16(v214, 13573); + int16x8_t v215 = vaddq_s16(v215_tmp, v214); + int16x8_t v216 = vaddq_s16(v208, v189); + int16x8_t v217 = vaddq_s16(v192, v197); + int16x8_t v218 = vaddq_s16(v216, v217); + int16x8_t v219 = vaddq_s16(v215, v218); + int16x8_t v220 = vaddq_s16(v217, v214); + int16x8_t v221_tmp = vqrdmulhq_n_s16(v220, 13573); + int16x8_t v221 = vaddq_s16(v221_tmp, v220); + int16x8_t v222 = vld1q_s16(in + in_stride * 122 + i); + int16x8_t v223 = vld1q_s16(in + in_stride * 118 + i); + int16x8_t v224 = vaddq_s16(v222, v223); + int16x8_t v225 = vaddq_s16(v224, v205); + int16x8_t v226 = vaddq_s16(v225, v216); + int16x8_t v227 = vaddq_s16(v226, v220); + int16x8_t v228 = vaddq_s16(v221, v227); + int16x8_t v229 = vqrdmulhq_n_s16(v228, 17734); + int16x8_t v230 = vaddq_s16(v219, v229); + int16x8_t v231 = vqrdmulhq_n_s16(v230, 16705); + int16x8_t v232 = vaddq_s16(v213, v231); + int16x8_t v233 = vqrdmulhq_n_s16(v232, 16463); + int16x8_t v234 = vaddq_s16(v182, v233); + int16x8_t v235 = vaddq_s16(v184, v142); + int16x8_t v236_tmp = vqrdmulhq_n_s16(v235, 13573); + int16x8_t v236 = vaddq_s16(v236_tmp, v235); + int16x8_t v237 = vaddq_s16(v188, v144); + int16x8_t v238 = vaddq_s16(v145, v190); + int16x8_t v239 = vaddq_s16(v237, v238); + int16x8_t v240 = vaddq_s16(v236, v239); + int16x8_t v241 = vaddq_s16(v196, v148); + int16x8_t v242 = vaddq_s16(v149, v198); + int16x8_t v243 = vaddq_s16(v241, v242); + int16x8_t v244_tmp = vqrdmulhq_n_s16(v243, 13573); + int16x8_t v244 = vaddq_s16(v244_tmp, v243); + int16x8_t v245 = vaddq_s16(v204, v152); + int16x8_t v246 = vaddq_s16(v153, v206); + int16x8_t v247 = vaddq_s16(v245, v246); + int16x8_t v248 = vaddq_s16(v247, v243); + int16x8_t v249 = vaddq_s16(v244, v248); + int16x8_t v250 = vqrdmulhq_n_s16(v249, 17734); + int16x8_t v251 = vaddq_s16(v240, v250); + int16x8_t v252 = vaddq_s16(v199, v159); + int16x8_t v253 = vaddq_s16(v160, v183); + int16x8_t v254 = vaddq_s16(v252, v253); + int16x8_t v255_tmp = vqrdmulhq_n_s16(v254, 13573); + int16x8_t v255 = vaddq_s16(v255_tmp, v254); + int16x8_t v256 = vaddq_s16(v207, v163); + int16x8_t v257 = vaddq_s16(v164, v187); + int16x8_t v258 = vaddq_s16(v256, v257); + int16x8_t v259 = vaddq_s16(v191, v166); + int16x8_t v260 = vaddq_s16(v167, v195); + int16x8_t v261 = vaddq_s16(v259, v260); + int16x8_t v262 = vaddq_s16(v258, v261); + int16x8_t v263 = vaddq_s16(v255, v262); + int16x8_t v264 = vaddq_s16(v261, v254); + int16x8_t v265_tmp = vqrdmulhq_n_s16(v264, 13573); + int16x8_t v265 = vaddq_s16(v265_tmp, v264); + int16x8_t v266 = vaddq_s16(v223, v173); + int16x8_t v267 = vaddq_s16(v174, v203); + int16x8_t v268 = vaddq_s16(v266, v267); + int16x8_t v269 = vaddq_s16(v268, v258); + int16x8_t v270 = vaddq_s16(v269, v264); + int16x8_t v271 = vaddq_s16(v265, v270); + int16x8_t v272 = vqrdmulhq_n_s16(v271, 17734); + int16x8_t v273 = vaddq_s16(v263, v272); + int16x8_t v274 = vqrdmulhq_n_s16(v273, 16705); + int16x8_t v275 = vaddq_s16(v251, v274); + int16x8_t v276 = vaddq_s16(v253, v235); + int16x8_t v277_tmp = vqrdmulhq_n_s16(v276, 13573); + int16x8_t v277 = vaddq_s16(v277_tmp, v276); + int16x8_t v278 = vaddq_s16(v257, v237); + int16x8_t v279 = vaddq_s16(v238, v259); + int16x8_t v280 = vaddq_s16(v278, v279); + int16x8_t v281 = vaddq_s16(v277, v280); + int16x8_t v282 = vaddq_s16(v260, v241); + int16x8_t v283 = vaddq_s16(v242, v252); + int16x8_t v284 = vaddq_s16(v282, v283); + int16x8_t v285_tmp = vqrdmulhq_n_s16(v284, 13573); + int16x8_t v285 = vaddq_s16(v285_tmp, v284); + int16x8_t v286 = vaddq_s16(v267, v245); + int16x8_t v287 = vaddq_s16(v246, v256); + int16x8_t v288 = vaddq_s16(v286, v287); + int16x8_t v289 = vaddq_s16(v288, v284); + int16x8_t v290 = vaddq_s16(v285, v289); + int16x8_t v291 = vqrdmulhq_n_s16(v290, 17734); + int16x8_t v292 = vaddq_s16(v281, v291); + int16x8_t v293 = vaddq_s16(v283, v276); + int16x8_t v294_tmp = vqrdmulhq_n_s16(v293, 13573); + int16x8_t v294 = vaddq_s16(v294_tmp, v293); + int16x8_t v295 = vaddq_s16(v287, v278); + int16x8_t v296 = vaddq_s16(v279, v282); + int16x8_t v297 = vaddq_s16(v295, v296); + int16x8_t v298 = vaddq_s16(v294, v297); + int16x8_t v299 = vaddq_s16(v296, v293); + int16x8_t v300_tmp = vqrdmulhq_n_s16(v299, 13573); + int16x8_t v300 = vaddq_s16(v300_tmp, v299); + int16x8_t v301 = vld1q_s16(in + in_stride * 126 + i); + int16x8_t v302 = vaddq_s16(v301, v222); + int16x8_t v303 = vaddq_s16(v302, v266); + int16x8_t v304 = vaddq_s16(v303, v286); + int16x8_t v305 = vaddq_s16(v304, v295); + int16x8_t v306 = vaddq_s16(v305, v299); + int16x8_t v307 = vaddq_s16(v300, v306); + int16x8_t v308 = vqrdmulhq_n_s16(v307, 17734); + int16x8_t v309 = vaddq_s16(v298, v308); + int16x8_t v310 = vqrdmulhq_n_s16(v309, 16705); + int16x8_t v311 = vaddq_s16(v292, v310); + int16x8_t v312 = vqrdmulhq_n_s16(v311, 16463); + int16x8_t v313 = vaddq_s16(v275, v312); + int16x8_t v314 = vqrdmulhq_n_s16(v313, 16404); + int16x8_t v315 = vaddq_s16(v234, v314); + int16x8_t v316 = vqrdmulhq_n_s16(v315, 16389); + int16x8_t v317 = vaddq_s16(v141, v316); + int16x8_t v318 = vld1q_s16(in + in_stride * 1 + i); + int16x8_t v319_tmp = vqrdmulhq_n_s16(v318, 13573); + int16x8_t v319 = vaddq_s16(v319_tmp, v318); + int16x8_t v320 = vld1q_s16(in + in_stride * 65 + i); + int16x8_t v321 = vld1q_s16(in + in_stride * 63 + i); + int16x8_t v322 = vaddq_s16(v320, v321); + int16x8_t v323 = vaddq_s16(v319, v322); + int16x8_t v324 = vld1q_s16(in + in_stride * 33 + i); + int16x8_t v325 = vld1q_s16(in + in_stride * 31 + i); + int16x8_t v326 = vaddq_s16(v324, v325); + int16x8_t v327_tmp = vqrdmulhq_n_s16(v326, 13573); + int16x8_t v327 = vaddq_s16(v327_tmp, v326); + int16x8_t v328 = vld1q_s16(in + in_stride * 97 + i); + int16x8_t v329 = vld1q_s16(in + in_stride * 95 + i); + int16x8_t v330 = vaddq_s16(v328, v329); + int16x8_t v331 = vaddq_s16(v330, v326); + int16x8_t v332 = vaddq_s16(v327, v331); + int16x8_t v333 = vqrdmulhq_n_s16(v332, 17734); + int16x8_t v334 = vaddq_s16(v323, v333); + int16x8_t v335 = vld1q_s16(in + in_stride * 17 + i); + int16x8_t v336 = vld1q_s16(in + in_stride * 15 + i); + int16x8_t v337 = vaddq_s16(v335, v336); + int16x8_t v338_tmp = vqrdmulhq_n_s16(v337, 13573); + int16x8_t v338 = vaddq_s16(v338_tmp, v337); + int16x8_t v339 = vld1q_s16(in + in_stride * 81 + i); + int16x8_t v340 = vld1q_s16(in + in_stride * 79 + i); + int16x8_t v341 = vaddq_s16(v339, v340); + int16x8_t v342 = vld1q_s16(in + in_stride * 49 + i); + int16x8_t v343 = vld1q_s16(in + in_stride * 47 + i); + int16x8_t v344 = vaddq_s16(v342, v343); + int16x8_t v345 = vaddq_s16(v341, v344); + int16x8_t v346 = vaddq_s16(v338, v345); + int16x8_t v347 = vaddq_s16(v344, v337); + int16x8_t v348_tmp = vqrdmulhq_n_s16(v347, 13573); + int16x8_t v348 = vaddq_s16(v348_tmp, v347); + int16x8_t v349 = vld1q_s16(in + in_stride * 113 + i); + int16x8_t v350 = vld1q_s16(in + in_stride * 111 + i); + int16x8_t v351 = vaddq_s16(v349, v350); + int16x8_t v352 = vaddq_s16(v351, v341); + int16x8_t v353 = vaddq_s16(v352, v347); + int16x8_t v354 = vaddq_s16(v348, v353); + int16x8_t v355 = vqrdmulhq_n_s16(v354, 17734); + int16x8_t v356 = vaddq_s16(v346, v355); + int16x8_t v357 = vqrdmulhq_n_s16(v356, 16705); + int16x8_t v358 = vaddq_s16(v334, v357); + int16x8_t v359 = vld1q_s16(in + in_stride * 9 + i); + int16x8_t v360 = vld1q_s16(in + in_stride * 7 + i); + int16x8_t v361 = vaddq_s16(v359, v360); + int16x8_t v362_tmp = vqrdmulhq_n_s16(v361, 13573); + int16x8_t v362 = vaddq_s16(v362_tmp, v361); + int16x8_t v363 = vld1q_s16(in + in_stride * 73 + i); + int16x8_t v364 = vld1q_s16(in + in_stride * 71 + i); + int16x8_t v365 = vaddq_s16(v363, v364); + int16x8_t v366 = vld1q_s16(in + in_stride * 57 + i); + int16x8_t v367 = vld1q_s16(in + in_stride * 55 + i); + int16x8_t v368 = vaddq_s16(v366, v367); + int16x8_t v369 = vaddq_s16(v365, v368); + int16x8_t v370 = vaddq_s16(v362, v369); + int16x8_t v371 = vld1q_s16(in + in_stride * 41 + i); + int16x8_t v372 = vld1q_s16(in + in_stride * 39 + i); + int16x8_t v373 = vaddq_s16(v371, v372); + int16x8_t v374 = vld1q_s16(in + in_stride * 25 + i); + int16x8_t v375 = vld1q_s16(in + in_stride * 23 + i); + int16x8_t v376 = vaddq_s16(v374, v375); + int16x8_t v377 = vaddq_s16(v373, v376); + int16x8_t v378_tmp = vqrdmulhq_n_s16(v377, 13573); + int16x8_t v378 = vaddq_s16(v378_tmp, v377); + int16x8_t v379 = vld1q_s16(in + in_stride * 105 + i); + int16x8_t v380 = vld1q_s16(in + in_stride * 103 + i); + int16x8_t v381 = vaddq_s16(v379, v380); + int16x8_t v382 = vld1q_s16(in + in_stride * 89 + i); + int16x8_t v383 = vld1q_s16(in + in_stride * 87 + i); + int16x8_t v384 = vaddq_s16(v382, v383); + int16x8_t v385 = vaddq_s16(v381, v384); + int16x8_t v386 = vaddq_s16(v385, v377); + int16x8_t v387 = vaddq_s16(v378, v386); + int16x8_t v388 = vqrdmulhq_n_s16(v387, 17734); + int16x8_t v389 = vaddq_s16(v370, v388); + int16x8_t v390 = vaddq_s16(v376, v361); + int16x8_t v391_tmp = vqrdmulhq_n_s16(v390, 13573); + int16x8_t v391 = vaddq_s16(v391_tmp, v390); + int16x8_t v392 = vaddq_s16(v384, v365); + int16x8_t v393 = vaddq_s16(v368, v373); + int16x8_t v394 = vaddq_s16(v392, v393); + int16x8_t v395 = vaddq_s16(v391, v394); + int16x8_t v396 = vaddq_s16(v393, v390); + int16x8_t v397_tmp = vqrdmulhq_n_s16(v396, 13573); + int16x8_t v397 = vaddq_s16(v397_tmp, v396); + int16x8_t v398 = vld1q_s16(in + in_stride * 121 + i); + int16x8_t v399 = vld1q_s16(in + in_stride * 119 + i); + int16x8_t v400 = vaddq_s16(v398, v399); + int16x8_t v401 = vaddq_s16(v400, v381); + int16x8_t v402 = vaddq_s16(v401, v392); + int16x8_t v403 = vaddq_s16(v402, v396); + int16x8_t v404 = vaddq_s16(v397, v403); + int16x8_t v405 = vqrdmulhq_n_s16(v404, 17734); + int16x8_t v406 = vaddq_s16(v395, v405); + int16x8_t v407 = vqrdmulhq_n_s16(v406, 16705); + int16x8_t v408 = vaddq_s16(v389, v407); + int16x8_t v409 = vqrdmulhq_n_s16(v408, 16463); + int16x8_t v410 = vaddq_s16(v358, v409); + int16x8_t v411 = vld1q_s16(in + in_stride * 5 + i); + int16x8_t v412 = vld1q_s16(in + in_stride * 3 + i); + int16x8_t v413 = vaddq_s16(v411, v412); + int16x8_t v414_tmp = vqrdmulhq_n_s16(v413, 13573); + int16x8_t v414 = vaddq_s16(v414_tmp, v413); + int16x8_t v415 = vld1q_s16(in + in_stride * 69 + i); + int16x8_t v416 = vld1q_s16(in + in_stride * 67 + i); + int16x8_t v417 = vaddq_s16(v415, v416); + int16x8_t v418 = vld1q_s16(in + in_stride * 61 + i); + int16x8_t v419 = vld1q_s16(in + in_stride * 59 + i); + int16x8_t v420 = vaddq_s16(v418, v419); + int16x8_t v421 = vaddq_s16(v417, v420); + int16x8_t v422 = vaddq_s16(v414, v421); + int16x8_t v423 = vld1q_s16(in + in_stride * 37 + i); + int16x8_t v424 = vld1q_s16(in + in_stride * 35 + i); + int16x8_t v425 = vaddq_s16(v423, v424); + int16x8_t v426 = vld1q_s16(in + in_stride * 29 + i); + int16x8_t v427 = vld1q_s16(in + in_stride * 27 + i); + int16x8_t v428 = vaddq_s16(v426, v427); + int16x8_t v429 = vaddq_s16(v425, v428); + int16x8_t v430_tmp = vqrdmulhq_n_s16(v429, 13573); + int16x8_t v430 = vaddq_s16(v430_tmp, v429); + int16x8_t v431 = vld1q_s16(in + in_stride * 101 + i); + int16x8_t v432 = vld1q_s16(in + in_stride * 99 + i); + int16x8_t v433 = vaddq_s16(v431, v432); + int16x8_t v434 = vld1q_s16(in + in_stride * 93 + i); + int16x8_t v435 = vld1q_s16(in + in_stride * 91 + i); + int16x8_t v436 = vaddq_s16(v434, v435); + int16x8_t v437 = vaddq_s16(v433, v436); + int16x8_t v438 = vaddq_s16(v437, v429); + int16x8_t v439 = vaddq_s16(v430, v438); + int16x8_t v440 = vqrdmulhq_n_s16(v439, 17734); + int16x8_t v441 = vaddq_s16(v422, v440); + int16x8_t v442 = vld1q_s16(in + in_stride * 21 + i); + int16x8_t v443 = vld1q_s16(in + in_stride * 19 + i); + int16x8_t v444 = vaddq_s16(v442, v443); + int16x8_t v445 = vld1q_s16(in + in_stride * 13 + i); + int16x8_t v446 = vld1q_s16(in + in_stride * 11 + i); + int16x8_t v447 = vaddq_s16(v445, v446); + int16x8_t v448 = vaddq_s16(v444, v447); + int16x8_t v449_tmp = vqrdmulhq_n_s16(v448, 13573); + int16x8_t v449 = vaddq_s16(v449_tmp, v448); + int16x8_t v450 = vld1q_s16(in + in_stride * 85 + i); + int16x8_t v451 = vld1q_s16(in + in_stride * 83 + i); + int16x8_t v452 = vaddq_s16(v450, v451); + int16x8_t v453 = vld1q_s16(in + in_stride * 77 + i); + int16x8_t v454 = vld1q_s16(in + in_stride * 75 + i); + int16x8_t v455 = vaddq_s16(v453, v454); + int16x8_t v456 = vaddq_s16(v452, v455); + int16x8_t v457 = vld1q_s16(in + in_stride * 53 + i); + int16x8_t v458 = vld1q_s16(in + in_stride * 51 + i); + int16x8_t v459 = vaddq_s16(v457, v458); + int16x8_t v460 = vld1q_s16(in + in_stride * 45 + i); + int16x8_t v461 = vld1q_s16(in + in_stride * 43 + i); + int16x8_t v462 = vaddq_s16(v460, v461); + int16x8_t v463 = vaddq_s16(v459, v462); + int16x8_t v464 = vaddq_s16(v456, v463); + int16x8_t v465 = vaddq_s16(v449, v464); + int16x8_t v466 = vaddq_s16(v463, v448); + int16x8_t v467_tmp = vqrdmulhq_n_s16(v466, 13573); + int16x8_t v467 = vaddq_s16(v467_tmp, v466); + int16x8_t v468 = vld1q_s16(in + in_stride * 117 + i); + int16x8_t v469 = vld1q_s16(in + in_stride * 115 + i); + int16x8_t v470 = vaddq_s16(v468, v469); + int16x8_t v471 = vld1q_s16(in + in_stride * 109 + i); + int16x8_t v472 = vld1q_s16(in + in_stride * 107 + i); + int16x8_t v473 = vaddq_s16(v471, v472); + int16x8_t v474 = vaddq_s16(v470, v473); + int16x8_t v475 = vaddq_s16(v474, v456); + int16x8_t v476 = vaddq_s16(v475, v466); + int16x8_t v477 = vaddq_s16(v467, v476); + int16x8_t v478 = vqrdmulhq_n_s16(v477, 17734); + int16x8_t v479 = vaddq_s16(v465, v478); + int16x8_t v480 = vqrdmulhq_n_s16(v479, 16705); + int16x8_t v481 = vaddq_s16(v441, v480); + int16x8_t v482 = vaddq_s16(v447, v413); + int16x8_t v483_tmp = vqrdmulhq_n_s16(v482, 13573); + int16x8_t v483 = vaddq_s16(v483_tmp, v482); + int16x8_t v484 = vaddq_s16(v455, v417); + int16x8_t v485 = vaddq_s16(v420, v459); + int16x8_t v486 = vaddq_s16(v484, v485); + int16x8_t v487 = vaddq_s16(v483, v486); + int16x8_t v488 = vaddq_s16(v462, v425); + int16x8_t v489 = vaddq_s16(v428, v444); + int16x8_t v490 = vaddq_s16(v488, v489); + int16x8_t v491_tmp = vqrdmulhq_n_s16(v490, 13573); + int16x8_t v491 = vaddq_s16(v491_tmp, v490); + int16x8_t v492 = vaddq_s16(v473, v433); + int16x8_t v493 = vaddq_s16(v436, v452); + int16x8_t v494 = vaddq_s16(v492, v493); + int16x8_t v495 = vaddq_s16(v494, v490); + int16x8_t v496 = vaddq_s16(v491, v495); + int16x8_t v497 = vqrdmulhq_n_s16(v496, 17734); + int16x8_t v498 = vaddq_s16(v487, v497); + int16x8_t v499 = vaddq_s16(v489, v482); + int16x8_t v500_tmp = vqrdmulhq_n_s16(v499, 13573); + int16x8_t v500 = vaddq_s16(v500_tmp, v499); + int16x8_t v501 = vaddq_s16(v493, v484); + int16x8_t v502 = vaddq_s16(v485, v488); + int16x8_t v503 = vaddq_s16(v501, v502); + int16x8_t v504 = vaddq_s16(v500, v503); + int16x8_t v505 = vaddq_s16(v502, v499); + int16x8_t v506_tmp = vqrdmulhq_n_s16(v505, 13573); + int16x8_t v506 = vaddq_s16(v506_tmp, v505); + int16x8_t v507 = vld1q_s16(in + in_stride * 125 + i); + int16x8_t v508 = vld1q_s16(in + in_stride * 123 + i); + int16x8_t v509 = vaddq_s16(v507, v508); + int16x8_t v510 = vaddq_s16(v509, v470); + int16x8_t v511 = vaddq_s16(v510, v492); + int16x8_t v512 = vaddq_s16(v511, v501); + int16x8_t v513 = vaddq_s16(v512, v505); + int16x8_t v514 = vaddq_s16(v506, v513); + int16x8_t v515 = vqrdmulhq_n_s16(v514, 17734); + int16x8_t v516 = vaddq_s16(v504, v515); + int16x8_t v517 = vqrdmulhq_n_s16(v516, 16705); + int16x8_t v518 = vaddq_s16(v498, v517); + int16x8_t v519 = vqrdmulhq_n_s16(v518, 16463); + int16x8_t v520 = vaddq_s16(v481, v519); + int16x8_t v521 = vqrdmulhq_n_s16(v520, 16404); + int16x8_t v522 = vaddq_s16(v410, v521); + int16x8_t v523 = vaddq_s16(v412, v318); + int16x8_t v524_tmp = vqrdmulhq_n_s16(v523, 13573); + int16x8_t v524 = vaddq_s16(v524_tmp, v523); + int16x8_t v525 = vaddq_s16(v416, v320); + int16x8_t v526 = vaddq_s16(v321, v418); + int16x8_t v527 = vaddq_s16(v525, v526); + int16x8_t v528 = vaddq_s16(v524, v527); + int16x8_t v529 = vaddq_s16(v424, v324); + int16x8_t v530 = vaddq_s16(v325, v426); + int16x8_t v531 = vaddq_s16(v529, v530); + int16x8_t v532_tmp = vqrdmulhq_n_s16(v531, 13573); + int16x8_t v532 = vaddq_s16(v532_tmp, v531); + int16x8_t v533 = vaddq_s16(v432, v328); + int16x8_t v534 = vaddq_s16(v329, v434); + int16x8_t v535 = vaddq_s16(v533, v534); + int16x8_t v536 = vaddq_s16(v535, v531); + int16x8_t v537 = vaddq_s16(v532, v536); + int16x8_t v538 = vqrdmulhq_n_s16(v537, 17734); + int16x8_t v539 = vaddq_s16(v528, v538); + int16x8_t v540 = vaddq_s16(v443, v335); + int16x8_t v541 = vaddq_s16(v336, v445); + int16x8_t v542 = vaddq_s16(v540, v541); + int16x8_t v543_tmp = vqrdmulhq_n_s16(v542, 13573); + int16x8_t v543 = vaddq_s16(v543_tmp, v542); + int16x8_t v544 = vaddq_s16(v451, v339); + int16x8_t v545 = vaddq_s16(v340, v453); + int16x8_t v546 = vaddq_s16(v544, v545); + int16x8_t v547 = vaddq_s16(v458, v342); + int16x8_t v548 = vaddq_s16(v343, v460); + int16x8_t v549 = vaddq_s16(v547, v548); + int16x8_t v550 = vaddq_s16(v546, v549); + int16x8_t v551 = vaddq_s16(v543, v550); + int16x8_t v552 = vaddq_s16(v549, v542); + int16x8_t v553_tmp = vqrdmulhq_n_s16(v552, 13573); + int16x8_t v553 = vaddq_s16(v553_tmp, v552); + int16x8_t v554 = vaddq_s16(v469, v349); + int16x8_t v555 = vaddq_s16(v350, v471); + int16x8_t v556 = vaddq_s16(v554, v555); + int16x8_t v557 = vaddq_s16(v556, v546); + int16x8_t v558 = vaddq_s16(v557, v552); + int16x8_t v559 = vaddq_s16(v553, v558); + int16x8_t v560 = vqrdmulhq_n_s16(v559, 17734); + int16x8_t v561 = vaddq_s16(v551, v560); + int16x8_t v562 = vqrdmulhq_n_s16(v561, 16705); + int16x8_t v563 = vaddq_s16(v539, v562); + int16x8_t v564 = vaddq_s16(v446, v359); + int16x8_t v565 = vaddq_s16(v360, v411); + int16x8_t v566 = vaddq_s16(v564, v565); + int16x8_t v567_tmp = vqrdmulhq_n_s16(v566, 13573); + int16x8_t v567 = vaddq_s16(v567_tmp, v566); + int16x8_t v568 = vaddq_s16(v454, v363); + int16x8_t v569 = vaddq_s16(v364, v415); + int16x8_t v570 = vaddq_s16(v568, v569); + int16x8_t v571 = vaddq_s16(v419, v366); + int16x8_t v572 = vaddq_s16(v367, v457); + int16x8_t v573 = vaddq_s16(v571, v572); + int16x8_t v574 = vaddq_s16(v570, v573); + int16x8_t v575 = vaddq_s16(v567, v574); + int16x8_t v576 = vaddq_s16(v461, v371); + int16x8_t v577 = vaddq_s16(v372, v423); + int16x8_t v578 = vaddq_s16(v576, v577); + int16x8_t v579 = vaddq_s16(v427, v374); + int16x8_t v580 = vaddq_s16(v375, v442); + int16x8_t v581 = vaddq_s16(v579, v580); + int16x8_t v582 = vaddq_s16(v578, v581); + int16x8_t v583_tmp = vqrdmulhq_n_s16(v582, 13573); + int16x8_t v583 = vaddq_s16(v583_tmp, v582); + int16x8_t v584 = vaddq_s16(v472, v379); + int16x8_t v585 = vaddq_s16(v380, v431); + int16x8_t v586 = vaddq_s16(v584, v585); + int16x8_t v587 = vaddq_s16(v435, v382); + int16x8_t v588 = vaddq_s16(v383, v450); + int16x8_t v589 = vaddq_s16(v587, v588); + int16x8_t v590 = vaddq_s16(v586, v589); + int16x8_t v591 = vaddq_s16(v590, v582); + int16x8_t v592 = vaddq_s16(v583, v591); + int16x8_t v593 = vqrdmulhq_n_s16(v592, 17734); + int16x8_t v594 = vaddq_s16(v575, v593); + int16x8_t v595 = vaddq_s16(v581, v566); + int16x8_t v596_tmp = vqrdmulhq_n_s16(v595, 13573); + int16x8_t v596 = vaddq_s16(v596_tmp, v595); + int16x8_t v597 = vaddq_s16(v589, v570); + int16x8_t v598 = vaddq_s16(v573, v578); + int16x8_t v599 = vaddq_s16(v597, v598); + int16x8_t v600 = vaddq_s16(v596, v599); + int16x8_t v601 = vaddq_s16(v598, v595); + int16x8_t v602_tmp = vqrdmulhq_n_s16(v601, 13573); + int16x8_t v602 = vaddq_s16(v602_tmp, v601); + int16x8_t v603 = vaddq_s16(v508, v398); + int16x8_t v604 = vaddq_s16(v399, v468); + int16x8_t v605 = vaddq_s16(v603, v604); + int16x8_t v606 = vaddq_s16(v605, v586); + int16x8_t v607 = vaddq_s16(v606, v597); + int16x8_t v608 = vaddq_s16(v607, v601); + int16x8_t v609 = vaddq_s16(v602, v608); + int16x8_t v610 = vqrdmulhq_n_s16(v609, 17734); + int16x8_t v611 = vaddq_s16(v600, v610); + int16x8_t v612 = vqrdmulhq_n_s16(v611, 16705); + int16x8_t v613 = vaddq_s16(v594, v612); + int16x8_t v614 = vqrdmulhq_n_s16(v613, 16463); + int16x8_t v615 = vaddq_s16(v563, v614); + int16x8_t v616 = vaddq_s16(v565, v523); + int16x8_t v617_tmp = vqrdmulhq_n_s16(v616, 13573); + int16x8_t v617 = vaddq_s16(v617_tmp, v616); + int16x8_t v618 = vaddq_s16(v569, v525); + int16x8_t v619 = vaddq_s16(v526, v571); + int16x8_t v620 = vaddq_s16(v618, v619); + int16x8_t v621 = vaddq_s16(v617, v620); + int16x8_t v622 = vaddq_s16(v577, v529); + int16x8_t v623 = vaddq_s16(v530, v579); + int16x8_t v624 = vaddq_s16(v622, v623); + int16x8_t v625_tmp = vqrdmulhq_n_s16(v624, 13573); + int16x8_t v625 = vaddq_s16(v625_tmp, v624); + int16x8_t v626 = vaddq_s16(v585, v533); + int16x8_t v627 = vaddq_s16(v534, v587); + int16x8_t v628 = vaddq_s16(v626, v627); + int16x8_t v629 = vaddq_s16(v628, v624); + int16x8_t v630 = vaddq_s16(v625, v629); + int16x8_t v631 = vqrdmulhq_n_s16(v630, 17734); + int16x8_t v632 = vaddq_s16(v621, v631); + int16x8_t v633 = vaddq_s16(v580, v540); + int16x8_t v634 = vaddq_s16(v541, v564); + int16x8_t v635 = vaddq_s16(v633, v634); + int16x8_t v636_tmp = vqrdmulhq_n_s16(v635, 13573); + int16x8_t v636 = vaddq_s16(v636_tmp, v635); + int16x8_t v637 = vaddq_s16(v588, v544); + int16x8_t v638 = vaddq_s16(v545, v568); + int16x8_t v639 = vaddq_s16(v637, v638); + int16x8_t v640 = vaddq_s16(v572, v547); + int16x8_t v641 = vaddq_s16(v548, v576); + int16x8_t v642 = vaddq_s16(v640, v641); + int16x8_t v643 = vaddq_s16(v639, v642); + int16x8_t v644 = vaddq_s16(v636, v643); + int16x8_t v645 = vaddq_s16(v642, v635); + int16x8_t v646_tmp = vqrdmulhq_n_s16(v645, 13573); + int16x8_t v646 = vaddq_s16(v646_tmp, v645); + int16x8_t v647 = vaddq_s16(v604, v554); + int16x8_t v648 = vaddq_s16(v555, v584); + int16x8_t v649 = vaddq_s16(v647, v648); + int16x8_t v650 = vaddq_s16(v649, v639); + int16x8_t v651 = vaddq_s16(v650, v645); + int16x8_t v652 = vaddq_s16(v646, v651); + int16x8_t v653 = vqrdmulhq_n_s16(v652, 17734); + int16x8_t v654 = vaddq_s16(v644, v653); + int16x8_t v655 = vqrdmulhq_n_s16(v654, 16705); + int16x8_t v656 = vaddq_s16(v632, v655); + int16x8_t v657 = vaddq_s16(v634, v616); + int16x8_t v658_tmp = vqrdmulhq_n_s16(v657, 13573); + int16x8_t v658 = vaddq_s16(v658_tmp, v657); + int16x8_t v659 = vaddq_s16(v638, v618); + int16x8_t v660 = vaddq_s16(v619, v640); + int16x8_t v661 = vaddq_s16(v659, v660); + int16x8_t v662 = vaddq_s16(v658, v661); + int16x8_t v663 = vaddq_s16(v641, v622); + int16x8_t v664 = vaddq_s16(v623, v633); + int16x8_t v665 = vaddq_s16(v663, v664); + int16x8_t v666_tmp = vqrdmulhq_n_s16(v665, 13573); + int16x8_t v666 = vaddq_s16(v666_tmp, v665); + int16x8_t v667 = vaddq_s16(v648, v626); + int16x8_t v668 = vaddq_s16(v627, v637); + int16x8_t v669 = vaddq_s16(v667, v668); + int16x8_t v670 = vaddq_s16(v669, v665); + int16x8_t v671 = vaddq_s16(v666, v670); + int16x8_t v672 = vqrdmulhq_n_s16(v671, 17734); + int16x8_t v673 = vaddq_s16(v662, v672); + int16x8_t v674 = vaddq_s16(v664, v657); + int16x8_t v675_tmp = vqrdmulhq_n_s16(v674, 13573); + int16x8_t v675 = vaddq_s16(v675_tmp, v674); + int16x8_t v676 = vaddq_s16(v668, v659); + int16x8_t v677 = vaddq_s16(v660, v663); + int16x8_t v678 = vaddq_s16(v676, v677); + int16x8_t v679 = vaddq_s16(v675, v678); + int16x8_t v680 = vaddq_s16(v677, v674); + int16x8_t v681_tmp = vqrdmulhq_n_s16(v680, 13573); + int16x8_t v681 = vaddq_s16(v681_tmp, v680); + int16x8_t v682 = vld1q_s16(in + in_stride * 127 + i); + int16x8_t v683 = vaddq_s16(v682, v507); + int16x8_t v684 = vaddq_s16(v683, v603); + int16x8_t v685 = vaddq_s16(v684, v647); + int16x8_t v686 = vaddq_s16(v685, v667); + int16x8_t v687 = vaddq_s16(v686, v676); + int16x8_t v688 = vaddq_s16(v687, v680); + int16x8_t v689 = vaddq_s16(v681, v688); + int16x8_t v690 = vqrdmulhq_n_s16(v689, 17734); + int16x8_t v691 = vaddq_s16(v679, v690); + int16x8_t v692 = vqrdmulhq_n_s16(v691, 16705); + int16x8_t v693 = vaddq_s16(v673, v692); + int16x8_t v694 = vqrdmulhq_n_s16(v693, 16463); + int16x8_t v695 = vaddq_s16(v656, v694); + int16x8_t v696 = vqrdmulhq_n_s16(v695, 16404); + int16x8_t v697 = vaddq_s16(v615, v696); + int16x8_t v698 = vqrdmulhq_n_s16(v697, 16389); + int16x8_t v699 = vaddq_s16(v522, v698); + int16x8_t v700 = vqrdmulhq_n_s16(v699, 16385); + int16x8_t v701 = vaddq_s16(v317, v700); + int16x8_t v702 = vsubq_s16(v0, v1); + int16x8_t v703 = vsubq_s16(v4, v6); + int16x8_t v704_tmp = vqrdmulhq_n_s16(v703, 10045); + int16x8_t v704 = vaddq_s16(v704_tmp, v703); + int16x8_t v705 = vaddq_s16(v702, v704); + int16x8_t v706 = vsubq_s16(v11, v14); + int16x8_t v707 = vsubq_s16(v17, v20); + int16x8_t v708_tmp = vqrdmulhq_n_s16(v707, 10045); + int16x8_t v708 = vaddq_s16(v708_tmp, v707); + int16x8_t v709 = vaddq_s16(v706, v708); + int16x8_t v710 = vqrdmulhq_n_s16(v709, 19705); + int16x8_t v711 = vaddq_s16(v705, v710); + int16x8_t v712 = vsubq_s16(v27, v30); + int16x8_t v713 = vsubq_s16(v35, v39); + int16x8_t v714_tmp = vqrdmulhq_n_s16(v713, 10045); + int16x8_t v714 = vaddq_s16(v714_tmp, v713); + int16x8_t v715 = vaddq_s16(v712, v714); + int16x8_t v716 = vsubq_s16(v44, v47); + int16x8_t v717 = vsubq_s16(v50, v54); + int16x8_t v718_tmp = vqrdmulhq_n_s16(v717, 10045); + int16x8_t v718 = vaddq_s16(v718_tmp, v717); + int16x8_t v719 = vaddq_s16(v716, v718); + int16x8_t v720 = vqrdmulhq_n_s16(v719, 19705); + int16x8_t v721 = vaddq_s16(v715, v720); + int16x8_t v722 = vqrdmulhq_n_s16(v721, 17121); + int16x8_t v723 = vaddq_s16(v711, v722); + int16x8_t v724 = vsubq_s16(v63, v66); + int16x8_t v725 = vsubq_s16(v71, v75); + int16x8_t v726_tmp = vqrdmulhq_n_s16(v725, 10045); + int16x8_t v726 = vaddq_s16(v726_tmp, v725); + int16x8_t v727 = vaddq_s16(v724, v726); + int16x8_t v728 = vsubq_s16(v82, v89); + int16x8_t v729 = vsubq_s16(v92, v97); + int16x8_t v730_tmp = vqrdmulhq_n_s16(v729, 10045); + int16x8_t v730 = vaddq_s16(v730_tmp, v729); + int16x8_t v731 = vaddq_s16(v728, v730); + int16x8_t v732 = vqrdmulhq_n_s16(v731, 19705); + int16x8_t v733 = vaddq_s16(v727, v732); + int16x8_t v734 = vsubq_s16(v104, v107); + int16x8_t v735 = vsubq_s16(v112, v116); + int16x8_t v736_tmp = vqrdmulhq_n_s16(v735, 10045); + int16x8_t v736 = vaddq_s16(v736_tmp, v735); + int16x8_t v737 = vaddq_s16(v734, v736); + int16x8_t v738 = vsubq_s16(v121, v124); + int16x8_t v739 = vsubq_s16(v127, v132); + int16x8_t v740_tmp = vqrdmulhq_n_s16(v739, 10045); + int16x8_t v740 = vaddq_s16(v740_tmp, v739); + int16x8_t v741 = vaddq_s16(v738, v740); + int16x8_t v742 = vqrdmulhq_n_s16(v741, 19705); + int16x8_t v743 = vaddq_s16(v737, v742); + int16x8_t v744 = vqrdmulhq_n_s16(v743, 17121); + int16x8_t v745 = vaddq_s16(v733, v744); + int16x8_t v746 = vqrdmulhq_n_s16(v745, 16563); + int16x8_t v747 = vaddq_s16(v723, v746); + int16x8_t v748 = vsubq_s16(v143, v146); + int16x8_t v749 = vsubq_s16(v151, v155); + int16x8_t v750_tmp = vqrdmulhq_n_s16(v749, 10045); + int16x8_t v750 = vaddq_s16(v750_tmp, v749); + int16x8_t v751 = vaddq_s16(v748, v750); + int16x8_t v752 = vsubq_s16(v162, v169); + int16x8_t v753 = vqrdmulhq_n_s16(v752, 19705); + int16x8_t v754 = vsubq_s16(v172, v177); + int16x8_t v755 = vqrdmulhq_n_s16(v754, 25746); + int16x8_t v756 = vaddq_s16(v753, v755); + int16x8_t v757 = vaddq_s16(v751, v756); + int16x8_t v758 = vsubq_s16(v186, v193); + int16x8_t v759 = vsubq_s16(v202, v210); + int16x8_t v760_tmp = vqrdmulhq_n_s16(v759, 10045); + int16x8_t v760 = vaddq_s16(v760_tmp, v759); + int16x8_t v761 = vaddq_s16(v758, v760); + int16x8_t v762 = vsubq_s16(v215, v218); + int16x8_t v763 = vsubq_s16(v221, v227); + int16x8_t v764_tmp = vqrdmulhq_n_s16(v763, 10045); + int16x8_t v764 = vaddq_s16(v764_tmp, v763); + int16x8_t v765 = vaddq_s16(v762, v764); + int16x8_t v766 = vqrdmulhq_n_s16(v765, 19705); + int16x8_t v767 = vaddq_s16(v761, v766); + int16x8_t v768 = vqrdmulhq_n_s16(v767, 17121); + int16x8_t v769 = vaddq_s16(v757, v768); + int16x8_t v770 = vsubq_s16(v236, v239); + int16x8_t v771 = vsubq_s16(v244, v248); + int16x8_t v772_tmp = vqrdmulhq_n_s16(v771, 10045); + int16x8_t v772 = vaddq_s16(v772_tmp, v771); + int16x8_t v773 = vaddq_s16(v770, v772); + int16x8_t v774 = vsubq_s16(v255, v262); + int16x8_t v775 = vsubq_s16(v265, v270); + int16x8_t v776_tmp = vqrdmulhq_n_s16(v775, 10045); + int16x8_t v776 = vaddq_s16(v776_tmp, v775); + int16x8_t v777 = vaddq_s16(v774, v776); + int16x8_t v778 = vqrdmulhq_n_s16(v777, 19705); + int16x8_t v779 = vaddq_s16(v773, v778); + int16x8_t v780 = vsubq_s16(v277, v280); + int16x8_t v781 = vsubq_s16(v285, v289); + int16x8_t v782_tmp = vqrdmulhq_n_s16(v781, 10045); + int16x8_t v782 = vaddq_s16(v782_tmp, v781); + int16x8_t v783 = vaddq_s16(v780, v782); + int16x8_t v784 = vsubq_s16(v294, v297); + int16x8_t v785 = vsubq_s16(v300, v306); + int16x8_t v786_tmp = vqrdmulhq_n_s16(v785, 10045); + int16x8_t v786 = vaddq_s16(v786_tmp, v785); + int16x8_t v787 = vaddq_s16(v784, v786); + int16x8_t v788 = vqrdmulhq_n_s16(v787, 19705); + int16x8_t v789 = vaddq_s16(v783, v788); + int16x8_t v790 = vqrdmulhq_n_s16(v789, 17121); + int16x8_t v791 = vaddq_s16(v779, v790); + int16x8_t v792 = vqrdmulhq_n_s16(v791, 16563); + int16x8_t v793 = vaddq_s16(v769, v792); + int16x8_t v794 = vqrdmulhq_n_s16(v793, 16429); + int16x8_t v795 = vaddq_s16(v747, v794); + int16x8_t v796 = vsubq_s16(v319, v322); + int16x8_t v797 = vsubq_s16(v327, v331); + int16x8_t v798_tmp = vqrdmulhq_n_s16(v797, 10045); + int16x8_t v798 = vaddq_s16(v798_tmp, v797); + int16x8_t v799 = vaddq_s16(v796, v798); + int16x8_t v800 = vsubq_s16(v338, v345); + int16x8_t v801 = vsubq_s16(v348, v353); + int16x8_t v802_tmp = vqrdmulhq_n_s16(v801, 10045); + int16x8_t v802 = vaddq_s16(v802_tmp, v801); + int16x8_t v803 = vaddq_s16(v800, v802); + int16x8_t v804 = vqrdmulhq_n_s16(v803, 19705); + int16x8_t v805 = vaddq_s16(v799, v804); + int16x8_t v806 = vsubq_s16(v362, v369); + int16x8_t v807 = vsubq_s16(v378, v386); + int16x8_t v808_tmp = vqrdmulhq_n_s16(v807, 10045); + int16x8_t v808 = vaddq_s16(v808_tmp, v807); + int16x8_t v809 = vaddq_s16(v806, v808); + int16x8_t v810 = vsubq_s16(v391, v394); + int16x8_t v811 = vsubq_s16(v397, v403); + int16x8_t v812_tmp = vqrdmulhq_n_s16(v811, 10045); + int16x8_t v812 = vaddq_s16(v812_tmp, v811); + int16x8_t v813 = vaddq_s16(v810, v812); + int16x8_t v814 = vqrdmulhq_n_s16(v813, 19705); + int16x8_t v815 = vaddq_s16(v809, v814); + int16x8_t v816 = vqrdmulhq_n_s16(v815, 17121); + int16x8_t v817 = vaddq_s16(v805, v816); + int16x8_t v818 = vsubq_s16(v414, v421); + int16x8_t v819 = vsubq_s16(v430, v438); + int16x8_t v820_tmp = vqrdmulhq_n_s16(v819, 10045); + int16x8_t v820 = vaddq_s16(v820_tmp, v819); + int16x8_t v821 = vaddq_s16(v818, v820); + int16x8_t v822 = vsubq_s16(v449, v464); + int16x8_t v823 = vsubq_s16(v467, v476); + int16x8_t v824_tmp = vqrdmulhq_n_s16(v823, 10045); + int16x8_t v824 = vaddq_s16(v824_tmp, v823); + int16x8_t v825 = vaddq_s16(v822, v824); + int16x8_t v826 = vqrdmulhq_n_s16(v825, 19705); + int16x8_t v827 = vaddq_s16(v821, v826); + int16x8_t v828 = vsubq_s16(v483, v486); + int16x8_t v829 = vsubq_s16(v491, v495); + int16x8_t v830_tmp = vqrdmulhq_n_s16(v829, 10045); + int16x8_t v830 = vaddq_s16(v830_tmp, v829); + int16x8_t v831 = vaddq_s16(v828, v830); + int16x8_t v832 = vsubq_s16(v500, v503); + int16x8_t v833 = vsubq_s16(v506, v513); + int16x8_t v834_tmp = vqrdmulhq_n_s16(v833, 10045); + int16x8_t v834 = vaddq_s16(v834_tmp, v833); + int16x8_t v835 = vaddq_s16(v832, v834); + int16x8_t v836 = vqrdmulhq_n_s16(v835, 19705); + int16x8_t v837 = vaddq_s16(v831, v836); + int16x8_t v838 = vqrdmulhq_n_s16(v837, 17121); + int16x8_t v839 = vaddq_s16(v827, v838); + int16x8_t v840 = vqrdmulhq_n_s16(v839, 16563); + int16x8_t v841 = vaddq_s16(v817, v840); + int16x8_t v842 = vsubq_s16(v524, v527); + int16x8_t v843 = vsubq_s16(v532, v536); + int16x8_t v844_tmp = vqrdmulhq_n_s16(v843, 10045); + int16x8_t v844 = vaddq_s16(v844_tmp, v843); + int16x8_t v845 = vaddq_s16(v842, v844); + int16x8_t v846 = vsubq_s16(v543, v550); + int16x8_t v847 = vsubq_s16(v553, v558); + int16x8_t v848_tmp = vqrdmulhq_n_s16(v847, 10045); + int16x8_t v848 = vaddq_s16(v848_tmp, v847); + int16x8_t v849 = vaddq_s16(v846, v848); + int16x8_t v850 = vqrdmulhq_n_s16(v849, 19705); + int16x8_t v851 = vaddq_s16(v845, v850); + int16x8_t v852 = vsubq_s16(v567, v574); + int16x8_t v853 = vsubq_s16(v583, v591); + int16x8_t v854_tmp = vqrdmulhq_n_s16(v853, 10045); + int16x8_t v854 = vaddq_s16(v854_tmp, v853); + int16x8_t v855 = vaddq_s16(v852, v854); + int16x8_t v856 = vsubq_s16(v596, v599); + int16x8_t v857 = vsubq_s16(v602, v608); + int16x8_t v858_tmp = vqrdmulhq_n_s16(v857, 10045); + int16x8_t v858 = vaddq_s16(v858_tmp, v857); + int16x8_t v859 = vaddq_s16(v856, v858); + int16x8_t v860 = vqrdmulhq_n_s16(v859, 19705); + int16x8_t v861 = vaddq_s16(v855, v860); + int16x8_t v862 = vqrdmulhq_n_s16(v861, 17121); + int16x8_t v863 = vaddq_s16(v851, v862); + int16x8_t v864 = vsubq_s16(v617, v620); + int16x8_t v865 = vsubq_s16(v625, v629); + int16x8_t v866_tmp = vqrdmulhq_n_s16(v865, 10045); + int16x8_t v866 = vaddq_s16(v866_tmp, v865); + int16x8_t v867 = vaddq_s16(v864, v866); + int16x8_t v868 = vsubq_s16(v636, v643); + int16x8_t v869 = vsubq_s16(v646, v651); + int16x8_t v870_tmp = vqrdmulhq_n_s16(v869, 10045); + int16x8_t v870 = vaddq_s16(v870_tmp, v869); + int16x8_t v871 = vaddq_s16(v868, v870); + int16x8_t v872 = vqrdmulhq_n_s16(v871, 19705); + int16x8_t v873 = vaddq_s16(v867, v872); + int16x8_t v874 = vsubq_s16(v658, v661); + int16x8_t v875 = vsubq_s16(v666, v670); + int16x8_t v876_tmp = vqrdmulhq_n_s16(v875, 10045); + int16x8_t v876 = vaddq_s16(v876_tmp, v875); + int16x8_t v877 = vaddq_s16(v874, v876); + int16x8_t v878 = vsubq_s16(v675, v678); + int16x8_t v879 = vsubq_s16(v681, v688); + int16x8_t v880_tmp = vqrdmulhq_n_s16(v879, 10045); + int16x8_t v880 = vaddq_s16(v880_tmp, v879); + int16x8_t v881 = vaddq_s16(v878, v880); + int16x8_t v882 = vqrdmulhq_n_s16(v881, 19705); + int16x8_t v883 = vaddq_s16(v877, v882); + int16x8_t v884 = vqrdmulhq_n_s16(v883, 17121); + int16x8_t v885 = vaddq_s16(v873, v884); + int16x8_t v886 = vqrdmulhq_n_s16(v885, 16563); + int16x8_t v887 = vaddq_s16(v863, v886); + int16x8_t v888 = vqrdmulhq_n_s16(v887, 16429); + int16x8_t v889 = vaddq_s16(v841, v888); + int16x8_t v890 = vqrdmulhq_n_s16(v889, 16395); + int16x8_t v891 = vaddq_s16(v795, v890); + int16x8_t v892 = vsubq_s16(v702, v704); + int16x8_t v893 = vsubq_s16(v706, v708); + int16x8_t v894 = vqrdmulhq_n_s16(v893, 29490); + int16x8_t v895 = vaddq_s16(v892, v894); + int16x8_t v896 = vsubq_s16(v712, v714); + int16x8_t v897 = vsubq_s16(v716, v718); + int16x8_t v898 = vqrdmulhq_n_s16(v897, 29490); + int16x8_t v899 = vaddq_s16(v896, v898); + int16x8_t v900 = vqrdmulhq_n_s16(v899, 18578); + int16x8_t v901 = vaddq_s16(v895, v900); + int16x8_t v902 = vsubq_s16(v724, v726); + int16x8_t v903 = vsubq_s16(v728, v730); + int16x8_t v904 = vqrdmulhq_n_s16(v903, 29490); + int16x8_t v905 = vaddq_s16(v902, v904); + int16x8_t v906 = vsubq_s16(v734, v736); + int16x8_t v907 = vsubq_s16(v738, v740); + int16x8_t v908 = vqrdmulhq_n_s16(v907, 29490); + int16x8_t v909 = vaddq_s16(v906, v908); + int16x8_t v910 = vqrdmulhq_n_s16(v909, 18578); + int16x8_t v911 = vaddq_s16(v905, v910); + int16x8_t v912 = vqrdmulhq_n_s16(v911, 16890); + int16x8_t v913 = vaddq_s16(v901, v912); + int16x8_t v914 = vsubq_s16(v748, v750); + int16x8_t v915_tmp = vqrdmulhq_n_s16(v754, 10045); + int16x8_t v915 = vaddq_s16(v915_tmp, v754); + int16x8_t v916 = vsubq_s16(v752, v915); + int16x8_t v917 = vqrdmulhq_n_s16(v916, 29490); + int16x8_t v918 = vaddq_s16(v914, v917); + int16x8_t v919 = vsubq_s16(v758, v760); + int16x8_t v920 = vsubq_s16(v762, v764); + int16x8_t v921 = vqrdmulhq_n_s16(v920, 29490); + int16x8_t v922 = vaddq_s16(v919, v921); + int16x8_t v923 = vqrdmulhq_n_s16(v922, 18578); + int16x8_t v924 = vaddq_s16(v918, v923); + int16x8_t v925 = vsubq_s16(v770, v772); + int16x8_t v926 = vsubq_s16(v774, v776); + int16x8_t v927 = vqrdmulhq_n_s16(v926, 29490); + int16x8_t v928 = vaddq_s16(v925, v927); + int16x8_t v929 = vsubq_s16(v780, v782); + int16x8_t v930 = vsubq_s16(v784, v786); + int16x8_t v931 = vqrdmulhq_n_s16(v930, 29490); + int16x8_t v932 = vaddq_s16(v929, v931); + int16x8_t v933 = vqrdmulhq_n_s16(v932, 18578); + int16x8_t v934 = vaddq_s16(v928, v933); + int16x8_t v935 = vqrdmulhq_n_s16(v934, 16890); + int16x8_t v936 = vaddq_s16(v924, v935); + int16x8_t v937 = vqrdmulhq_n_s16(v936, 16508); + int16x8_t v938 = vaddq_s16(v913, v937); + int16x8_t v939 = vsubq_s16(v796, v798); + int16x8_t v940 = vsubq_s16(v800, v802); + int16x8_t v941 = vqrdmulhq_n_s16(v940, 29490); + int16x8_t v942 = vaddq_s16(v939, v941); + int16x8_t v943 = vsubq_s16(v806, v808); + int16x8_t v944 = vsubq_s16(v810, v812); + int16x8_t v945 = vqrdmulhq_n_s16(v944, 29490); + int16x8_t v946 = vaddq_s16(v943, v945); + int16x8_t v947 = vqrdmulhq_n_s16(v946, 18578); + int16x8_t v948 = vaddq_s16(v942, v947); + int16x8_t v949 = vsubq_s16(v818, v820); + int16x8_t v950 = vsubq_s16(v822, v824); + int16x8_t v951 = vqrdmulhq_n_s16(v950, 29490); + int16x8_t v952 = vaddq_s16(v949, v951); + int16x8_t v953 = vsubq_s16(v828, v830); + int16x8_t v954 = vsubq_s16(v832, v834); + int16x8_t v955 = vqrdmulhq_n_s16(v954, 29490); + int16x8_t v956 = vaddq_s16(v953, v955); + int16x8_t v957 = vqrdmulhq_n_s16(v956, 18578); + int16x8_t v958 = vaddq_s16(v952, v957); + int16x8_t v959 = vqrdmulhq_n_s16(v958, 16890); + int16x8_t v960 = vaddq_s16(v948, v959); + int16x8_t v961 = vsubq_s16(v842, v844); + int16x8_t v962 = vsubq_s16(v846, v848); + int16x8_t v963 = vqrdmulhq_n_s16(v962, 29490); + int16x8_t v964 = vaddq_s16(v961, v963); + int16x8_t v965 = vsubq_s16(v852, v854); + int16x8_t v966 = vsubq_s16(v856, v858); + int16x8_t v967 = vqrdmulhq_n_s16(v966, 29490); + int16x8_t v968 = vaddq_s16(v965, v967); + int16x8_t v969 = vqrdmulhq_n_s16(v968, 18578); + int16x8_t v970 = vaddq_s16(v964, v969); + int16x8_t v971 = vsubq_s16(v864, v866); + int16x8_t v972 = vsubq_s16(v868, v870); + int16x8_t v973 = vqrdmulhq_n_s16(v972, 29490); + int16x8_t v974 = vaddq_s16(v971, v973); + int16x8_t v975 = vsubq_s16(v874, v876); + int16x8_t v976 = vsubq_s16(v878, v880); + int16x8_t v977 = vqrdmulhq_n_s16(v976, 29490); + int16x8_t v978 = vaddq_s16(v975, v977); + int16x8_t v979 = vqrdmulhq_n_s16(v978, 18578); + int16x8_t v980 = vaddq_s16(v974, v979); + int16x8_t v981 = vqrdmulhq_n_s16(v980, 16890); + int16x8_t v982 = vaddq_s16(v970, v981); + int16x8_t v983 = vqrdmulhq_n_s16(v982, 16508); + int16x8_t v984 = vaddq_s16(v960, v983); + int16x8_t v985 = vqrdmulhq_n_s16(v984, 16415); + int16x8_t v986 = vaddq_s16(v938, v985); + int16x8_t v987 = vsubq_s16(v2, v8); + int16x8_t v988 = vsubq_s16(v15, v22); + int16x8_t v989_tmp = vqrdmulhq_n_s16(v988, 18446); + int16x8_t v989 = vmlaq_n_s16(v989_tmp, v988, 2); + int16x8_t v990 = vaddq_s16(v987, v989); + int16x8_t v991 = vsubq_s16(v31, v41); + int16x8_t v992 = vsubq_s16(v48, v56); + int16x8_t v993_tmp = vqrdmulhq_n_s16(v992, 18446); + int16x8_t v993 = vmlaq_n_s16(v993_tmp, v992, 2); + int16x8_t v994 = vaddq_s16(v991, v993); + int16x8_t v995 = vqrdmulhq_n_s16(v994, 21195); + int16x8_t v996 = vaddq_s16(v990, v995); + int16x8_t v997 = vsubq_s16(v67, v77); + int16x8_t v998 = vsubq_s16(v90, v99); + int16x8_t v999_tmp = vqrdmulhq_n_s16(v998, 18446); + int16x8_t v999 = vmlaq_n_s16(v999_tmp, v998, 2); + int16x8_t v1000 = vaddq_s16(v997, v999); + int16x8_t v1001 = vsubq_s16(v108, v118); + int16x8_t v1002 = vsubq_s16(v125, v134); + int16x8_t v1003_tmp = vqrdmulhq_n_s16(v1002, 18446); + int16x8_t v1003 = vmlaq_n_s16(v1003_tmp, v1002, 2); + int16x8_t v1004 = vaddq_s16(v1001, v1003); + int16x8_t v1005 = vqrdmulhq_n_s16(v1004, 21195); + int16x8_t v1006 = vaddq_s16(v1000, v1005); + int16x8_t v1007 = vqrdmulhq_n_s16(v1006, 17401); + int16x8_t v1008 = vaddq_s16(v996, v1007); + int16x8_t v1009 = vsubq_s16(v147, v157); + int16x8_t v1010 = vsubq_s16(v170, v179); + int16x8_t v1011_tmp = vqrdmulhq_n_s16(v1010, 18446); + int16x8_t v1011 = vmlaq_n_s16(v1011_tmp, v1010, 2); + int16x8_t v1012 = vaddq_s16(v1009, v1011); + int16x8_t v1013 = vsubq_s16(v194, v212); + int16x8_t v1014 = vsubq_s16(v219, v229); + int16x8_t v1015_tmp = vqrdmulhq_n_s16(v1014, 18446); + int16x8_t v1015 = vmlaq_n_s16(v1015_tmp, v1014, 2); + int16x8_t v1016 = vaddq_s16(v1013, v1015); + int16x8_t v1017 = vqrdmulhq_n_s16(v1016, 21195); + int16x8_t v1018 = vaddq_s16(v1012, v1017); + int16x8_t v1019 = vsubq_s16(v240, v250); + int16x8_t v1020 = vsubq_s16(v263, v272); + int16x8_t v1021_tmp = vqrdmulhq_n_s16(v1020, 18446); + int16x8_t v1021 = vmlaq_n_s16(v1021_tmp, v1020, 2); + int16x8_t v1022 = vaddq_s16(v1019, v1021); + int16x8_t v1023 = vsubq_s16(v281, v291); + int16x8_t v1024 = vsubq_s16(v298, v308); + int16x8_t v1025_tmp = vqrdmulhq_n_s16(v1024, 18446); + int16x8_t v1025 = vmlaq_n_s16(v1025_tmp, v1024, 2); + int16x8_t v1026 = vaddq_s16(v1023, v1025); + int16x8_t v1027 = vqrdmulhq_n_s16(v1026, 21195); + int16x8_t v1028 = vaddq_s16(v1022, v1027); + int16x8_t v1029 = vqrdmulhq_n_s16(v1028, 17401); + int16x8_t v1030 = vaddq_s16(v1018, v1029); + int16x8_t v1031 = vqrdmulhq_n_s16(v1030, 16629); + int16x8_t v1032 = vaddq_s16(v1008, v1031); + int16x8_t v1033 = vsubq_s16(v323, v333); + int16x8_t v1034 = vsubq_s16(v346, v355); + int16x8_t v1035_tmp = vqrdmulhq_n_s16(v1034, 18446); + int16x8_t v1035 = vmlaq_n_s16(v1035_tmp, v1034, 2); + int16x8_t v1036 = vaddq_s16(v1033, v1035); + int16x8_t v1037 = vsubq_s16(v370, v388); + int16x8_t v1038 = vsubq_s16(v395, v405); + int16x8_t v1039_tmp = vqrdmulhq_n_s16(v1038, 18446); + int16x8_t v1039 = vmlaq_n_s16(v1039_tmp, v1038, 2); + int16x8_t v1040 = vaddq_s16(v1037, v1039); + int16x8_t v1041 = vqrdmulhq_n_s16(v1040, 21195); + int16x8_t v1042 = vaddq_s16(v1036, v1041); + int16x8_t v1043 = vsubq_s16(v422, v440); + int16x8_t v1044 = vsubq_s16(v465, v478); + int16x8_t v1045_tmp = vqrdmulhq_n_s16(v1044, 18446); + int16x8_t v1045 = vmlaq_n_s16(v1045_tmp, v1044, 2); + int16x8_t v1046 = vaddq_s16(v1043, v1045); + int16x8_t v1047 = vsubq_s16(v487, v497); + int16x8_t v1048 = vsubq_s16(v504, v515); + int16x8_t v1049_tmp = vqrdmulhq_n_s16(v1048, 18446); + int16x8_t v1049 = vmlaq_n_s16(v1049_tmp, v1048, 2); + int16x8_t v1050 = vaddq_s16(v1047, v1049); + int16x8_t v1051 = vqrdmulhq_n_s16(v1050, 21195); + int16x8_t v1052 = vaddq_s16(v1046, v1051); + int16x8_t v1053 = vqrdmulhq_n_s16(v1052, 17401); + int16x8_t v1054 = vaddq_s16(v1042, v1053); + int16x8_t v1055 = vsubq_s16(v528, v538); + int16x8_t v1056 = vsubq_s16(v551, v560); + int16x8_t v1057_tmp = vqrdmulhq_n_s16(v1056, 18446); + int16x8_t v1057 = vmlaq_n_s16(v1057_tmp, v1056, 2); + int16x8_t v1058 = vaddq_s16(v1055, v1057); + int16x8_t v1059 = vsubq_s16(v575, v593); + int16x8_t v1060 = vsubq_s16(v600, v610); + int16x8_t v1061_tmp = vqrdmulhq_n_s16(v1060, 18446); + int16x8_t v1061 = vmlaq_n_s16(v1061_tmp, v1060, 2); + int16x8_t v1062 = vaddq_s16(v1059, v1061); + int16x8_t v1063 = vqrdmulhq_n_s16(v1062, 21195); + int16x8_t v1064 = vaddq_s16(v1058, v1063); + int16x8_t v1065 = vsubq_s16(v621, v631); + int16x8_t v1066 = vsubq_s16(v644, v653); + int16x8_t v1067_tmp = vqrdmulhq_n_s16(v1066, 18446); + int16x8_t v1067 = vmlaq_n_s16(v1067_tmp, v1066, 2); + int16x8_t v1068 = vaddq_s16(v1065, v1067); + int16x8_t v1069 = vsubq_s16(v662, v672); + int16x8_t v1070 = vsubq_s16(v679, v690); + int16x8_t v1071_tmp = vqrdmulhq_n_s16(v1070, 18446); + int16x8_t v1071 = vmlaq_n_s16(v1071_tmp, v1070, 2); + int16x8_t v1072 = vaddq_s16(v1069, v1071); + int16x8_t v1073 = vqrdmulhq_n_s16(v1072, 21195); + int16x8_t v1074 = vaddq_s16(v1068, v1073); + int16x8_t v1075 = vqrdmulhq_n_s16(v1074, 17401); + int16x8_t v1076 = vaddq_s16(v1064, v1075); + int16x8_t v1077 = vqrdmulhq_n_s16(v1076, 16629); + int16x8_t v1078 = vaddq_s16(v1054, v1077); + int16x8_t v1079 = vqrdmulhq_n_s16(v1078, 16445); + int16x8_t v1080 = vaddq_s16(v1032, v1079); + int16x8_t v1081 = vsubq_s16(v987, v989); + int16x8_t v1082 = vsubq_s16(v991, v993); + int16x8_t v1083 = vqrdmulhq_n_s16(v1082, 25826); + int16x8_t v1084 = vaddq_s16(v1081, v1083); + int16x8_t v1085 = vsubq_s16(v997, v999); + int16x8_t v1086 = vsubq_s16(v1001, v1003); + int16x8_t v1087 = vqrdmulhq_n_s16(v1086, 25826); + int16x8_t v1088 = vaddq_s16(v1085, v1087); + int16x8_t v1089 = vqrdmulhq_n_s16(v1088, 18124); + int16x8_t v1090 = vaddq_s16(v1084, v1089); + int16x8_t v1091 = vsubq_s16(v1009, v1011); + int16x8_t v1092 = vsubq_s16(v1013, v1015); + int16x8_t v1093 = vqrdmulhq_n_s16(v1092, 25826); + int16x8_t v1094 = vaddq_s16(v1091, v1093); + int16x8_t v1095 = vsubq_s16(v1019, v1021); + int16x8_t v1096 = vsubq_s16(v1023, v1025); + int16x8_t v1097 = vqrdmulhq_n_s16(v1096, 25826); + int16x8_t v1098 = vaddq_s16(v1095, v1097); + int16x8_t v1099 = vqrdmulhq_n_s16(v1098, 18124); + int16x8_t v1100 = vaddq_s16(v1094, v1099); + int16x8_t v1101 = vqrdmulhq_n_s16(v1100, 16792); + int16x8_t v1102 = vaddq_s16(v1090, v1101); + int16x8_t v1103 = vsubq_s16(v1033, v1035); + int16x8_t v1104 = vsubq_s16(v1037, v1039); + int16x8_t v1105 = vqrdmulhq_n_s16(v1104, 25826); + int16x8_t v1106 = vaddq_s16(v1103, v1105); + int16x8_t v1107 = vsubq_s16(v1043, v1045); + int16x8_t v1108 = vsubq_s16(v1047, v1049); + int16x8_t v1109 = vqrdmulhq_n_s16(v1108, 25826); + int16x8_t v1110 = vaddq_s16(v1107, v1109); + int16x8_t v1111 = vqrdmulhq_n_s16(v1110, 18124); + int16x8_t v1112 = vaddq_s16(v1106, v1111); + int16x8_t v1113 = vsubq_s16(v1055, v1057); + int16x8_t v1114 = vsubq_s16(v1059, v1061); + int16x8_t v1115 = vqrdmulhq_n_s16(v1114, 25826); + int16x8_t v1116 = vaddq_s16(v1113, v1115); + int16x8_t v1117 = vsubq_s16(v1065, v1067); + int16x8_t v1118 = vsubq_s16(v1069, v1071); + int16x8_t v1119 = vqrdmulhq_n_s16(v1118, 25826); + int16x8_t v1120 = vaddq_s16(v1117, v1119); + int16x8_t v1121 = vqrdmulhq_n_s16(v1120, 18124); + int16x8_t v1122 = vaddq_s16(v1116, v1121); + int16x8_t v1123 = vqrdmulhq_n_s16(v1122, 16792); + int16x8_t v1124 = vaddq_s16(v1112, v1123); + int16x8_t v1125 = vqrdmulhq_n_s16(v1124, 16484); + int16x8_t v1126 = vaddq_s16(v1102, v1125); + int16x8_t v1127 = vsubq_s16(v892, v894); + int16x8_t v1128 = vsubq_s16(v896, v898); + int16x8_t v1129_tmp = vqrdmulhq_n_s16(v1128, 1988); + int16x8_t v1129 = vaddq_s16(v1129_tmp, v1128); + int16x8_t v1130 = vaddq_s16(v1127, v1129); + int16x8_t v1131 = vsubq_s16(v902, v904); + int16x8_t v1132 = vsubq_s16(v906, v908); + int16x8_t v1133_tmp = vqrdmulhq_n_s16(v1132, 1988); + int16x8_t v1133 = vaddq_s16(v1133_tmp, v1132); + int16x8_t v1134 = vaddq_s16(v1131, v1133); + int16x8_t v1135 = vqrdmulhq_n_s16(v1134, 19102); + int16x8_t v1136 = vaddq_s16(v1130, v1135); + int16x8_t v1137 = vsubq_s16(v914, v917); + int16x8_t v1138 = vsubq_s16(v919, v921); + int16x8_t v1139_tmp = vqrdmulhq_n_s16(v1138, 1988); + int16x8_t v1139 = vaddq_s16(v1139_tmp, v1138); + int16x8_t v1140 = vaddq_s16(v1137, v1139); + int16x8_t v1141 = vsubq_s16(v925, v927); + int16x8_t v1142 = vsubq_s16(v929, v931); + int16x8_t v1143_tmp = vqrdmulhq_n_s16(v1142, 1988); + int16x8_t v1143 = vaddq_s16(v1143_tmp, v1142); + int16x8_t v1144 = vaddq_s16(v1141, v1143); + int16x8_t v1145 = vqrdmulhq_n_s16(v1144, 19102); + int16x8_t v1146 = vaddq_s16(v1140, v1145); + int16x8_t v1147 = vqrdmulhq_n_s16(v1146, 17000); + int16x8_t v1148 = vaddq_s16(v1136, v1147); + int16x8_t v1149 = vsubq_s16(v939, v941); + int16x8_t v1150 = vsubq_s16(v943, v945); + int16x8_t v1151_tmp = vqrdmulhq_n_s16(v1150, 1988); + int16x8_t v1151 = vaddq_s16(v1151_tmp, v1150); + int16x8_t v1152 = vaddq_s16(v1149, v1151); + int16x8_t v1153 = vsubq_s16(v949, v951); + int16x8_t v1154 = vsubq_s16(v953, v955); + int16x8_t v1155_tmp = vqrdmulhq_n_s16(v1154, 1988); + int16x8_t v1155 = vaddq_s16(v1155_tmp, v1154); + int16x8_t v1156 = vaddq_s16(v1153, v1155); + int16x8_t v1157 = vqrdmulhq_n_s16(v1156, 19102); + int16x8_t v1158 = vaddq_s16(v1152, v1157); + int16x8_t v1159 = vsubq_s16(v961, v963); + int16x8_t v1160 = vsubq_s16(v965, v967); + int16x8_t v1161_tmp = vqrdmulhq_n_s16(v1160, 1988); + int16x8_t v1161 = vaddq_s16(v1161_tmp, v1160); + int16x8_t v1162 = vaddq_s16(v1159, v1161); + int16x8_t v1163 = vsubq_s16(v971, v973); + int16x8_t v1164 = vsubq_s16(v975, v977); + int16x8_t v1165_tmp = vqrdmulhq_n_s16(v1164, 1988); + int16x8_t v1165 = vaddq_s16(v1165_tmp, v1164); + int16x8_t v1166 = vaddq_s16(v1163, v1165); + int16x8_t v1167 = vqrdmulhq_n_s16(v1166, 19102); + int16x8_t v1168 = vaddq_s16(v1162, v1167); + int16x8_t v1169 = vqrdmulhq_n_s16(v1168, 17000); + int16x8_t v1170 = vaddq_s16(v1158, v1169); + int16x8_t v1171 = vqrdmulhq_n_s16(v1170, 16534); + int16x8_t v1172 = vaddq_s16(v1148, v1171); + int16x8_t v1173 = vsubq_s16(v705, v710); + int16x8_t v1174 = vsubq_s16(v715, v720); + int16x8_t v1175_tmp = vqrdmulhq_n_s16(v1174, 23673); + int16x8_t v1175 = vaddq_s16(v1175_tmp, v1174); + int16x8_t v1176 = vaddq_s16(v1173, v1175); + int16x8_t v1177 = vsubq_s16(v727, v732); + int16x8_t v1178 = vsubq_s16(v737, v742); + int16x8_t v1179_tmp = vqrdmulhq_n_s16(v1178, 23673); + int16x8_t v1179 = vaddq_s16(v1179_tmp, v1178); + int16x8_t v1180 = vaddq_s16(v1177, v1179); + int16x8_t v1181 = vqrdmulhq_n_s16(v1180, 20398); + int16x8_t v1182 = vaddq_s16(v1176, v1181); + int16x8_t v1183 = vsubq_s16(v751, v756); + int16x8_t v1184 = vsubq_s16(v761, v766); + int16x8_t v1185_tmp = vqrdmulhq_n_s16(v1184, 23673); + int16x8_t v1185 = vaddq_s16(v1185_tmp, v1184); + int16x8_t v1186 = vaddq_s16(v1183, v1185); + int16x8_t v1187 = vsubq_s16(v773, v778); + int16x8_t v1188 = vsubq_s16(v783, v788); + int16x8_t v1189_tmp = vqrdmulhq_n_s16(v1188, 23673); + int16x8_t v1189 = vaddq_s16(v1189_tmp, v1188); + int16x8_t v1190 = vaddq_s16(v1187, v1189); + int16x8_t v1191 = vqrdmulhq_n_s16(v1190, 20398); + int16x8_t v1192 = vaddq_s16(v1186, v1191); + int16x8_t v1193 = vqrdmulhq_n_s16(v1192, 17255); + int16x8_t v1194 = vaddq_s16(v1182, v1193); + int16x8_t v1195 = vsubq_s16(v799, v804); + int16x8_t v1196 = vsubq_s16(v809, v814); + int16x8_t v1197_tmp = vqrdmulhq_n_s16(v1196, 23673); + int16x8_t v1197 = vaddq_s16(v1197_tmp, v1196); + int16x8_t v1198 = vaddq_s16(v1195, v1197); + int16x8_t v1199 = vsubq_s16(v821, v826); + int16x8_t v1200 = vsubq_s16(v831, v836); + int16x8_t v1201_tmp = vqrdmulhq_n_s16(v1200, 23673); + int16x8_t v1201 = vaddq_s16(v1201_tmp, v1200); + int16x8_t v1202 = vaddq_s16(v1199, v1201); + int16x8_t v1203 = vqrdmulhq_n_s16(v1202, 20398); + int16x8_t v1204 = vaddq_s16(v1198, v1203); + int16x8_t v1205 = vsubq_s16(v845, v850); + int16x8_t v1206 = vsubq_s16(v855, v860); + int16x8_t v1207_tmp = vqrdmulhq_n_s16(v1206, 23673); + int16x8_t v1207 = vaddq_s16(v1207_tmp, v1206); + int16x8_t v1208 = vaddq_s16(v1205, v1207); + int16x8_t v1209 = vsubq_s16(v867, v872); + int16x8_t v1210 = vsubq_s16(v877, v882); + int16x8_t v1211_tmp = vqrdmulhq_n_s16(v1210, 23673); + int16x8_t v1211 = vaddq_s16(v1211_tmp, v1210); + int16x8_t v1212 = vaddq_s16(v1209, v1211); + int16x8_t v1213 = vqrdmulhq_n_s16(v1212, 20398); + int16x8_t v1214 = vaddq_s16(v1208, v1213); + int16x8_t v1215 = vqrdmulhq_n_s16(v1214, 17255); + int16x8_t v1216 = vaddq_s16(v1204, v1215); + int16x8_t v1217 = vqrdmulhq_n_s16(v1216, 16595); + int16x8_t v1218 = vaddq_s16(v1194, v1217); + int16x8_t v1219 = vsubq_s16(v9, v24); + int16x8_t v1220 = vsubq_s16(v42, v58); + int16x8_t v1221_tmp = vqrdmulhq_n_s16(v1220, 3314); + int16x8_t v1221 = vmlaq_n_s16(v1221_tmp, v1220, 5); + int16x8_t v1222 = vaddq_s16(v1219, v1221); + int16x8_t v1223 = vsubq_s16(v78, v101); + int16x8_t v1224 = vsubq_s16(v119, v136); + int16x8_t v1225_tmp = vqrdmulhq_n_s16(v1224, 3314); + int16x8_t v1225 = vmlaq_n_s16(v1225_tmp, v1224, 5); + int16x8_t v1226 = vaddq_s16(v1223, v1225); + int16x8_t v1227 = vqrdmulhq_n_s16(v1226, 22112); + int16x8_t v1228 = vaddq_s16(v1222, v1227); + int16x8_t v1229 = vsubq_s16(v158, v181); + int16x8_t v1230 = vsubq_s16(v213, v231); + int16x8_t v1231_tmp = vqrdmulhq_n_s16(v1230, 3314); + int16x8_t v1231 = vmlaq_n_s16(v1231_tmp, v1230, 5); + int16x8_t v1232 = vaddq_s16(v1229, v1231); + int16x8_t v1233 = vsubq_s16(v251, v274); + int16x8_t v1234 = vsubq_s16(v292, v310); + int16x8_t v1235_tmp = vqrdmulhq_n_s16(v1234, 3314); + int16x8_t v1235 = vmlaq_n_s16(v1235_tmp, v1234, 5); + int16x8_t v1236 = vaddq_s16(v1233, v1235); + int16x8_t v1237 = vqrdmulhq_n_s16(v1236, 22112); + int16x8_t v1238 = vaddq_s16(v1232, v1237); + int16x8_t v1239 = vqrdmulhq_n_s16(v1238, 17561); + int16x8_t v1240 = vaddq_s16(v1228, v1239); + int16x8_t v1241 = vsubq_s16(v334, v357); + int16x8_t v1242 = vsubq_s16(v389, v407); + int16x8_t v1243_tmp = vqrdmulhq_n_s16(v1242, 3314); + int16x8_t v1243 = vmlaq_n_s16(v1243_tmp, v1242, 5); + int16x8_t v1244 = vaddq_s16(v1241, v1243); + int16x8_t v1245 = vsubq_s16(v441, v480); + int16x8_t v1246 = vsubq_s16(v498, v517); + int16x8_t v1247_tmp = vqrdmulhq_n_s16(v1246, 3314); + int16x8_t v1247 = vmlaq_n_s16(v1247_tmp, v1246, 5); + int16x8_t v1248 = vaddq_s16(v1245, v1247); + int16x8_t v1249 = vqrdmulhq_n_s16(v1248, 22112); + int16x8_t v1250 = vaddq_s16(v1244, v1249); + int16x8_t v1251 = vsubq_s16(v539, v562); + int16x8_t v1252 = vsubq_s16(v594, v612); + int16x8_t v1253_tmp = vqrdmulhq_n_s16(v1252, 3314); + int16x8_t v1253 = vmlaq_n_s16(v1253_tmp, v1252, 5); + int16x8_t v1254 = vaddq_s16(v1251, v1253); + int16x8_t v1255 = vsubq_s16(v632, v655); + int16x8_t v1256 = vsubq_s16(v673, v692); + int16x8_t v1257_tmp = vqrdmulhq_n_s16(v1256, 3314); + int16x8_t v1257 = vmlaq_n_s16(v1257_tmp, v1256, 5); + int16x8_t v1258 = vaddq_s16(v1255, v1257); + int16x8_t v1259 = vqrdmulhq_n_s16(v1258, 22112); + int16x8_t v1260 = vaddq_s16(v1254, v1259); + int16x8_t v1261 = vqrdmulhq_n_s16(v1260, 17561); + int16x8_t v1262 = vaddq_s16(v1250, v1261); + int16x8_t v1263 = vqrdmulhq_n_s16(v1262, 16666); + int16x8_t v1264 = vaddq_s16(v1240, v1263); + int16x8_t v1265 = vsubq_s16(v1219, v1221); + int16x8_t v1266 = vsubq_s16(v1223, v1225); + int16x8_t v1267 = vqrdmulhq_n_s16(v1266, 24397); + int16x8_t v1268 = vaddq_s16(v1265, v1267); + int16x8_t v1269 = vsubq_s16(v1229, v1231); + int16x8_t v1270 = vsubq_s16(v1233, v1235); + int16x8_t v1271 = vqrdmulhq_n_s16(v1270, 24397); + int16x8_t v1272 = vaddq_s16(v1269, v1271); + int16x8_t v1273 = vqrdmulhq_n_s16(v1272, 17921); + int16x8_t v1274 = vaddq_s16(v1268, v1273); + int16x8_t v1275 = vsubq_s16(v1241, v1243); + int16x8_t v1276 = vsubq_s16(v1245, v1247); + int16x8_t v1277 = vqrdmulhq_n_s16(v1276, 24397); + int16x8_t v1278 = vaddq_s16(v1275, v1277); + int16x8_t v1279 = vsubq_s16(v1251, v1253); + int16x8_t v1280 = vsubq_s16(v1255, v1257); + int16x8_t v1281 = vqrdmulhq_n_s16(v1280, 24397); + int16x8_t v1282 = vaddq_s16(v1279, v1281); + int16x8_t v1283 = vqrdmulhq_n_s16(v1282, 17921); + int16x8_t v1284 = vaddq_s16(v1278, v1283); + int16x8_t v1285 = vqrdmulhq_n_s16(v1284, 16747); + int16x8_t v1286 = vaddq_s16(v1274, v1285); + int16x8_t v1287 = vsubq_s16(v1173, v1175); + int16x8_t v1288 = vsubq_s16(v1177, v1179); + int16x8_t v1289 = vqrdmulhq_n_s16(v1288, 27504); + int16x8_t v1290 = vaddq_s16(v1287, v1289); + int16x8_t v1291 = vsubq_s16(v1183, v1185); + int16x8_t v1292 = vsubq_s16(v1187, v1189); + int16x8_t v1293 = vqrdmulhq_n_s16(v1292, 27504); + int16x8_t v1294 = vaddq_s16(v1291, v1293); + int16x8_t v1295 = vqrdmulhq_n_s16(v1294, 18343); + int16x8_t v1296 = vaddq_s16(v1290, v1295); + int16x8_t v1297 = vsubq_s16(v1195, v1197); + int16x8_t v1298 = vsubq_s16(v1199, v1201); + int16x8_t v1299 = vqrdmulhq_n_s16(v1298, 27504); + int16x8_t v1300 = vaddq_s16(v1297, v1299); + int16x8_t v1301 = vsubq_s16(v1205, v1207); + int16x8_t v1302 = vsubq_s16(v1209, v1211); + int16x8_t v1303 = vqrdmulhq_n_s16(v1302, 27504); + int16x8_t v1304 = vaddq_s16(v1301, v1303); + int16x8_t v1305 = vqrdmulhq_n_s16(v1304, 18343); + int16x8_t v1306 = vaddq_s16(v1300, v1305); + int16x8_t v1307 = vqrdmulhq_n_s16(v1306, 16840); + int16x8_t v1308 = vaddq_s16(v1296, v1307); + int16x8_t v1309 = vsubq_s16(v1127, v1129); + int16x8_t v1310 = vsubq_s16(v1131, v1133); + int16x8_t v1311 = vqrdmulhq_n_s16(v1310, 31869); + int16x8_t v1312 = vaddq_s16(v1309, v1311); + int16x8_t v1313 = vsubq_s16(v1137, v1139); + int16x8_t v1314 = vsubq_s16(v1141, v1143); + int16x8_t v1315 = vqrdmulhq_n_s16(v1314, 31869); + int16x8_t v1316 = vaddq_s16(v1313, v1315); + int16x8_t v1317 = vqrdmulhq_n_s16(v1316, 18830); + int16x8_t v1318 = vaddq_s16(v1312, v1317); + int16x8_t v1319 = vsubq_s16(v1149, v1151); + int16x8_t v1320 = vsubq_s16(v1153, v1155); + int16x8_t v1321 = vqrdmulhq_n_s16(v1320, 31869); + int16x8_t v1322 = vaddq_s16(v1319, v1321); + int16x8_t v1323 = vsubq_s16(v1159, v1161); + int16x8_t v1324 = vsubq_s16(v1163, v1165); + int16x8_t v1325 = vqrdmulhq_n_s16(v1324, 31869); + int16x8_t v1326 = vaddq_s16(v1323, v1325); + int16x8_t v1327 = vqrdmulhq_n_s16(v1326, 18830); + int16x8_t v1328 = vaddq_s16(v1322, v1327); + int16x8_t v1329 = vqrdmulhq_n_s16(v1328, 16944); + int16x8_t v1330 = vaddq_s16(v1318, v1329); + int16x8_t v1331 = vsubq_s16(v1081, v1083); + int16x8_t v1332 = vsubq_s16(v1085, v1087); + int16x8_t v1333_tmp = vqrdmulhq_n_s16(v1332, 5552); + int16x8_t v1333 = vaddq_s16(v1333_tmp, v1332); + int16x8_t v1334 = vaddq_s16(v1331, v1333); + int16x8_t v1335 = vsubq_s16(v1091, v1093); + int16x8_t v1336 = vsubq_s16(v1095, v1097); + int16x8_t v1337_tmp = vqrdmulhq_n_s16(v1336, 5552); + int16x8_t v1337 = vaddq_s16(v1337_tmp, v1336); + int16x8_t v1338 = vaddq_s16(v1335, v1337); + int16x8_t v1339 = vqrdmulhq_n_s16(v1338, 19393); + int16x8_t v1340 = vaddq_s16(v1334, v1339); + int16x8_t v1341 = vsubq_s16(v1103, v1105); + int16x8_t v1342 = vsubq_s16(v1107, v1109); + int16x8_t v1343_tmp = vqrdmulhq_n_s16(v1342, 5552); + int16x8_t v1343 = vaddq_s16(v1343_tmp, v1342); + int16x8_t v1344 = vaddq_s16(v1341, v1343); + int16x8_t v1345 = vsubq_s16(v1113, v1115); + int16x8_t v1346 = vsubq_s16(v1117, v1119); + int16x8_t v1347_tmp = vqrdmulhq_n_s16(v1346, 5552); + int16x8_t v1347 = vaddq_s16(v1347_tmp, v1346); + int16x8_t v1348 = vaddq_s16(v1345, v1347); + int16x8_t v1349 = vqrdmulhq_n_s16(v1348, 19393); + int16x8_t v1350 = vaddq_s16(v1344, v1349); + int16x8_t v1351 = vqrdmulhq_n_s16(v1350, 17059); + int16x8_t v1352 = vaddq_s16(v1340, v1351); + int16x8_t v1353 = vsubq_s16(v990, v995); + int16x8_t v1354 = vsubq_s16(v1000, v1005); + int16x8_t v1355_tmp = vqrdmulhq_n_s16(v1354, 15865); + int16x8_t v1355 = vaddq_s16(v1355_tmp, v1354); + int16x8_t v1356 = vaddq_s16(v1353, v1355); + int16x8_t v1357 = vsubq_s16(v1012, v1017); + int16x8_t v1358 = vsubq_s16(v1022, v1027); + int16x8_t v1359_tmp = vqrdmulhq_n_s16(v1358, 15865); + int16x8_t v1359 = vaddq_s16(v1359_tmp, v1358); + int16x8_t v1360 = vaddq_s16(v1357, v1359); + int16x8_t v1361 = vqrdmulhq_n_s16(v1360, 20040); + int16x8_t v1362 = vaddq_s16(v1356, v1361); + int16x8_t v1363 = vsubq_s16(v1036, v1041); + int16x8_t v1364 = vsubq_s16(v1046, v1051); + int16x8_t v1365_tmp = vqrdmulhq_n_s16(v1364, 15865); + int16x8_t v1365 = vaddq_s16(v1365_tmp, v1364); + int16x8_t v1366 = vaddq_s16(v1363, v1365); + int16x8_t v1367 = vsubq_s16(v1058, v1063); + int16x8_t v1368 = vsubq_s16(v1068, v1073); + int16x8_t v1369_tmp = vqrdmulhq_n_s16(v1368, 15865); + int16x8_t v1369 = vaddq_s16(v1369_tmp, v1368); + int16x8_t v1370 = vaddq_s16(v1367, v1369); + int16x8_t v1371 = vqrdmulhq_n_s16(v1370, 20040); + int16x8_t v1372 = vaddq_s16(v1366, v1371); + int16x8_t v1373 = vqrdmulhq_n_s16(v1372, 17187); + int16x8_t v1374 = vaddq_s16(v1362, v1373); + int16x8_t v1375 = vsubq_s16(v895, v900); + int16x8_t v1376 = vsubq_s16(v905, v910); + int16x8_t v1377_tmp = vqrdmulhq_n_s16(v1376, 1893); + int16x8_t v1377 = vmlaq_n_s16(v1377_tmp, v1376, 2); + int16x8_t v1378 = vaddq_s16(v1375, v1377); + int16x8_t v1379 = vsubq_s16(v918, v923); + int16x8_t v1380 = vsubq_s16(v928, v933); + int16x8_t v1381_tmp = vqrdmulhq_n_s16(v1380, 1893); + int16x8_t v1381 = vmlaq_n_s16(v1381_tmp, v1380, 2); + int16x8_t v1382 = vaddq_s16(v1379, v1381); + int16x8_t v1383 = vqrdmulhq_n_s16(v1382, 20783); + int16x8_t v1384 = vaddq_s16(v1378, v1383); + int16x8_t v1385 = vsubq_s16(v942, v947); + int16x8_t v1386 = vsubq_s16(v952, v957); + int16x8_t v1387_tmp = vqrdmulhq_n_s16(v1386, 1893); + int16x8_t v1387 = vmlaq_n_s16(v1387_tmp, v1386, 2); + int16x8_t v1388 = vaddq_s16(v1385, v1387); + int16x8_t v1389 = vsubq_s16(v964, v969); + int16x8_t v1390 = vsubq_s16(v974, v979); + int16x8_t v1391_tmp = vqrdmulhq_n_s16(v1390, 1893); + int16x8_t v1391 = vmlaq_n_s16(v1391_tmp, v1390, 2); + int16x8_t v1392 = vaddq_s16(v1389, v1391); + int16x8_t v1393 = vqrdmulhq_n_s16(v1392, 20783); + int16x8_t v1394 = vaddq_s16(v1388, v1393); + int16x8_t v1395 = vqrdmulhq_n_s16(v1394, 17326); + int16x8_t v1396 = vaddq_s16(v1384, v1395); + int16x8_t v1397 = vsubq_s16(v711, v722); + int16x8_t v1398 = vsubq_s16(v733, v744); + int16x8_t v1399_tmp = vqrdmulhq_n_s16(v1398, 13357); + int16x8_t v1399 = vmlaq_n_s16(v1399_tmp, v1398, 3); + int16x8_t v1400 = vaddq_s16(v1397, v1399); + int16x8_t v1401 = vsubq_s16(v757, v768); + int16x8_t v1402 = vsubq_s16(v779, v790); + int16x8_t v1403_tmp = vqrdmulhq_n_s16(v1402, 13357); + int16x8_t v1403 = vmlaq_n_s16(v1403_tmp, v1402, 3); + int16x8_t v1404 = vaddq_s16(v1401, v1403); + int16x8_t v1405 = vqrdmulhq_n_s16(v1404, 21637); + int16x8_t v1406 = vaddq_s16(v1400, v1405); + int16x8_t v1407 = vsubq_s16(v805, v816); + int16x8_t v1408 = vsubq_s16(v827, v838); + int16x8_t v1409_tmp = vqrdmulhq_n_s16(v1408, 13357); + int16x8_t v1409 = vmlaq_n_s16(v1409_tmp, v1408, 3); + int16x8_t v1410 = vaddq_s16(v1407, v1409); + int16x8_t v1411 = vsubq_s16(v851, v862); + int16x8_t v1412 = vsubq_s16(v873, v884); + int16x8_t v1413_tmp = vqrdmulhq_n_s16(v1412, 13357); + int16x8_t v1413 = vmlaq_n_s16(v1413_tmp, v1412, 3); + int16x8_t v1414 = vaddq_s16(v1411, v1413); + int16x8_t v1415 = vqrdmulhq_n_s16(v1414, 21637); + int16x8_t v1416 = vaddq_s16(v1410, v1415); + int16x8_t v1417 = vqrdmulhq_n_s16(v1416, 17479); + int16x8_t v1418 = vaddq_s16(v1406, v1417); + int16x8_t v1419 = vsubq_s16(v25, v60); + int16x8_t v1420 = vsubq_s16(v102, v138); + int16x8_t v1421_tmp = vqrdmulhq_n_s16(v1420, 6226); + int16x8_t v1421 = vmlaq_n_s16(v1421_tmp, v1420, 10); + int16x8_t v1422 = vaddq_s16(v1419, v1421); + int16x8_t v1423 = vsubq_s16(v182, v233); + int16x8_t v1424 = vsubq_s16(v275, v312); + int16x8_t v1425_tmp = vqrdmulhq_n_s16(v1424, 6226); + int16x8_t v1425 = vmlaq_n_s16(v1425_tmp, v1424, 10); + int16x8_t v1426 = vaddq_s16(v1423, v1425); + int16x8_t v1427 = vqrdmulhq_n_s16(v1426, 22622); + int16x8_t v1428 = vaddq_s16(v1422, v1427); + int16x8_t v1429 = vsubq_s16(v358, v409); + int16x8_t v1430 = vsubq_s16(v481, v519); + int16x8_t v1431_tmp = vqrdmulhq_n_s16(v1430, 6226); + int16x8_t v1431 = vmlaq_n_s16(v1431_tmp, v1430, 10); + int16x8_t v1432 = vaddq_s16(v1429, v1431); + int16x8_t v1433 = vsubq_s16(v563, v614); + int16x8_t v1434 = vsubq_s16(v656, v694); + int16x8_t v1435_tmp = vqrdmulhq_n_s16(v1434, 6226); + int16x8_t v1435 = vmlaq_n_s16(v1435_tmp, v1434, 10); + int16x8_t v1436 = vaddq_s16(v1433, v1435); + int16x8_t v1437 = vqrdmulhq_n_s16(v1436, 22622); + int16x8_t v1438 = vaddq_s16(v1432, v1437); + int16x8_t v1439 = vqrdmulhq_n_s16(v1438, 17646); + int16x8_t v1440 = vaddq_s16(v1428, v1439); + int16x8_t v1441 = vsubq_s16(v1419, v1421); + int16x8_t v1442 = vsubq_s16(v1423, v1425); + int16x8_t v1443 = vqrdmulhq_n_s16(v1442, 23761); + int16x8_t v1444 = vaddq_s16(v1441, v1443); + int16x8_t v1445 = vsubq_s16(v1429, v1431); + int16x8_t v1446 = vsubq_s16(v1433, v1435); + int16x8_t v1447 = vqrdmulhq_n_s16(v1446, 23761); + int16x8_t v1448 = vaddq_s16(v1445, v1447); + int16x8_t v1449 = vqrdmulhq_n_s16(v1448, 17826); + int16x8_t v1450 = vaddq_s16(v1444, v1449); + int16x8_t v1451 = vsubq_s16(v1397, v1399); + int16x8_t v1452 = vsubq_s16(v1401, v1403); + int16x8_t v1453 = vqrdmulhq_n_s16(v1452, 25084); + int16x8_t v1454 = vaddq_s16(v1451, v1453); + int16x8_t v1455 = vsubq_s16(v1407, v1409); + int16x8_t v1456 = vsubq_s16(v1411, v1413); + int16x8_t v1457 = vqrdmulhq_n_s16(v1456, 25084); + int16x8_t v1458 = vaddq_s16(v1455, v1457); + int16x8_t v1459 = vqrdmulhq_n_s16(v1458, 18021); + int16x8_t v1460 = vaddq_s16(v1454, v1459); + int16x8_t v1461 = vsubq_s16(v1375, v1377); + int16x8_t v1462 = vsubq_s16(v1379, v1381); + int16x8_t v1463 = vqrdmulhq_n_s16(v1462, 26631); + int16x8_t v1464 = vaddq_s16(v1461, v1463); + int16x8_t v1465 = vsubq_s16(v1385, v1387); + int16x8_t v1466 = vsubq_s16(v1389, v1391); + int16x8_t v1467 = vqrdmulhq_n_s16(v1466, 26631); + int16x8_t v1468 = vaddq_s16(v1465, v1467); + int16x8_t v1469 = vqrdmulhq_n_s16(v1468, 18231); + int16x8_t v1470 = vaddq_s16(v1464, v1469); + int16x8_t v1471 = vsubq_s16(v1353, v1355); + int16x8_t v1472 = vsubq_s16(v1357, v1359); + int16x8_t v1473 = vqrdmulhq_n_s16(v1472, 28454); + int16x8_t v1474 = vaddq_s16(v1471, v1473); + int16x8_t v1475 = vsubq_s16(v1363, v1365); + int16x8_t v1476 = vsubq_s16(v1367, v1369); + int16x8_t v1477 = vqrdmulhq_n_s16(v1476, 28454); + int16x8_t v1478 = vaddq_s16(v1475, v1477); + int16x8_t v1479 = vqrdmulhq_n_s16(v1478, 18458); + int16x8_t v1480 = vaddq_s16(v1474, v1479); + int16x8_t v1481 = vsubq_s16(v1331, v1333); + int16x8_t v1482 = vsubq_s16(v1335, v1337); + int16x8_t v1483 = vqrdmulhq_n_s16(v1482, 30624); + int16x8_t v1484 = vaddq_s16(v1481, v1483); + int16x8_t v1485 = vsubq_s16(v1341, v1343); + int16x8_t v1486 = vsubq_s16(v1345, v1347); + int16x8_t v1487 = vqrdmulhq_n_s16(v1486, 30624); + int16x8_t v1488 = vaddq_s16(v1485, v1487); + int16x8_t v1489 = vqrdmulhq_n_s16(v1488, 18702); + int16x8_t v1490 = vaddq_s16(v1484, v1489); + int16x8_t v1491 = vsubq_s16(v1309, v1311); + int16x8_t v1492 = vsubq_s16(v1313, v1315); + int16x8_t v1493_tmp = vqrdmulhq_n_s16(v1492, 472); + int16x8_t v1493 = vaddq_s16(v1493_tmp, v1492); + int16x8_t v1494 = vaddq_s16(v1491, v1493); + int16x8_t v1495 = vsubq_s16(v1319, v1321); + int16x8_t v1496 = vsubq_s16(v1323, v1325); + int16x8_t v1497_tmp = vqrdmulhq_n_s16(v1496, 472); + int16x8_t v1497 = vaddq_s16(v1497_tmp, v1496); + int16x8_t v1498 = vaddq_s16(v1495, v1497); + int16x8_t v1499 = vqrdmulhq_n_s16(v1498, 18964); + int16x8_t v1500 = vaddq_s16(v1494, v1499); + int16x8_t v1501 = vsubq_s16(v1287, v1289); + int16x8_t v1502 = vsubq_s16(v1291, v1293); + int16x8_t v1503_tmp = vqrdmulhq_n_s16(v1502, 3672); + int16x8_t v1503 = vaddq_s16(v1503_tmp, v1502); + int16x8_t v1504 = vaddq_s16(v1501, v1503); + int16x8_t v1505 = vsubq_s16(v1297, v1299); + int16x8_t v1506 = vsubq_s16(v1301, v1303); + int16x8_t v1507_tmp = vqrdmulhq_n_s16(v1506, 3672); + int16x8_t v1507 = vaddq_s16(v1507_tmp, v1506); + int16x8_t v1508 = vaddq_s16(v1505, v1507); + int16x8_t v1509 = vqrdmulhq_n_s16(v1508, 19245); + int16x8_t v1510 = vaddq_s16(v1504, v1509); + int16x8_t v1511 = vsubq_s16(v1265, v1267); + int16x8_t v1512 = vsubq_s16(v1269, v1271); + int16x8_t v1513_tmp = vqrdmulhq_n_s16(v1512, 7662); + int16x8_t v1513 = vaddq_s16(v1513_tmp, v1512); + int16x8_t v1514 = vaddq_s16(v1511, v1513); + int16x8_t v1515 = vsubq_s16(v1275, v1277); + int16x8_t v1516 = vsubq_s16(v1279, v1281); + int16x8_t v1517_tmp = vqrdmulhq_n_s16(v1516, 7662); + int16x8_t v1517 = vaddq_s16(v1517_tmp, v1516); + int16x8_t v1518 = vaddq_s16(v1515, v1517); + int16x8_t v1519 = vqrdmulhq_n_s16(v1518, 19546); + int16x8_t v1520 = vaddq_s16(v1514, v1519); + int16x8_t v1521 = vsubq_s16(v1222, v1227); + int16x8_t v1522 = vsubq_s16(v1232, v1237); + int16x8_t v1523_tmp = vqrdmulhq_n_s16(v1522, 12756); + int16x8_t v1523 = vaddq_s16(v1523_tmp, v1522); + int16x8_t v1524 = vaddq_s16(v1521, v1523); + int16x8_t v1525 = vsubq_s16(v1244, v1249); + int16x8_t v1526 = vsubq_s16(v1254, v1259); + int16x8_t v1527_tmp = vqrdmulhq_n_s16(v1526, 12756); + int16x8_t v1527 = vaddq_s16(v1527_tmp, v1526); + int16x8_t v1528 = vaddq_s16(v1525, v1527); + int16x8_t v1529 = vqrdmulhq_n_s16(v1528, 19869); + int16x8_t v1530 = vaddq_s16(v1524, v1529); + int16x8_t v1531 = vsubq_s16(v1176, v1181); + int16x8_t v1532 = vsubq_s16(v1186, v1191); + int16x8_t v1533_tmp = vqrdmulhq_n_s16(v1532, 19463); + int16x8_t v1533 = vaddq_s16(v1533_tmp, v1532); + int16x8_t v1534 = vaddq_s16(v1531, v1533); + int16x8_t v1535 = vsubq_s16(v1198, v1203); + int16x8_t v1536 = vsubq_s16(v1208, v1213); + int16x8_t v1537_tmp = vqrdmulhq_n_s16(v1536, 19463); + int16x8_t v1537 = vaddq_s16(v1537_tmp, v1536); + int16x8_t v1538 = vaddq_s16(v1535, v1537); + int16x8_t v1539 = vqrdmulhq_n_s16(v1538, 20216); + int16x8_t v1540 = vaddq_s16(v1534, v1539); + int16x8_t v1541 = vsubq_s16(v1130, v1135); + int16x8_t v1542 = vsubq_s16(v1140, v1145); + int16x8_t v1543_tmp = vqrdmulhq_n_s16(v1542, 28661); + int16x8_t v1543 = vaddq_s16(v1543_tmp, v1542); + int16x8_t v1544 = vaddq_s16(v1541, v1543); + int16x8_t v1545 = vsubq_s16(v1152, v1157); + int16x8_t v1546 = vsubq_s16(v1162, v1167); + int16x8_t v1547_tmp = vqrdmulhq_n_s16(v1546, 28661); + int16x8_t v1547 = vaddq_s16(v1547_tmp, v1546); + int16x8_t v1548 = vaddq_s16(v1545, v1547); + int16x8_t v1549 = vqrdmulhq_n_s16(v1548, 20587); + int16x8_t v1550 = vaddq_s16(v1544, v1549); + int16x8_t v1551 = vsubq_s16(v1084, v1089); + int16x8_t v1552 = vsubq_s16(v1094, v1099); + int16x8_t v1553_tmp = vqrdmulhq_n_s16(v1552, 9242); + int16x8_t v1553 = vmlaq_n_s16(v1553_tmp, v1552, 2); + int16x8_t v1554 = vaddq_s16(v1551, v1553); + int16x8_t v1555 = vsubq_s16(v1106, v1111); + int16x8_t v1556 = vsubq_s16(v1116, v1121); + int16x8_t v1557_tmp = vqrdmulhq_n_s16(v1556, 9242); + int16x8_t v1557 = vmlaq_n_s16(v1557_tmp, v1556, 2); + int16x8_t v1558 = vaddq_s16(v1555, v1557); + int16x8_t v1559 = vqrdmulhq_n_s16(v1558, 20985); + int16x8_t v1560 = vaddq_s16(v1554, v1559); + int16x8_t v1561 = vsubq_s16(v996, v1007); + int16x8_t v1562 = vsubq_s16(v1018, v1029); + int16x8_t v1563_tmp = vqrdmulhq_n_s16(v1562, 30298); + int16x8_t v1563 = vmlaq_n_s16(v1563_tmp, v1562, 2); + int16x8_t v1564 = vaddq_s16(v1561, v1563); + int16x8_t v1565 = vsubq_s16(v1042, v1053); + int16x8_t v1566 = vsubq_s16(v1064, v1075); + int16x8_t v1567_tmp = vqrdmulhq_n_s16(v1566, 30298); + int16x8_t v1567 = vmlaq_n_s16(v1567_tmp, v1566, 2); + int16x8_t v1568 = vaddq_s16(v1565, v1567); + int16x8_t v1569 = vqrdmulhq_n_s16(v1568, 21412); + int16x8_t v1570 = vaddq_s16(v1564, v1569); + int16x8_t v1571 = vsubq_s16(v901, v912); + int16x8_t v1572 = vsubq_s16(v924, v935); + int16x8_t v1573_tmp = vqrdmulhq_n_s16(v1572, 2773); + int16x8_t v1573 = vmlaq_n_s16(v1573_tmp, v1572, 4); + int16x8_t v1574 = vaddq_s16(v1571, v1573); + int16x8_t v1575 = vsubq_s16(v948, v959); + int16x8_t v1576 = vsubq_s16(v970, v981); + int16x8_t v1577_tmp = vqrdmulhq_n_s16(v1576, 2773); + int16x8_t v1577 = vmlaq_n_s16(v1577_tmp, v1576, 4); + int16x8_t v1578 = vaddq_s16(v1575, v1577); + int16x8_t v1579 = vqrdmulhq_n_s16(v1578, 21871); + int16x8_t v1580 = vaddq_s16(v1574, v1579); + int16x8_t v1581 = vsubq_s16(v723, v746); + int16x8_t v1582 = vsubq_s16(v769, v792); + int16x8_t v1583_tmp = vqrdmulhq_n_s16(v1582, 26108); + int16x8_t v1583 = vmlaq_n_s16(v1583_tmp, v1582, 6); + int16x8_t v1584 = vaddq_s16(v1581, v1583); + int16x8_t v1585 = vsubq_s16(v817, v840); + int16x8_t v1586 = vsubq_s16(v863, v886); + int16x8_t v1587_tmp = vqrdmulhq_n_s16(v1586, 26108); + int16x8_t v1587 = vmlaq_n_s16(v1587_tmp, v1586, 6); + int16x8_t v1588 = vaddq_s16(v1585, v1587); + int16x8_t v1589 = vqrdmulhq_n_s16(v1588, 22363); + int16x8_t v1590 = vaddq_s16(v1584, v1589); + int16x8_t v1591 = vsubq_s16(v61, v140); + int16x8_t v1592 = vsubq_s16(v234, v314); + int16x8_t v1593_tmp = vqrdmulhq_n_s16(v1592, 12251); + int16x8_t v1593 = vmlaq_n_s16(v1593_tmp, v1592, 20); + int16x8_t v1594 = vaddq_s16(v1591, v1593); + int16x8_t v1595 = vsubq_s16(v410, v521); + int16x8_t v1596 = vsubq_s16(v615, v696); + int16x8_t v1597_tmp = vqrdmulhq_n_s16(v1596, 12251); + int16x8_t v1597 = vmlaq_n_s16(v1597_tmp, v1596, 20); + int16x8_t v1598 = vaddq_s16(v1595, v1597); + int16x8_t v1599 = vqrdmulhq_n_s16(v1598, 22891); + int16x8_t v1600 = vaddq_s16(v1594, v1599); + int16x8_t v1601 = vsubq_s16(v1591, v1593); + int16x8_t v1602 = vsubq_s16(v1595, v1597); + int16x8_t v1603 = vqrdmulhq_n_s16(v1602, 23460); + int16x8_t v1604 = vaddq_s16(v1601, v1603); + int16x8_t v1605 = vsubq_s16(v1581, v1583); + int16x8_t v1606 = vsubq_s16(v1585, v1587); + int16x8_t v1607 = vqrdmulhq_n_s16(v1606, 24073); + int16x8_t v1608 = vaddq_s16(v1605, v1607); + int16x8_t v1609 = vsubq_s16(v1571, v1573); + int16x8_t v1610 = vsubq_s16(v1575, v1577); + int16x8_t v1611 = vqrdmulhq_n_s16(v1610, 24734); + int16x8_t v1612 = vaddq_s16(v1609, v1611); + int16x8_t v1613 = vsubq_s16(v1561, v1563); + int16x8_t v1614 = vsubq_s16(v1565, v1567); + int16x8_t v1615 = vqrdmulhq_n_s16(v1614, 25448); + int16x8_t v1616 = vaddq_s16(v1613, v1615); + int16x8_t v1617 = vsubq_s16(v1551, v1553); + int16x8_t v1618 = vsubq_s16(v1555, v1557); + int16x8_t v1619 = vqrdmulhq_n_s16(v1618, 26220); + int16x8_t v1620 = vaddq_s16(v1617, v1619); + int16x8_t v1621 = vsubq_s16(v1541, v1543); + int16x8_t v1622 = vsubq_s16(v1545, v1547); + int16x8_t v1623 = vqrdmulhq_n_s16(v1622, 27058); + int16x8_t v1624 = vaddq_s16(v1621, v1623); + int16x8_t v1625 = vsubq_s16(v1531, v1533); + int16x8_t v1626 = vsubq_s16(v1535, v1537); + int16x8_t v1627 = vqrdmulhq_n_s16(v1626, 27969); + int16x8_t v1628 = vaddq_s16(v1625, v1627); + int16x8_t v1629 = vsubq_s16(v1521, v1523); + int16x8_t v1630 = vsubq_s16(v1525, v1527); + int16x8_t v1631 = vqrdmulhq_n_s16(v1630, 28961); + int16x8_t v1632 = vaddq_s16(v1629, v1631); + int16x8_t v1633 = vsubq_s16(v1511, v1513); + int16x8_t v1634 = vsubq_s16(v1515, v1517); + int16x8_t v1635 = vqrdmulhq_n_s16(v1634, 30044); + int16x8_t v1636 = vaddq_s16(v1633, v1635); + int16x8_t v1637 = vsubq_s16(v1501, v1503); + int16x8_t v1638 = vsubq_s16(v1505, v1507); + int16x8_t v1639 = vqrdmulhq_n_s16(v1638, 31232); + int16x8_t v1640 = vaddq_s16(v1637, v1639); + int16x8_t v1641 = vsubq_s16(v1491, v1493); + int16x8_t v1642 = vsubq_s16(v1495, v1497); + int16x8_t v1643 = vqrdmulhq_n_s16(v1642, 32538); + int16x8_t v1644 = vaddq_s16(v1641, v1643); + int16x8_t v1645 = vsubq_s16(v1481, v1483); + int16x8_t v1646 = vsubq_s16(v1485, v1487); + int16x8_t v1647_tmp = vqrdmulhq_n_s16(v1646, 1211); + int16x8_t v1647 = vaddq_s16(v1647_tmp, v1646); + int16x8_t v1648 = vaddq_s16(v1645, v1647); + int16x8_t v1649 = vsubq_s16(v1471, v1473); + int16x8_t v1650 = vsubq_s16(v1475, v1477); + int16x8_t v1651_tmp = vqrdmulhq_n_s16(v1650, 2808); + int16x8_t v1651 = vaddq_s16(v1651_tmp, v1650); + int16x8_t v1652 = vaddq_s16(v1649, v1651); + int16x8_t v1653 = vsubq_s16(v1461, v1463); + int16x8_t v1654 = vsubq_s16(v1465, v1467); + int16x8_t v1655_tmp = vqrdmulhq_n_s16(v1654, 4586); + int16x8_t v1655 = vaddq_s16(v1655_tmp, v1654); + int16x8_t v1656 = vaddq_s16(v1653, v1655); + int16x8_t v1657 = vsubq_s16(v1451, v1453); + int16x8_t v1658 = vsubq_s16(v1455, v1457); + int16x8_t v1659_tmp = vqrdmulhq_n_s16(v1658, 6576); + int16x8_t v1659 = vaddq_s16(v1659_tmp, v1658); + int16x8_t v1660 = vaddq_s16(v1657, v1659); + int16x8_t v1661 = vsubq_s16(v1441, v1443); + int16x8_t v1662 = vsubq_s16(v1445, v1447); + int16x8_t v1663_tmp = vqrdmulhq_n_s16(v1662, 8817); + int16x8_t v1663 = vaddq_s16(v1663_tmp, v1662); + int16x8_t v1664 = vaddq_s16(v1661, v1663); + int16x8_t v1665 = vsubq_s16(v1422, v1427); + int16x8_t v1666 = vsubq_s16(v1432, v1437); + int16x8_t v1667_tmp = vqrdmulhq_n_s16(v1666, 11356); + int16x8_t v1667 = vaddq_s16(v1667_tmp, v1666); + int16x8_t v1668 = vaddq_s16(v1665, v1667); + int16x8_t v1669 = vsubq_s16(v1400, v1405); + int16x8_t v1670 = vsubq_s16(v1410, v1415); + int16x8_t v1671_tmp = vqrdmulhq_n_s16(v1670, 14256); + int16x8_t v1671 = vaddq_s16(v1671_tmp, v1670); + int16x8_t v1672 = vaddq_s16(v1669, v1671); + int16x8_t v1673 = vsubq_s16(v1378, v1383); + int16x8_t v1674 = vsubq_s16(v1388, v1393); + int16x8_t v1675_tmp = vqrdmulhq_n_s16(v1674, 17596); + int16x8_t v1675 = vaddq_s16(v1675_tmp, v1674); + int16x8_t v1676 = vaddq_s16(v1673, v1675); + int16x8_t v1677 = vsubq_s16(v1356, v1361); + int16x8_t v1678 = vsubq_s16(v1366, v1371); + int16x8_t v1679_tmp = vqrdmulhq_n_s16(v1678, 21483); + int16x8_t v1679 = vaddq_s16(v1679_tmp, v1678); + int16x8_t v1680 = vaddq_s16(v1677, v1679); + int16x8_t v1681 = vsubq_s16(v1334, v1339); + int16x8_t v1682 = vsubq_s16(v1344, v1349); + int16x8_t v1683_tmp = vqrdmulhq_n_s16(v1682, 26057); + int16x8_t v1683 = vaddq_s16(v1683_tmp, v1682); + int16x8_t v1684 = vaddq_s16(v1681, v1683); + int16x8_t v1685 = vsubq_s16(v1312, v1317); + int16x8_t v1686 = vsubq_s16(v1322, v1327); + int16x8_t v1687_tmp = vqrdmulhq_n_s16(v1686, 31517); + int16x8_t v1687 = vaddq_s16(v1687_tmp, v1686); + int16x8_t v1688 = vaddq_s16(v1685, v1687); + int16x8_t v1689 = vsubq_s16(v1290, v1295); + int16x8_t v1690 = vsubq_s16(v1300, v1305); + int16x8_t v1691_tmp = vqrdmulhq_n_s16(v1690, 5373); + int16x8_t v1691 = vmlaq_n_s16(v1691_tmp, v1690, 2); + int16x8_t v1692 = vaddq_s16(v1689, v1691); + int16x8_t v1693 = vsubq_s16(v1268, v1273); + int16x8_t v1694 = vsubq_s16(v1278, v1283); + int16x8_t v1695_tmp = vqrdmulhq_n_s16(v1694, 13571); + int16x8_t v1695 = vmlaq_n_s16(v1695_tmp, v1694, 2); + int16x8_t v1696 = vaddq_s16(v1693, v1695); + int16x8_t v1697 = vsubq_s16(v1228, v1239); + int16x8_t v1698 = vsubq_s16(v1250, v1261); + int16x8_t v1699_tmp = vqrdmulhq_n_s16(v1698, 23975); + int16x8_t v1699 = vmlaq_n_s16(v1699_tmp, v1698, 2); + int16x8_t v1700 = vaddq_s16(v1697, v1699); + int16x8_t v1701 = vsubq_s16(v1182, v1193); + int16x8_t v1702 = vsubq_s16(v1204, v1215); + int16x8_t v1703_tmp = vqrdmulhq_n_s16(v1702, 4832); + int16x8_t v1703 = vmlaq_n_s16(v1703_tmp, v1702, 3); + int16x8_t v1704 = vaddq_s16(v1701, v1703); + int16x8_t v1705 = vsubq_s16(v1136, v1147); + int16x8_t v1706 = vsubq_s16(v1158, v1169); + int16x8_t v1707_tmp = vqrdmulhq_n_s16(v1706, 23437); + int16x8_t v1707 = vmlaq_n_s16(v1707_tmp, v1706, 3); + int16x8_t v1708 = vaddq_s16(v1705, v1707); + int16x8_t v1709 = vsubq_s16(v1090, v1101); + int16x8_t v1710 = vsubq_s16(v1112, v1123); + int16x8_t v1711_tmp = vqrdmulhq_n_s16(v1710, 17573); + int16x8_t v1711 = vmlaq_n_s16(v1711_tmp, v1710, 4); + int16x8_t v1712 = vaddq_s16(v1709, v1711); + int16x8_t v1713 = vsubq_s16(v1008, v1031); + int16x8_t v1714 = vsubq_s16(v1054, v1077); + int16x8_t v1715_tmp = vqrdmulhq_n_s16(v1714, 27122); + int16x8_t v1715 = vmlaq_n_s16(v1715_tmp, v1714, 5); + int16x8_t v1716 = vaddq_s16(v1713, v1715); + int16x8_t v1717 = vsubq_s16(v913, v937); + int16x8_t v1718 = vsubq_s16(v960, v983); + int16x8_t v1719_tmp = vqrdmulhq_n_s16(v1718, 5041); + int16x8_t v1719 = vmlaq_n_s16(v1719_tmp, v1718, 8); + int16x8_t v1720 = vaddq_s16(v1717, v1719); + int16x8_t v1721 = vsubq_s16(v747, v794); + int16x8_t v1722 = vsubq_s16(v841, v888); + int16x8_t v1723_tmp = vqrdmulhq_n_s16(v1722, 19146); + int16x8_t v1723 = vmlaq_n_s16(v1723_tmp, v1722, 13); + int16x8_t v1724 = vaddq_s16(v1721, v1723); + int16x8_t v1725 = vsubq_s16(v141, v316); + int16x8_t v1726 = vsubq_s16(v522, v698); + int16x8_t v1727_tmp = vqrdmulhq_n_s16(v1726, 24402); + int16x8_t v1727 = vmlaq_n_s16(v1727_tmp, v1726, 40); + int16x8_t v1728 = vaddq_s16(v1725, v1727); + int16x8_t v1729 = vsubq_s16(v1725, v1727); + int16x8_t v1730 = vsubq_s16(v1721, v1723); + int16x8_t v1731 = vsubq_s16(v1717, v1719); + int16x8_t v1732 = vsubq_s16(v1713, v1715); + int16x8_t v1733 = vsubq_s16(v1709, v1711); + int16x8_t v1734 = vsubq_s16(v1705, v1707); + int16x8_t v1735 = vsubq_s16(v1701, v1703); + int16x8_t v1736 = vsubq_s16(v1697, v1699); + int16x8_t v1737 = vsubq_s16(v1693, v1695); + int16x8_t v1738 = vsubq_s16(v1689, v1691); + int16x8_t v1739 = vsubq_s16(v1685, v1687); + int16x8_t v1740 = vsubq_s16(v1681, v1683); + int16x8_t v1741 = vsubq_s16(v1677, v1679); + int16x8_t v1742 = vsubq_s16(v1673, v1675); + int16x8_t v1743 = vsubq_s16(v1669, v1671); + int16x8_t v1744 = vsubq_s16(v1665, v1667); + int16x8_t v1745 = vsubq_s16(v1661, v1663); + int16x8_t v1746 = vsubq_s16(v1657, v1659); + int16x8_t v1747 = vsubq_s16(v1653, v1655); + int16x8_t v1748 = vsubq_s16(v1649, v1651); + int16x8_t v1749 = vsubq_s16(v1645, v1647); + int16x8_t v1750 = vsubq_s16(v1641, v1643); + int16x8_t v1751 = vsubq_s16(v1637, v1639); + int16x8_t v1752 = vsubq_s16(v1633, v1635); + int16x8_t v1753 = vsubq_s16(v1629, v1631); + int16x8_t v1754 = vsubq_s16(v1625, v1627); + int16x8_t v1755 = vsubq_s16(v1621, v1623); + int16x8_t v1756 = vsubq_s16(v1617, v1619); + int16x8_t v1757 = vsubq_s16(v1613, v1615); + int16x8_t v1758 = vsubq_s16(v1609, v1611); + int16x8_t v1759 = vsubq_s16(v1605, v1607); + int16x8_t v1760 = vsubq_s16(v1601, v1603); + int16x8_t v1761 = vsubq_s16(v1594, v1599); + int16x8_t v1762 = vsubq_s16(v1584, v1589); + int16x8_t v1763 = vsubq_s16(v1574, v1579); + int16x8_t v1764 = vsubq_s16(v1564, v1569); + int16x8_t v1765 = vsubq_s16(v1554, v1559); + int16x8_t v1766 = vsubq_s16(v1544, v1549); + int16x8_t v1767 = vsubq_s16(v1534, v1539); + int16x8_t v1768 = vsubq_s16(v1524, v1529); + int16x8_t v1769 = vsubq_s16(v1514, v1519); + int16x8_t v1770 = vsubq_s16(v1504, v1509); + int16x8_t v1771 = vsubq_s16(v1494, v1499); + int16x8_t v1772 = vsubq_s16(v1484, v1489); + int16x8_t v1773 = vsubq_s16(v1474, v1479); + int16x8_t v1774 = vsubq_s16(v1464, v1469); + int16x8_t v1775 = vsubq_s16(v1454, v1459); + int16x8_t v1776 = vsubq_s16(v1444, v1449); + int16x8_t v1777 = vsubq_s16(v1428, v1439); + int16x8_t v1778 = vsubq_s16(v1406, v1417); + int16x8_t v1779 = vsubq_s16(v1384, v1395); + int16x8_t v1780 = vsubq_s16(v1362, v1373); + int16x8_t v1781 = vsubq_s16(v1340, v1351); + int16x8_t v1782 = vsubq_s16(v1318, v1329); + int16x8_t v1783 = vsubq_s16(v1296, v1307); + int16x8_t v1784 = vsubq_s16(v1274, v1285); + int16x8_t v1785 = vsubq_s16(v1240, v1263); + int16x8_t v1786 = vsubq_s16(v1194, v1217); + int16x8_t v1787 = vsubq_s16(v1148, v1171); + int16x8_t v1788 = vsubq_s16(v1102, v1125); + int16x8_t v1789 = vsubq_s16(v1032, v1079); + int16x8_t v1790 = vsubq_s16(v938, v985); + int16x8_t v1791 = vsubq_s16(v795, v890); + int16x8_t v1792 = vsubq_s16(v317, v700); + vst1q_s16(out + out_stride * 0 + i, v701); + vst1q_s16(out + out_stride * 1 + i, v891); + vst1q_s16(out + out_stride * 2 + i, v986); + vst1q_s16(out + out_stride * 3 + i, v1080); + vst1q_s16(out + out_stride * 4 + i, v1126); + vst1q_s16(out + out_stride * 5 + i, v1172); + vst1q_s16(out + out_stride * 6 + i, v1218); + vst1q_s16(out + out_stride * 7 + i, v1264); + vst1q_s16(out + out_stride * 8 + i, v1286); + vst1q_s16(out + out_stride * 9 + i, v1308); + vst1q_s16(out + out_stride * 10 + i, v1330); + vst1q_s16(out + out_stride * 11 + i, v1352); + vst1q_s16(out + out_stride * 12 + i, v1374); + vst1q_s16(out + out_stride * 13 + i, v1396); + vst1q_s16(out + out_stride * 14 + i, v1418); + vst1q_s16(out + out_stride * 15 + i, v1440); + vst1q_s16(out + out_stride * 16 + i, v1450); + vst1q_s16(out + out_stride * 17 + i, v1460); + vst1q_s16(out + out_stride * 18 + i, v1470); + vst1q_s16(out + out_stride * 19 + i, v1480); + vst1q_s16(out + out_stride * 20 + i, v1490); + vst1q_s16(out + out_stride * 21 + i, v1500); + vst1q_s16(out + out_stride * 22 + i, v1510); + vst1q_s16(out + out_stride * 23 + i, v1520); + vst1q_s16(out + out_stride * 24 + i, v1530); + vst1q_s16(out + out_stride * 25 + i, v1540); + vst1q_s16(out + out_stride * 26 + i, v1550); + vst1q_s16(out + out_stride * 27 + i, v1560); + vst1q_s16(out + out_stride * 28 + i, v1570); + vst1q_s16(out + out_stride * 29 + i, v1580); + vst1q_s16(out + out_stride * 30 + i, v1590); + vst1q_s16(out + out_stride * 31 + i, v1600); + vst1q_s16(out + out_stride * 32 + i, v1604); + vst1q_s16(out + out_stride * 33 + i, v1608); + vst1q_s16(out + out_stride * 34 + i, v1612); + vst1q_s16(out + out_stride * 35 + i, v1616); + vst1q_s16(out + out_stride * 36 + i, v1620); + vst1q_s16(out + out_stride * 37 + i, v1624); + vst1q_s16(out + out_stride * 38 + i, v1628); + vst1q_s16(out + out_stride * 39 + i, v1632); + vst1q_s16(out + out_stride * 40 + i, v1636); + vst1q_s16(out + out_stride * 41 + i, v1640); + vst1q_s16(out + out_stride * 42 + i, v1644); + vst1q_s16(out + out_stride * 43 + i, v1648); + vst1q_s16(out + out_stride * 44 + i, v1652); + vst1q_s16(out + out_stride * 45 + i, v1656); + vst1q_s16(out + out_stride * 46 + i, v1660); + vst1q_s16(out + out_stride * 47 + i, v1664); + vst1q_s16(out + out_stride * 48 + i, v1668); + vst1q_s16(out + out_stride * 49 + i, v1672); + vst1q_s16(out + out_stride * 50 + i, v1676); + vst1q_s16(out + out_stride * 51 + i, v1680); + vst1q_s16(out + out_stride * 52 + i, v1684); + vst1q_s16(out + out_stride * 53 + i, v1688); + vst1q_s16(out + out_stride * 54 + i, v1692); + vst1q_s16(out + out_stride * 55 + i, v1696); + vst1q_s16(out + out_stride * 56 + i, v1700); + vst1q_s16(out + out_stride * 57 + i, v1704); + vst1q_s16(out + out_stride * 58 + i, v1708); + vst1q_s16(out + out_stride * 59 + i, v1712); + vst1q_s16(out + out_stride * 60 + i, v1716); + vst1q_s16(out + out_stride * 61 + i, v1720); + vst1q_s16(out + out_stride * 62 + i, v1724); + vst1q_s16(out + out_stride * 63 + i, v1728); + vst1q_s16(out + out_stride * 64 + i, v1729); + vst1q_s16(out + out_stride * 65 + i, v1730); + vst1q_s16(out + out_stride * 66 + i, v1731); + vst1q_s16(out + out_stride * 67 + i, v1732); + vst1q_s16(out + out_stride * 68 + i, v1733); + vst1q_s16(out + out_stride * 69 + i, v1734); + vst1q_s16(out + out_stride * 70 + i, v1735); + vst1q_s16(out + out_stride * 71 + i, v1736); + vst1q_s16(out + out_stride * 72 + i, v1737); + vst1q_s16(out + out_stride * 73 + i, v1738); + vst1q_s16(out + out_stride * 74 + i, v1739); + vst1q_s16(out + out_stride * 75 + i, v1740); + vst1q_s16(out + out_stride * 76 + i, v1741); + vst1q_s16(out + out_stride * 77 + i, v1742); + vst1q_s16(out + out_stride * 78 + i, v1743); + vst1q_s16(out + out_stride * 79 + i, v1744); + vst1q_s16(out + out_stride * 80 + i, v1745); + vst1q_s16(out + out_stride * 81 + i, v1746); + vst1q_s16(out + out_stride * 82 + i, v1747); + vst1q_s16(out + out_stride * 83 + i, v1748); + vst1q_s16(out + out_stride * 84 + i, v1749); + vst1q_s16(out + out_stride * 85 + i, v1750); + vst1q_s16(out + out_stride * 86 + i, v1751); + vst1q_s16(out + out_stride * 87 + i, v1752); + vst1q_s16(out + out_stride * 88 + i, v1753); + vst1q_s16(out + out_stride * 89 + i, v1754); + vst1q_s16(out + out_stride * 90 + i, v1755); + vst1q_s16(out + out_stride * 91 + i, v1756); + vst1q_s16(out + out_stride * 92 + i, v1757); + vst1q_s16(out + out_stride * 93 + i, v1758); + vst1q_s16(out + out_stride * 94 + i, v1759); + vst1q_s16(out + out_stride * 95 + i, v1760); + vst1q_s16(out + out_stride * 96 + i, v1761); + vst1q_s16(out + out_stride * 97 + i, v1762); + vst1q_s16(out + out_stride * 98 + i, v1763); + vst1q_s16(out + out_stride * 99 + i, v1764); + vst1q_s16(out + out_stride * 100 + i, v1765); + vst1q_s16(out + out_stride * 101 + i, v1766); + vst1q_s16(out + out_stride * 102 + i, v1767); + vst1q_s16(out + out_stride * 103 + i, v1768); + vst1q_s16(out + out_stride * 104 + i, v1769); + vst1q_s16(out + out_stride * 105 + i, v1770); + vst1q_s16(out + out_stride * 106 + i, v1771); + vst1q_s16(out + out_stride * 107 + i, v1772); + vst1q_s16(out + out_stride * 108 + i, v1773); + vst1q_s16(out + out_stride * 109 + i, v1774); + vst1q_s16(out + out_stride * 110 + i, v1775); + vst1q_s16(out + out_stride * 111 + i, v1776); + vst1q_s16(out + out_stride * 112 + i, v1777); + vst1q_s16(out + out_stride * 113 + i, v1778); + vst1q_s16(out + out_stride * 114 + i, v1779); + vst1q_s16(out + out_stride * 115 + i, v1780); + vst1q_s16(out + out_stride * 116 + i, v1781); + vst1q_s16(out + out_stride * 117 + i, v1782); + vst1q_s16(out + out_stride * 118 + i, v1783); + vst1q_s16(out + out_stride * 119 + i, v1784); + vst1q_s16(out + out_stride * 120 + i, v1785); + vst1q_s16(out + out_stride * 121 + i, v1786); + vst1q_s16(out + out_stride * 122 + i, v1787); + vst1q_s16(out + out_stride * 123 + i, v1788); + vst1q_s16(out + out_stride * 124 + i, v1789); + vst1q_s16(out + out_stride * 125 + i, v1790); + vst1q_s16(out + out_stride * 126 + i, v1791); + vst1q_s16(out + out_stride * 127 + i, v1792); + } +} |