// Copyright (c) the JPEG XL Project Authors. All rights reserved. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. /* This file is automatically generated. Do not modify it directly. */ #if HWY_TARGET != HWY_NEON #error "only include this file from fast_dct-inl.h" #endif constexpr size_t FastIDCTIntegerBits(FastDCTTag<128>) { return 2; } void FastIDCT(FastDCTTag<128>, const int16_t* in, size_t in_stride, int16_t* out, size_t out_stride, size_t count) { JXL_ASSERT(count % 8 == 0); for (size_t i = 0; i < count; i += 8) { int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i); int16x8_t v1 = vld1q_s16(in + in_stride * 64 + i); int16x8_t v2 = vaddq_s16(v0, v1); int16x8_t v3 = vld1q_s16(in + in_stride * 32 + i); int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573); int16x8_t v4 = vaddq_s16(v4_tmp, v3); int16x8_t v5 = vld1q_s16(in + in_stride * 96 + i); int16x8_t v6 = vaddq_s16(v5, v3); int16x8_t v7 = vaddq_s16(v4, v6); int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734); int16x8_t v9 = vaddq_s16(v2, v8); int16x8_t v10 = vld1q_s16(in + in_stride * 16 + i); int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573); int16x8_t v11 = vaddq_s16(v11_tmp, v10); int16x8_t v12 = vld1q_s16(in + in_stride * 80 + i); int16x8_t v13 = vld1q_s16(in + in_stride * 48 + i); int16x8_t v14 = vaddq_s16(v12, v13); int16x8_t v15 = vaddq_s16(v11, v14); int16x8_t v16 = vaddq_s16(v13, v10); int16x8_t v17_tmp = vqrdmulhq_n_s16(v16, 13573); int16x8_t v17 = vaddq_s16(v17_tmp, v16); int16x8_t v18 = vld1q_s16(in + in_stride * 112 + i); int16x8_t v19 = vaddq_s16(v18, v12); int16x8_t v20 = vaddq_s16(v19, v16); int16x8_t v21 = vaddq_s16(v17, v20); int16x8_t v22 = vqrdmulhq_n_s16(v21, 17734); int16x8_t v23 = vaddq_s16(v15, v22); int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705); int16x8_t v25 = vaddq_s16(v9, v24); int16x8_t v26 = vld1q_s16(in + in_stride * 8 + i); int16x8_t v27_tmp = vqrdmulhq_n_s16(v26, 13573); int16x8_t v27 = vaddq_s16(v27_tmp, v26); int16x8_t v28 = vld1q_s16(in + in_stride * 72 + i); int16x8_t v29 = vld1q_s16(in + in_stride * 56 + i); int16x8_t v30 = vaddq_s16(v28, v29); int16x8_t v31 = vaddq_s16(v27, v30); int16x8_t v32 = vld1q_s16(in + in_stride * 40 + i); int16x8_t v33 = vld1q_s16(in + in_stride * 24 + i); int16x8_t v34 = vaddq_s16(v32, v33); int16x8_t v35_tmp = vqrdmulhq_n_s16(v34, 13573); int16x8_t v35 = vaddq_s16(v35_tmp, v34); int16x8_t v36 = vld1q_s16(in + in_stride * 104 + i); int16x8_t v37 = vld1q_s16(in + in_stride * 88 + i); int16x8_t v38 = vaddq_s16(v36, v37); int16x8_t v39 = vaddq_s16(v38, v34); int16x8_t v40 = vaddq_s16(v35, v39); int16x8_t v41 = vqrdmulhq_n_s16(v40, 17734); int16x8_t v42 = vaddq_s16(v31, v41); int16x8_t v43 = vaddq_s16(v33, v26); int16x8_t v44_tmp = vqrdmulhq_n_s16(v43, 13573); int16x8_t v44 = vaddq_s16(v44_tmp, v43); int16x8_t v45 = vaddq_s16(v37, v28); int16x8_t v46 = vaddq_s16(v29, v32); int16x8_t v47 = vaddq_s16(v45, v46); int16x8_t v48 = vaddq_s16(v44, v47); int16x8_t v49 = vaddq_s16(v46, v43); int16x8_t v50_tmp = vqrdmulhq_n_s16(v49, 13573); int16x8_t v50 = vaddq_s16(v50_tmp, v49); int16x8_t v51 = vld1q_s16(in + in_stride * 120 + i); int16x8_t v52 = vaddq_s16(v51, v36); int16x8_t v53 = vaddq_s16(v52, v45); int16x8_t v54 = vaddq_s16(v53, v49); int16x8_t v55 = vaddq_s16(v50, v54); int16x8_t v56 = vqrdmulhq_n_s16(v55, 17734); int16x8_t v57 = vaddq_s16(v48, v56); int16x8_t v58 = vqrdmulhq_n_s16(v57, 16705); int16x8_t v59 = vaddq_s16(v42, v58); int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463); int16x8_t v61 = vaddq_s16(v25, v60); int16x8_t v62 = vld1q_s16(in + in_stride * 4 + i); int16x8_t v63_tmp = vqrdmulhq_n_s16(v62, 13573); int16x8_t v63 = vaddq_s16(v63_tmp, v62); int16x8_t v64 = vld1q_s16(in + in_stride * 68 + i); int16x8_t v65 = vld1q_s16(in + in_stride * 60 + i); int16x8_t v66 = vaddq_s16(v64, v65); int16x8_t v67 = vaddq_s16(v63, v66); int16x8_t v68 = vld1q_s16(in + in_stride * 36 + i); int16x8_t v69 = vld1q_s16(in + in_stride * 28 + i); int16x8_t v70 = vaddq_s16(v68, v69); int16x8_t v71_tmp = vqrdmulhq_n_s16(v70, 13573); int16x8_t v71 = vaddq_s16(v71_tmp, v70); int16x8_t v72 = vld1q_s16(in + in_stride * 100 + i); int16x8_t v73 = vld1q_s16(in + in_stride * 92 + i); int16x8_t v74 = vaddq_s16(v72, v73); int16x8_t v75 = vaddq_s16(v74, v70); int16x8_t v76 = vaddq_s16(v71, v75); int16x8_t v77 = vqrdmulhq_n_s16(v76, 17734); int16x8_t v78 = vaddq_s16(v67, v77); int16x8_t v79 = vld1q_s16(in + in_stride * 20 + i); int16x8_t v80 = vld1q_s16(in + in_stride * 12 + i); int16x8_t v81 = vaddq_s16(v79, v80); int16x8_t v82_tmp = vqrdmulhq_n_s16(v81, 13573); int16x8_t v82 = vaddq_s16(v82_tmp, v81); int16x8_t v83 = vld1q_s16(in + in_stride * 84 + i); int16x8_t v84 = vld1q_s16(in + in_stride * 76 + i); int16x8_t v85 = vaddq_s16(v83, v84); int16x8_t v86 = vld1q_s16(in + in_stride * 52 + i); int16x8_t v87 = vld1q_s16(in + in_stride * 44 + i); int16x8_t v88 = vaddq_s16(v86, v87); int16x8_t v89 = vaddq_s16(v85, v88); int16x8_t v90 = vaddq_s16(v82, v89); int16x8_t v91 = vaddq_s16(v88, v81); int16x8_t v92_tmp = vqrdmulhq_n_s16(v91, 13573); int16x8_t v92 = vaddq_s16(v92_tmp, v91); int16x8_t v93 = vld1q_s16(in + in_stride * 116 + i); int16x8_t v94 = vld1q_s16(in + in_stride * 108 + i); int16x8_t v95 = vaddq_s16(v93, v94); int16x8_t v96 = vaddq_s16(v95, v85); int16x8_t v97 = vaddq_s16(v96, v91); int16x8_t v98 = vaddq_s16(v92, v97); int16x8_t v99 = vqrdmulhq_n_s16(v98, 17734); int16x8_t v100 = vaddq_s16(v90, v99); int16x8_t v101 = vqrdmulhq_n_s16(v100, 16705); int16x8_t v102 = vaddq_s16(v78, v101); int16x8_t v103 = vaddq_s16(v80, v62); int16x8_t v104_tmp = vqrdmulhq_n_s16(v103, 13573); int16x8_t v104 = vaddq_s16(v104_tmp, v103); int16x8_t v105 = vaddq_s16(v84, v64); int16x8_t v106 = vaddq_s16(v65, v86); int16x8_t v107 = vaddq_s16(v105, v106); int16x8_t v108 = vaddq_s16(v104, v107); int16x8_t v109 = vaddq_s16(v87, v68); int16x8_t v110 = vaddq_s16(v69, v79); int16x8_t v111 = vaddq_s16(v109, v110); int16x8_t v112_tmp = vqrdmulhq_n_s16(v111, 13573); int16x8_t v112 = vaddq_s16(v112_tmp, v111); int16x8_t v113 = vaddq_s16(v94, v72); int16x8_t v114 = vaddq_s16(v73, v83); int16x8_t v115 = vaddq_s16(v113, v114); int16x8_t v116 = vaddq_s16(v115, v111); int16x8_t v117 = vaddq_s16(v112, v116); int16x8_t v118 = vqrdmulhq_n_s16(v117, 17734); int16x8_t v119 = vaddq_s16(v108, v118); int16x8_t v120 = vaddq_s16(v110, v103); int16x8_t v121_tmp = vqrdmulhq_n_s16(v120, 13573); int16x8_t v121 = vaddq_s16(v121_tmp, v120); int16x8_t v122 = vaddq_s16(v114, v105); int16x8_t v123 = vaddq_s16(v106, v109); int16x8_t v124 = vaddq_s16(v122, v123); int16x8_t v125 = vaddq_s16(v121, v124); int16x8_t v126 = vaddq_s16(v123, v120); int16x8_t v127_tmp = vqrdmulhq_n_s16(v126, 13573); int16x8_t v127 = vaddq_s16(v127_tmp, v126); int16x8_t v128 = vld1q_s16(in + in_stride * 124 + i); int16x8_t v129 = vaddq_s16(v128, v93); int16x8_t v130 = vaddq_s16(v129, v113); int16x8_t v131 = vaddq_s16(v130, v122); int16x8_t v132 = vaddq_s16(v131, v126); int16x8_t v133 = vaddq_s16(v127, v132); int16x8_t v134 = vqrdmulhq_n_s16(v133, 17734); int16x8_t v135 = vaddq_s16(v125, v134); int16x8_t v136 = vqrdmulhq_n_s16(v135, 16705); int16x8_t v137 = vaddq_s16(v119, v136); int16x8_t v138 = vqrdmulhq_n_s16(v137, 16463); int16x8_t v139 = vaddq_s16(v102, v138); int16x8_t v140 = vqrdmulhq_n_s16(v139, 16404); int16x8_t v141 = vaddq_s16(v61, v140); int16x8_t v142 = vld1q_s16(in + in_stride * 2 + i); int16x8_t v143_tmp = vqrdmulhq_n_s16(v142, 13573); int16x8_t v143 = vaddq_s16(v143_tmp, v142); int16x8_t v144 = vld1q_s16(in + in_stride * 66 + i); int16x8_t v145 = vld1q_s16(in + in_stride * 62 + i); int16x8_t v146 = vaddq_s16(v144, v145); int16x8_t v147 = vaddq_s16(v143, v146); int16x8_t v148 = vld1q_s16(in + in_stride * 34 + i); int16x8_t v149 = vld1q_s16(in + in_stride * 30 + i); int16x8_t v150 = vaddq_s16(v148, v149); int16x8_t v151_tmp = vqrdmulhq_n_s16(v150, 13573); int16x8_t v151 = vaddq_s16(v151_tmp, v150); int16x8_t v152 = vld1q_s16(in + in_stride * 98 + i); int16x8_t v153 = vld1q_s16(in + in_stride * 94 + i); int16x8_t v154 = vaddq_s16(v152, v153); int16x8_t v155 = vaddq_s16(v154, v150); int16x8_t v156 = vaddq_s16(v151, v155); int16x8_t v157 = vqrdmulhq_n_s16(v156, 17734); int16x8_t v158 = vaddq_s16(v147, v157); int16x8_t v159 = vld1q_s16(in + in_stride * 18 + i); int16x8_t v160 = vld1q_s16(in + in_stride * 14 + i); int16x8_t v161 = vaddq_s16(v159, v160); int16x8_t v162_tmp = vqrdmulhq_n_s16(v161, 13573); int16x8_t v162 = vaddq_s16(v162_tmp, v161); int16x8_t v163 = vld1q_s16(in + in_stride * 82 + i); int16x8_t v164 = vld1q_s16(in + in_stride * 78 + i); int16x8_t v165 = vaddq_s16(v163, v164); int16x8_t v166 = vld1q_s16(in + in_stride * 50 + i); int16x8_t v167 = vld1q_s16(in + in_stride * 46 + i); int16x8_t v168 = vaddq_s16(v166, v167); int16x8_t v169 = vaddq_s16(v165, v168); int16x8_t v170 = vaddq_s16(v162, v169); int16x8_t v171 = vaddq_s16(v168, v161); int16x8_t v172_tmp = vqrdmulhq_n_s16(v171, 13573); int16x8_t v172 = vaddq_s16(v172_tmp, v171); int16x8_t v173 = vld1q_s16(in + in_stride * 114 + i); int16x8_t v174 = vld1q_s16(in + in_stride * 110 + i); int16x8_t v175 = vaddq_s16(v173, v174); int16x8_t v176 = vaddq_s16(v175, v165); int16x8_t v177 = vaddq_s16(v176, v171); int16x8_t v178 = vaddq_s16(v172, v177); int16x8_t v179 = vqrdmulhq_n_s16(v178, 17734); int16x8_t v180 = vaddq_s16(v170, v179); int16x8_t v181 = vqrdmulhq_n_s16(v180, 16705); int16x8_t v182 = vaddq_s16(v158, v181); int16x8_t v183 = vld1q_s16(in + in_stride * 10 + i); int16x8_t v184 = vld1q_s16(in + in_stride * 6 + i); int16x8_t v185 = vaddq_s16(v183, v184); int16x8_t v186_tmp = vqrdmulhq_n_s16(v185, 13573); int16x8_t v186 = vaddq_s16(v186_tmp, v185); int16x8_t v187 = vld1q_s16(in + in_stride * 74 + i); int16x8_t v188 = vld1q_s16(in + in_stride * 70 + i); int16x8_t v189 = vaddq_s16(v187, v188); int16x8_t v190 = vld1q_s16(in + in_stride * 58 + i); int16x8_t v191 = vld1q_s16(in + in_stride * 54 + i); int16x8_t v192 = vaddq_s16(v190, v191); int16x8_t v193 = vaddq_s16(v189, v192); int16x8_t v194 = vaddq_s16(v186, v193); int16x8_t v195 = vld1q_s16(in + in_stride * 42 + i); int16x8_t v196 = vld1q_s16(in + in_stride * 38 + i); int16x8_t v197 = vaddq_s16(v195, v196); int16x8_t v198 = vld1q_s16(in + in_stride * 26 + i); int16x8_t v199 = vld1q_s16(in + in_stride * 22 + i); int16x8_t v200 = vaddq_s16(v198, v199); int16x8_t v201 = vaddq_s16(v197, v200); int16x8_t v202_tmp = vqrdmulhq_n_s16(v201, 13573); int16x8_t v202 = vaddq_s16(v202_tmp, v201); int16x8_t v203 = vld1q_s16(in + in_stride * 106 + i); int16x8_t v204 = vld1q_s16(in + in_stride * 102 + i); int16x8_t v205 = vaddq_s16(v203, v204); int16x8_t v206 = vld1q_s16(in + in_stride * 90 + i); int16x8_t v207 = vld1q_s16(in + in_stride * 86 + i); int16x8_t v208 = vaddq_s16(v206, v207); int16x8_t v209 = vaddq_s16(v205, v208); int16x8_t v210 = vaddq_s16(v209, v201); int16x8_t v211 = vaddq_s16(v202, v210); int16x8_t v212 = vqrdmulhq_n_s16(v211, 17734); int16x8_t v213 = vaddq_s16(v194, v212); int16x8_t v214 = vaddq_s16(v200, v185); int16x8_t v215_tmp = vqrdmulhq_n_s16(v214, 13573); int16x8_t v215 = vaddq_s16(v215_tmp, v214); int16x8_t v216 = vaddq_s16(v208, v189); int16x8_t v217 = vaddq_s16(v192, v197); int16x8_t v218 = vaddq_s16(v216, v217); int16x8_t v219 = vaddq_s16(v215, v218); int16x8_t v220 = vaddq_s16(v217, v214); int16x8_t v221_tmp = vqrdmulhq_n_s16(v220, 13573); int16x8_t v221 = vaddq_s16(v221_tmp, v220); int16x8_t v222 = vld1q_s16(in + in_stride * 122 + i); int16x8_t v223 = vld1q_s16(in + in_stride * 118 + i); int16x8_t v224 = vaddq_s16(v222, v223); int16x8_t v225 = vaddq_s16(v224, v205); int16x8_t v226 = vaddq_s16(v225, v216); int16x8_t v227 = vaddq_s16(v226, v220); int16x8_t v228 = vaddq_s16(v221, v227); int16x8_t v229 = vqrdmulhq_n_s16(v228, 17734); int16x8_t v230 = vaddq_s16(v219, v229); int16x8_t v231 = vqrdmulhq_n_s16(v230, 16705); int16x8_t v232 = vaddq_s16(v213, v231); int16x8_t v233 = vqrdmulhq_n_s16(v232, 16463); int16x8_t v234 = vaddq_s16(v182, v233); int16x8_t v235 = vaddq_s16(v184, v142); int16x8_t v236_tmp = vqrdmulhq_n_s16(v235, 13573); int16x8_t v236 = vaddq_s16(v236_tmp, v235); int16x8_t v237 = vaddq_s16(v188, v144); int16x8_t v238 = vaddq_s16(v145, v190); int16x8_t v239 = vaddq_s16(v237, v238); int16x8_t v240 = vaddq_s16(v236, v239); int16x8_t v241 = vaddq_s16(v196, v148); int16x8_t v242 = vaddq_s16(v149, v198); int16x8_t v243 = vaddq_s16(v241, v242); int16x8_t v244_tmp = vqrdmulhq_n_s16(v243, 13573); int16x8_t v244 = vaddq_s16(v244_tmp, v243); int16x8_t v245 = vaddq_s16(v204, v152); int16x8_t v246 = vaddq_s16(v153, v206); int16x8_t v247 = vaddq_s16(v245, v246); int16x8_t v248 = vaddq_s16(v247, v243); int16x8_t v249 = vaddq_s16(v244, v248); int16x8_t v250 = vqrdmulhq_n_s16(v249, 17734); int16x8_t v251 = vaddq_s16(v240, v250); int16x8_t v252 = vaddq_s16(v199, v159); int16x8_t v253 = vaddq_s16(v160, v183); int16x8_t v254 = vaddq_s16(v252, v253); int16x8_t v255_tmp = vqrdmulhq_n_s16(v254, 13573); int16x8_t v255 = vaddq_s16(v255_tmp, v254); int16x8_t v256 = vaddq_s16(v207, v163); int16x8_t v257 = vaddq_s16(v164, v187); int16x8_t v258 = vaddq_s16(v256, v257); int16x8_t v259 = vaddq_s16(v191, v166); int16x8_t v260 = vaddq_s16(v167, v195); int16x8_t v261 = vaddq_s16(v259, v260); int16x8_t v262 = vaddq_s16(v258, v261); int16x8_t v263 = vaddq_s16(v255, v262); int16x8_t v264 = vaddq_s16(v261, v254); int16x8_t v265_tmp = vqrdmulhq_n_s16(v264, 13573); int16x8_t v265 = vaddq_s16(v265_tmp, v264); int16x8_t v266 = vaddq_s16(v223, v173); int16x8_t v267 = vaddq_s16(v174, v203); int16x8_t v268 = vaddq_s16(v266, v267); int16x8_t v269 = vaddq_s16(v268, v258); int16x8_t v270 = vaddq_s16(v269, v264); int16x8_t v271 = vaddq_s16(v265, v270); int16x8_t v272 = vqrdmulhq_n_s16(v271, 17734); int16x8_t v273 = vaddq_s16(v263, v272); int16x8_t v274 = vqrdmulhq_n_s16(v273, 16705); int16x8_t v275 = vaddq_s16(v251, v274); int16x8_t v276 = vaddq_s16(v253, v235); int16x8_t v277_tmp = vqrdmulhq_n_s16(v276, 13573); int16x8_t v277 = vaddq_s16(v277_tmp, v276); int16x8_t v278 = vaddq_s16(v257, v237); int16x8_t v279 = vaddq_s16(v238, v259); int16x8_t v280 = vaddq_s16(v278, v279); int16x8_t v281 = vaddq_s16(v277, v280); int16x8_t v282 = vaddq_s16(v260, v241); int16x8_t v283 = vaddq_s16(v242, v252); int16x8_t v284 = vaddq_s16(v282, v283); int16x8_t v285_tmp = vqrdmulhq_n_s16(v284, 13573); int16x8_t v285 = vaddq_s16(v285_tmp, v284); int16x8_t v286 = vaddq_s16(v267, v245); int16x8_t v287 = vaddq_s16(v246, v256); int16x8_t v288 = vaddq_s16(v286, v287); int16x8_t v289 = vaddq_s16(v288, v284); int16x8_t v290 = vaddq_s16(v285, v289); int16x8_t v291 = vqrdmulhq_n_s16(v290, 17734); int16x8_t v292 = vaddq_s16(v281, v291); int16x8_t v293 = vaddq_s16(v283, v276); int16x8_t v294_tmp = vqrdmulhq_n_s16(v293, 13573); int16x8_t v294 = vaddq_s16(v294_tmp, v293); int16x8_t v295 = vaddq_s16(v287, v278); int16x8_t v296 = vaddq_s16(v279, v282); int16x8_t v297 = vaddq_s16(v295, v296); int16x8_t v298 = vaddq_s16(v294, v297); int16x8_t v299 = vaddq_s16(v296, v293); int16x8_t v300_tmp = vqrdmulhq_n_s16(v299, 13573); int16x8_t v300 = vaddq_s16(v300_tmp, v299); int16x8_t v301 = vld1q_s16(in + in_stride * 126 + i); int16x8_t v302 = vaddq_s16(v301, v222); int16x8_t v303 = vaddq_s16(v302, v266); int16x8_t v304 = vaddq_s16(v303, v286); int16x8_t v305 = vaddq_s16(v304, v295); int16x8_t v306 = vaddq_s16(v305, v299); int16x8_t v307 = vaddq_s16(v300, v306); int16x8_t v308 = vqrdmulhq_n_s16(v307, 17734); int16x8_t v309 = vaddq_s16(v298, v308); int16x8_t v310 = vqrdmulhq_n_s16(v309, 16705); int16x8_t v311 = vaddq_s16(v292, v310); int16x8_t v312 = vqrdmulhq_n_s16(v311, 16463); int16x8_t v313 = vaddq_s16(v275, v312); int16x8_t v314 = vqrdmulhq_n_s16(v313, 16404); int16x8_t v315 = vaddq_s16(v234, v314); int16x8_t v316 = vqrdmulhq_n_s16(v315, 16389); int16x8_t v317 = vaddq_s16(v141, v316); int16x8_t v318 = vld1q_s16(in + in_stride * 1 + i); int16x8_t v319_tmp = vqrdmulhq_n_s16(v318, 13573); int16x8_t v319 = vaddq_s16(v319_tmp, v318); int16x8_t v320 = vld1q_s16(in + in_stride * 65 + i); int16x8_t v321 = vld1q_s16(in + in_stride * 63 + i); int16x8_t v322 = vaddq_s16(v320, v321); int16x8_t v323 = vaddq_s16(v319, v322); int16x8_t v324 = vld1q_s16(in + in_stride * 33 + i); int16x8_t v325 = vld1q_s16(in + in_stride * 31 + i); int16x8_t v326 = vaddq_s16(v324, v325); int16x8_t v327_tmp = vqrdmulhq_n_s16(v326, 13573); int16x8_t v327 = vaddq_s16(v327_tmp, v326); int16x8_t v328 = vld1q_s16(in + in_stride * 97 + i); int16x8_t v329 = vld1q_s16(in + in_stride * 95 + i); int16x8_t v330 = vaddq_s16(v328, v329); int16x8_t v331 = vaddq_s16(v330, v326); int16x8_t v332 = vaddq_s16(v327, v331); int16x8_t v333 = vqrdmulhq_n_s16(v332, 17734); int16x8_t v334 = vaddq_s16(v323, v333); int16x8_t v335 = vld1q_s16(in + in_stride * 17 + i); int16x8_t v336 = vld1q_s16(in + in_stride * 15 + i); int16x8_t v337 = vaddq_s16(v335, v336); int16x8_t v338_tmp = vqrdmulhq_n_s16(v337, 13573); int16x8_t v338 = vaddq_s16(v338_tmp, v337); int16x8_t v339 = vld1q_s16(in + in_stride * 81 + i); int16x8_t v340 = vld1q_s16(in + in_stride * 79 + i); int16x8_t v341 = vaddq_s16(v339, v340); int16x8_t v342 = vld1q_s16(in + in_stride * 49 + i); int16x8_t v343 = vld1q_s16(in + in_stride * 47 + i); int16x8_t v344 = vaddq_s16(v342, v343); int16x8_t v345 = vaddq_s16(v341, v344); int16x8_t v346 = vaddq_s16(v338, v345); int16x8_t v347 = vaddq_s16(v344, v337); int16x8_t v348_tmp = vqrdmulhq_n_s16(v347, 13573); int16x8_t v348 = vaddq_s16(v348_tmp, v347); int16x8_t v349 = vld1q_s16(in + in_stride * 113 + i); int16x8_t v350 = vld1q_s16(in + in_stride * 111 + i); int16x8_t v351 = vaddq_s16(v349, v350); int16x8_t v352 = vaddq_s16(v351, v341); int16x8_t v353 = vaddq_s16(v352, v347); int16x8_t v354 = vaddq_s16(v348, v353); int16x8_t v355 = vqrdmulhq_n_s16(v354, 17734); int16x8_t v356 = vaddq_s16(v346, v355); int16x8_t v357 = vqrdmulhq_n_s16(v356, 16705); int16x8_t v358 = vaddq_s16(v334, v357); int16x8_t v359 = vld1q_s16(in + in_stride * 9 + i); int16x8_t v360 = vld1q_s16(in + in_stride * 7 + i); int16x8_t v361 = vaddq_s16(v359, v360); int16x8_t v362_tmp = vqrdmulhq_n_s16(v361, 13573); int16x8_t v362 = vaddq_s16(v362_tmp, v361); int16x8_t v363 = vld1q_s16(in + in_stride * 73 + i); int16x8_t v364 = vld1q_s16(in + in_stride * 71 + i); int16x8_t v365 = vaddq_s16(v363, v364); int16x8_t v366 = vld1q_s16(in + in_stride * 57 + i); int16x8_t v367 = vld1q_s16(in + in_stride * 55 + i); int16x8_t v368 = vaddq_s16(v366, v367); int16x8_t v369 = vaddq_s16(v365, v368); int16x8_t v370 = vaddq_s16(v362, v369); int16x8_t v371 = vld1q_s16(in + in_stride * 41 + i); int16x8_t v372 = vld1q_s16(in + in_stride * 39 + i); int16x8_t v373 = vaddq_s16(v371, v372); int16x8_t v374 = vld1q_s16(in + in_stride * 25 + i); int16x8_t v375 = vld1q_s16(in + in_stride * 23 + i); int16x8_t v376 = vaddq_s16(v374, v375); int16x8_t v377 = vaddq_s16(v373, v376); int16x8_t v378_tmp = vqrdmulhq_n_s16(v377, 13573); int16x8_t v378 = vaddq_s16(v378_tmp, v377); int16x8_t v379 = vld1q_s16(in + in_stride * 105 + i); int16x8_t v380 = vld1q_s16(in + in_stride * 103 + i); int16x8_t v381 = vaddq_s16(v379, v380); int16x8_t v382 = vld1q_s16(in + in_stride * 89 + i); int16x8_t v383 = vld1q_s16(in + in_stride * 87 + i); int16x8_t v384 = vaddq_s16(v382, v383); int16x8_t v385 = vaddq_s16(v381, v384); int16x8_t v386 = vaddq_s16(v385, v377); int16x8_t v387 = vaddq_s16(v378, v386); int16x8_t v388 = vqrdmulhq_n_s16(v387, 17734); int16x8_t v389 = vaddq_s16(v370, v388); int16x8_t v390 = vaddq_s16(v376, v361); int16x8_t v391_tmp = vqrdmulhq_n_s16(v390, 13573); int16x8_t v391 = vaddq_s16(v391_tmp, v390); int16x8_t v392 = vaddq_s16(v384, v365); int16x8_t v393 = vaddq_s16(v368, v373); int16x8_t v394 = vaddq_s16(v392, v393); int16x8_t v395 = vaddq_s16(v391, v394); int16x8_t v396 = vaddq_s16(v393, v390); int16x8_t v397_tmp = vqrdmulhq_n_s16(v396, 13573); int16x8_t v397 = vaddq_s16(v397_tmp, v396); int16x8_t v398 = vld1q_s16(in + in_stride * 121 + i); int16x8_t v399 = vld1q_s16(in + in_stride * 119 + i); int16x8_t v400 = vaddq_s16(v398, v399); int16x8_t v401 = vaddq_s16(v400, v381); int16x8_t v402 = vaddq_s16(v401, v392); int16x8_t v403 = vaddq_s16(v402, v396); int16x8_t v404 = vaddq_s16(v397, v403); int16x8_t v405 = vqrdmulhq_n_s16(v404, 17734); int16x8_t v406 = vaddq_s16(v395, v405); int16x8_t v407 = vqrdmulhq_n_s16(v406, 16705); int16x8_t v408 = vaddq_s16(v389, v407); int16x8_t v409 = vqrdmulhq_n_s16(v408, 16463); int16x8_t v410 = vaddq_s16(v358, v409); int16x8_t v411 = vld1q_s16(in + in_stride * 5 + i); int16x8_t v412 = vld1q_s16(in + in_stride * 3 + i); int16x8_t v413 = vaddq_s16(v411, v412); int16x8_t v414_tmp = vqrdmulhq_n_s16(v413, 13573); int16x8_t v414 = vaddq_s16(v414_tmp, v413); int16x8_t v415 = vld1q_s16(in + in_stride * 69 + i); int16x8_t v416 = vld1q_s16(in + in_stride * 67 + i); int16x8_t v417 = vaddq_s16(v415, v416); int16x8_t v418 = vld1q_s16(in + in_stride * 61 + i); int16x8_t v419 = vld1q_s16(in + in_stride * 59 + i); int16x8_t v420 = vaddq_s16(v418, v419); int16x8_t v421 = vaddq_s16(v417, v420); int16x8_t v422 = vaddq_s16(v414, v421); int16x8_t v423 = vld1q_s16(in + in_stride * 37 + i); int16x8_t v424 = vld1q_s16(in + in_stride * 35 + i); int16x8_t v425 = vaddq_s16(v423, v424); int16x8_t v426 = vld1q_s16(in + in_stride * 29 + i); int16x8_t v427 = vld1q_s16(in + in_stride * 27 + i); int16x8_t v428 = vaddq_s16(v426, v427); int16x8_t v429 = vaddq_s16(v425, v428); int16x8_t v430_tmp = vqrdmulhq_n_s16(v429, 13573); int16x8_t v430 = vaddq_s16(v430_tmp, v429); int16x8_t v431 = vld1q_s16(in + in_stride * 101 + i); int16x8_t v432 = vld1q_s16(in + in_stride * 99 + i); int16x8_t v433 = vaddq_s16(v431, v432); int16x8_t v434 = vld1q_s16(in + in_stride * 93 + i); int16x8_t v435 = vld1q_s16(in + in_stride * 91 + i); int16x8_t v436 = vaddq_s16(v434, v435); int16x8_t v437 = vaddq_s16(v433, v436); int16x8_t v438 = vaddq_s16(v437, v429); int16x8_t v439 = vaddq_s16(v430, v438); int16x8_t v440 = vqrdmulhq_n_s16(v439, 17734); int16x8_t v441 = vaddq_s16(v422, v440); int16x8_t v442 = vld1q_s16(in + in_stride * 21 + i); int16x8_t v443 = vld1q_s16(in + in_stride * 19 + i); int16x8_t v444 = vaddq_s16(v442, v443); int16x8_t v445 = vld1q_s16(in + in_stride * 13 + i); int16x8_t v446 = vld1q_s16(in + in_stride * 11 + i); int16x8_t v447 = vaddq_s16(v445, v446); int16x8_t v448 = vaddq_s16(v444, v447); int16x8_t v449_tmp = vqrdmulhq_n_s16(v448, 13573); int16x8_t v449 = vaddq_s16(v449_tmp, v448); int16x8_t v450 = vld1q_s16(in + in_stride * 85 + i); int16x8_t v451 = vld1q_s16(in + in_stride * 83 + i); int16x8_t v452 = vaddq_s16(v450, v451); int16x8_t v453 = vld1q_s16(in + in_stride * 77 + i); int16x8_t v454 = vld1q_s16(in + in_stride * 75 + i); int16x8_t v455 = vaddq_s16(v453, v454); int16x8_t v456 = vaddq_s16(v452, v455); int16x8_t v457 = vld1q_s16(in + in_stride * 53 + i); int16x8_t v458 = vld1q_s16(in + in_stride * 51 + i); int16x8_t v459 = vaddq_s16(v457, v458); int16x8_t v460 = vld1q_s16(in + in_stride * 45 + i); int16x8_t v461 = vld1q_s16(in + in_stride * 43 + i); int16x8_t v462 = vaddq_s16(v460, v461); int16x8_t v463 = vaddq_s16(v459, v462); int16x8_t v464 = vaddq_s16(v456, v463); int16x8_t v465 = vaddq_s16(v449, v464); int16x8_t v466 = vaddq_s16(v463, v448); int16x8_t v467_tmp = vqrdmulhq_n_s16(v466, 13573); int16x8_t v467 = vaddq_s16(v467_tmp, v466); int16x8_t v468 = vld1q_s16(in + in_stride * 117 + i); int16x8_t v469 = vld1q_s16(in + in_stride * 115 + i); int16x8_t v470 = vaddq_s16(v468, v469); int16x8_t v471 = vld1q_s16(in + in_stride * 109 + i); int16x8_t v472 = vld1q_s16(in + in_stride * 107 + i); int16x8_t v473 = vaddq_s16(v471, v472); int16x8_t v474 = vaddq_s16(v470, v473); int16x8_t v475 = vaddq_s16(v474, v456); int16x8_t v476 = vaddq_s16(v475, v466); int16x8_t v477 = vaddq_s16(v467, v476); int16x8_t v478 = vqrdmulhq_n_s16(v477, 17734); int16x8_t v479 = vaddq_s16(v465, v478); int16x8_t v480 = vqrdmulhq_n_s16(v479, 16705); int16x8_t v481 = vaddq_s16(v441, v480); int16x8_t v482 = vaddq_s16(v447, v413); int16x8_t v483_tmp = vqrdmulhq_n_s16(v482, 13573); int16x8_t v483 = vaddq_s16(v483_tmp, v482); int16x8_t v484 = vaddq_s16(v455, v417); int16x8_t v485 = vaddq_s16(v420, v459); int16x8_t v486 = vaddq_s16(v484, v485); int16x8_t v487 = vaddq_s16(v483, v486); int16x8_t v488 = vaddq_s16(v462, v425); int16x8_t v489 = vaddq_s16(v428, v444); int16x8_t v490 = vaddq_s16(v488, v489); int16x8_t v491_tmp = vqrdmulhq_n_s16(v490, 13573); int16x8_t v491 = vaddq_s16(v491_tmp, v490); int16x8_t v492 = vaddq_s16(v473, v433); int16x8_t v493 = vaddq_s16(v436, v452); int16x8_t v494 = vaddq_s16(v492, v493); int16x8_t v495 = vaddq_s16(v494, v490); int16x8_t v496 = vaddq_s16(v491, v495); int16x8_t v497 = vqrdmulhq_n_s16(v496, 17734); int16x8_t v498 = vaddq_s16(v487, v497); int16x8_t v499 = vaddq_s16(v489, v482); int16x8_t v500_tmp = vqrdmulhq_n_s16(v499, 13573); int16x8_t v500 = vaddq_s16(v500_tmp, v499); int16x8_t v501 = vaddq_s16(v493, v484); int16x8_t v502 = vaddq_s16(v485, v488); int16x8_t v503 = vaddq_s16(v501, v502); int16x8_t v504 = vaddq_s16(v500, v503); int16x8_t v505 = vaddq_s16(v502, v499); int16x8_t v506_tmp = vqrdmulhq_n_s16(v505, 13573); int16x8_t v506 = vaddq_s16(v506_tmp, v505); int16x8_t v507 = vld1q_s16(in + in_stride * 125 + i); int16x8_t v508 = vld1q_s16(in + in_stride * 123 + i); int16x8_t v509 = vaddq_s16(v507, v508); int16x8_t v510 = vaddq_s16(v509, v470); int16x8_t v511 = vaddq_s16(v510, v492); int16x8_t v512 = vaddq_s16(v511, v501); int16x8_t v513 = vaddq_s16(v512, v505); int16x8_t v514 = vaddq_s16(v506, v513); int16x8_t v515 = vqrdmulhq_n_s16(v514, 17734); int16x8_t v516 = vaddq_s16(v504, v515); int16x8_t v517 = vqrdmulhq_n_s16(v516, 16705); int16x8_t v518 = vaddq_s16(v498, v517); int16x8_t v519 = vqrdmulhq_n_s16(v518, 16463); int16x8_t v520 = vaddq_s16(v481, v519); int16x8_t v521 = vqrdmulhq_n_s16(v520, 16404); int16x8_t v522 = vaddq_s16(v410, v521); int16x8_t v523 = vaddq_s16(v412, v318); int16x8_t v524_tmp = vqrdmulhq_n_s16(v523, 13573); int16x8_t v524 = vaddq_s16(v524_tmp, v523); int16x8_t v525 = vaddq_s16(v416, v320); int16x8_t v526 = vaddq_s16(v321, v418); int16x8_t v527 = vaddq_s16(v525, v526); int16x8_t v528 = vaddq_s16(v524, v527); int16x8_t v529 = vaddq_s16(v424, v324); int16x8_t v530 = vaddq_s16(v325, v426); int16x8_t v531 = vaddq_s16(v529, v530); int16x8_t v532_tmp = vqrdmulhq_n_s16(v531, 13573); int16x8_t v532 = vaddq_s16(v532_tmp, v531); int16x8_t v533 = vaddq_s16(v432, v328); int16x8_t v534 = vaddq_s16(v329, v434); int16x8_t v535 = vaddq_s16(v533, v534); int16x8_t v536 = vaddq_s16(v535, v531); int16x8_t v537 = vaddq_s16(v532, v536); int16x8_t v538 = vqrdmulhq_n_s16(v537, 17734); int16x8_t v539 = vaddq_s16(v528, v538); int16x8_t v540 = vaddq_s16(v443, v335); int16x8_t v541 = vaddq_s16(v336, v445); int16x8_t v542 = vaddq_s16(v540, v541); int16x8_t v543_tmp = vqrdmulhq_n_s16(v542, 13573); int16x8_t v543 = vaddq_s16(v543_tmp, v542); int16x8_t v544 = vaddq_s16(v451, v339); int16x8_t v545 = vaddq_s16(v340, v453); int16x8_t v546 = vaddq_s16(v544, v545); int16x8_t v547 = vaddq_s16(v458, v342); int16x8_t v548 = vaddq_s16(v343, v460); int16x8_t v549 = vaddq_s16(v547, v548); int16x8_t v550 = vaddq_s16(v546, v549); int16x8_t v551 = vaddq_s16(v543, v550); int16x8_t v552 = vaddq_s16(v549, v542); int16x8_t v553_tmp = vqrdmulhq_n_s16(v552, 13573); int16x8_t v553 = vaddq_s16(v553_tmp, v552); int16x8_t v554 = vaddq_s16(v469, v349); int16x8_t v555 = vaddq_s16(v350, v471); int16x8_t v556 = vaddq_s16(v554, v555); int16x8_t v557 = vaddq_s16(v556, v546); int16x8_t v558 = vaddq_s16(v557, v552); int16x8_t v559 = vaddq_s16(v553, v558); int16x8_t v560 = vqrdmulhq_n_s16(v559, 17734); int16x8_t v561 = vaddq_s16(v551, v560); int16x8_t v562 = vqrdmulhq_n_s16(v561, 16705); int16x8_t v563 = vaddq_s16(v539, v562); int16x8_t v564 = vaddq_s16(v446, v359); int16x8_t v565 = vaddq_s16(v360, v411); int16x8_t v566 = vaddq_s16(v564, v565); int16x8_t v567_tmp = vqrdmulhq_n_s16(v566, 13573); int16x8_t v567 = vaddq_s16(v567_tmp, v566); int16x8_t v568 = vaddq_s16(v454, v363); int16x8_t v569 = vaddq_s16(v364, v415); int16x8_t v570 = vaddq_s16(v568, v569); int16x8_t v571 = vaddq_s16(v419, v366); int16x8_t v572 = vaddq_s16(v367, v457); int16x8_t v573 = vaddq_s16(v571, v572); int16x8_t v574 = vaddq_s16(v570, v573); int16x8_t v575 = vaddq_s16(v567, v574); int16x8_t v576 = vaddq_s16(v461, v371); int16x8_t v577 = vaddq_s16(v372, v423); int16x8_t v578 = vaddq_s16(v576, v577); int16x8_t v579 = vaddq_s16(v427, v374); int16x8_t v580 = vaddq_s16(v375, v442); int16x8_t v581 = vaddq_s16(v579, v580); int16x8_t v582 = vaddq_s16(v578, v581); int16x8_t v583_tmp = vqrdmulhq_n_s16(v582, 13573); int16x8_t v583 = vaddq_s16(v583_tmp, v582); int16x8_t v584 = vaddq_s16(v472, v379); int16x8_t v585 = vaddq_s16(v380, v431); int16x8_t v586 = vaddq_s16(v584, v585); int16x8_t v587 = vaddq_s16(v435, v382); int16x8_t v588 = vaddq_s16(v383, v450); int16x8_t v589 = vaddq_s16(v587, v588); int16x8_t v590 = vaddq_s16(v586, v589); int16x8_t v591 = vaddq_s16(v590, v582); int16x8_t v592 = vaddq_s16(v583, v591); int16x8_t v593 = vqrdmulhq_n_s16(v592, 17734); int16x8_t v594 = vaddq_s16(v575, v593); int16x8_t v595 = vaddq_s16(v581, v566); int16x8_t v596_tmp = vqrdmulhq_n_s16(v595, 13573); int16x8_t v596 = vaddq_s16(v596_tmp, v595); int16x8_t v597 = vaddq_s16(v589, v570); int16x8_t v598 = vaddq_s16(v573, v578); int16x8_t v599 = vaddq_s16(v597, v598); int16x8_t v600 = vaddq_s16(v596, v599); int16x8_t v601 = vaddq_s16(v598, v595); int16x8_t v602_tmp = vqrdmulhq_n_s16(v601, 13573); int16x8_t v602 = vaddq_s16(v602_tmp, v601); int16x8_t v603 = vaddq_s16(v508, v398); int16x8_t v604 = vaddq_s16(v399, v468); int16x8_t v605 = vaddq_s16(v603, v604); int16x8_t v606 = vaddq_s16(v605, v586); int16x8_t v607 = vaddq_s16(v606, v597); int16x8_t v608 = vaddq_s16(v607, v601); int16x8_t v609 = vaddq_s16(v602, v608); int16x8_t v610 = vqrdmulhq_n_s16(v609, 17734); int16x8_t v611 = vaddq_s16(v600, v610); int16x8_t v612 = vqrdmulhq_n_s16(v611, 16705); int16x8_t v613 = vaddq_s16(v594, v612); int16x8_t v614 = vqrdmulhq_n_s16(v613, 16463); int16x8_t v615 = vaddq_s16(v563, v614); int16x8_t v616 = vaddq_s16(v565, v523); int16x8_t v617_tmp = vqrdmulhq_n_s16(v616, 13573); int16x8_t v617 = vaddq_s16(v617_tmp, v616); int16x8_t v618 = vaddq_s16(v569, v525); int16x8_t v619 = vaddq_s16(v526, v571); int16x8_t v620 = vaddq_s16(v618, v619); int16x8_t v621 = vaddq_s16(v617, v620); int16x8_t v622 = vaddq_s16(v577, v529); int16x8_t v623 = vaddq_s16(v530, v579); int16x8_t v624 = vaddq_s16(v622, v623); int16x8_t v625_tmp = vqrdmulhq_n_s16(v624, 13573); int16x8_t v625 = vaddq_s16(v625_tmp, v624); int16x8_t v626 = vaddq_s16(v585, v533); int16x8_t v627 = vaddq_s16(v534, v587); int16x8_t v628 = vaddq_s16(v626, v627); int16x8_t v629 = vaddq_s16(v628, v624); int16x8_t v630 = vaddq_s16(v625, v629); int16x8_t v631 = vqrdmulhq_n_s16(v630, 17734); int16x8_t v632 = vaddq_s16(v621, v631); int16x8_t v633 = vaddq_s16(v580, v540); int16x8_t v634 = vaddq_s16(v541, v564); int16x8_t v635 = vaddq_s16(v633, v634); int16x8_t v636_tmp = vqrdmulhq_n_s16(v635, 13573); int16x8_t v636 = vaddq_s16(v636_tmp, v635); int16x8_t v637 = vaddq_s16(v588, v544); int16x8_t v638 = vaddq_s16(v545, v568); int16x8_t v639 = vaddq_s16(v637, v638); int16x8_t v640 = vaddq_s16(v572, v547); int16x8_t v641 = vaddq_s16(v548, v576); int16x8_t v642 = vaddq_s16(v640, v641); int16x8_t v643 = vaddq_s16(v639, v642); int16x8_t v644 = vaddq_s16(v636, v643); int16x8_t v645 = vaddq_s16(v642, v635); int16x8_t v646_tmp = vqrdmulhq_n_s16(v645, 13573); int16x8_t v646 = vaddq_s16(v646_tmp, v645); int16x8_t v647 = vaddq_s16(v604, v554); int16x8_t v648 = vaddq_s16(v555, v584); int16x8_t v649 = vaddq_s16(v647, v648); int16x8_t v650 = vaddq_s16(v649, v639); int16x8_t v651 = vaddq_s16(v650, v645); int16x8_t v652 = vaddq_s16(v646, v651); int16x8_t v653 = vqrdmulhq_n_s16(v652, 17734); int16x8_t v654 = vaddq_s16(v644, v653); int16x8_t v655 = vqrdmulhq_n_s16(v654, 16705); int16x8_t v656 = vaddq_s16(v632, v655); int16x8_t v657 = vaddq_s16(v634, v616); int16x8_t v658_tmp = vqrdmulhq_n_s16(v657, 13573); int16x8_t v658 = vaddq_s16(v658_tmp, v657); int16x8_t v659 = vaddq_s16(v638, v618); int16x8_t v660 = vaddq_s16(v619, v640); int16x8_t v661 = vaddq_s16(v659, v660); int16x8_t v662 = vaddq_s16(v658, v661); int16x8_t v663 = vaddq_s16(v641, v622); int16x8_t v664 = vaddq_s16(v623, v633); int16x8_t v665 = vaddq_s16(v663, v664); int16x8_t v666_tmp = vqrdmulhq_n_s16(v665, 13573); int16x8_t v666 = vaddq_s16(v666_tmp, v665); int16x8_t v667 = vaddq_s16(v648, v626); int16x8_t v668 = vaddq_s16(v627, v637); int16x8_t v669 = vaddq_s16(v667, v668); int16x8_t v670 = vaddq_s16(v669, v665); int16x8_t v671 = vaddq_s16(v666, v670); int16x8_t v672 = vqrdmulhq_n_s16(v671, 17734); int16x8_t v673 = vaddq_s16(v662, v672); int16x8_t v674 = vaddq_s16(v664, v657); int16x8_t v675_tmp = vqrdmulhq_n_s16(v674, 13573); int16x8_t v675 = vaddq_s16(v675_tmp, v674); int16x8_t v676 = vaddq_s16(v668, v659); int16x8_t v677 = vaddq_s16(v660, v663); int16x8_t v678 = vaddq_s16(v676, v677); int16x8_t v679 = vaddq_s16(v675, v678); int16x8_t v680 = vaddq_s16(v677, v674); int16x8_t v681_tmp = vqrdmulhq_n_s16(v680, 13573); int16x8_t v681 = vaddq_s16(v681_tmp, v680); int16x8_t v682 = vld1q_s16(in + in_stride * 127 + i); int16x8_t v683 = vaddq_s16(v682, v507); int16x8_t v684 = vaddq_s16(v683, v603); int16x8_t v685 = vaddq_s16(v684, v647); int16x8_t v686 = vaddq_s16(v685, v667); int16x8_t v687 = vaddq_s16(v686, v676); int16x8_t v688 = vaddq_s16(v687, v680); int16x8_t v689 = vaddq_s16(v681, v688); int16x8_t v690 = vqrdmulhq_n_s16(v689, 17734); int16x8_t v691 = vaddq_s16(v679, v690); int16x8_t v692 = vqrdmulhq_n_s16(v691, 16705); int16x8_t v693 = vaddq_s16(v673, v692); int16x8_t v694 = vqrdmulhq_n_s16(v693, 16463); int16x8_t v695 = vaddq_s16(v656, v694); int16x8_t v696 = vqrdmulhq_n_s16(v695, 16404); int16x8_t v697 = vaddq_s16(v615, v696); int16x8_t v698 = vqrdmulhq_n_s16(v697, 16389); int16x8_t v699 = vaddq_s16(v522, v698); int16x8_t v700 = vqrdmulhq_n_s16(v699, 16385); int16x8_t v701 = vaddq_s16(v317, v700); int16x8_t v702 = vsubq_s16(v0, v1); int16x8_t v703 = vsubq_s16(v4, v6); int16x8_t v704_tmp = vqrdmulhq_n_s16(v703, 10045); int16x8_t v704 = vaddq_s16(v704_tmp, v703); int16x8_t v705 = vaddq_s16(v702, v704); int16x8_t v706 = vsubq_s16(v11, v14); int16x8_t v707 = vsubq_s16(v17, v20); int16x8_t v708_tmp = vqrdmulhq_n_s16(v707, 10045); int16x8_t v708 = vaddq_s16(v708_tmp, v707); int16x8_t v709 = vaddq_s16(v706, v708); int16x8_t v710 = vqrdmulhq_n_s16(v709, 19705); int16x8_t v711 = vaddq_s16(v705, v710); int16x8_t v712 = vsubq_s16(v27, v30); int16x8_t v713 = vsubq_s16(v35, v39); int16x8_t v714_tmp = vqrdmulhq_n_s16(v713, 10045); int16x8_t v714 = vaddq_s16(v714_tmp, v713); int16x8_t v715 = vaddq_s16(v712, v714); int16x8_t v716 = vsubq_s16(v44, v47); int16x8_t v717 = vsubq_s16(v50, v54); int16x8_t v718_tmp = vqrdmulhq_n_s16(v717, 10045); int16x8_t v718 = vaddq_s16(v718_tmp, v717); int16x8_t v719 = vaddq_s16(v716, v718); int16x8_t v720 = vqrdmulhq_n_s16(v719, 19705); int16x8_t v721 = vaddq_s16(v715, v720); int16x8_t v722 = vqrdmulhq_n_s16(v721, 17121); int16x8_t v723 = vaddq_s16(v711, v722); int16x8_t v724 = vsubq_s16(v63, v66); int16x8_t v725 = vsubq_s16(v71, v75); int16x8_t v726_tmp = vqrdmulhq_n_s16(v725, 10045); int16x8_t v726 = vaddq_s16(v726_tmp, v725); int16x8_t v727 = vaddq_s16(v724, v726); int16x8_t v728 = vsubq_s16(v82, v89); int16x8_t v729 = vsubq_s16(v92, v97); int16x8_t v730_tmp = vqrdmulhq_n_s16(v729, 10045); int16x8_t v730 = vaddq_s16(v730_tmp, v729); int16x8_t v731 = vaddq_s16(v728, v730); int16x8_t v732 = vqrdmulhq_n_s16(v731, 19705); int16x8_t v733 = vaddq_s16(v727, v732); int16x8_t v734 = vsubq_s16(v104, v107); int16x8_t v735 = vsubq_s16(v112, v116); int16x8_t v736_tmp = vqrdmulhq_n_s16(v735, 10045); int16x8_t v736 = vaddq_s16(v736_tmp, v735); int16x8_t v737 = vaddq_s16(v734, v736); int16x8_t v738 = vsubq_s16(v121, v124); int16x8_t v739 = vsubq_s16(v127, v132); int16x8_t v740_tmp = vqrdmulhq_n_s16(v739, 10045); int16x8_t v740 = vaddq_s16(v740_tmp, v739); int16x8_t v741 = vaddq_s16(v738, v740); int16x8_t v742 = vqrdmulhq_n_s16(v741, 19705); int16x8_t v743 = vaddq_s16(v737, v742); int16x8_t v744 = vqrdmulhq_n_s16(v743, 17121); int16x8_t v745 = vaddq_s16(v733, v744); int16x8_t v746 = vqrdmulhq_n_s16(v745, 16563); int16x8_t v747 = vaddq_s16(v723, v746); int16x8_t v748 = vsubq_s16(v143, v146); int16x8_t v749 = vsubq_s16(v151, v155); int16x8_t v750_tmp = vqrdmulhq_n_s16(v749, 10045); int16x8_t v750 = vaddq_s16(v750_tmp, v749); int16x8_t v751 = vaddq_s16(v748, v750); int16x8_t v752 = vsubq_s16(v162, v169); int16x8_t v753 = vqrdmulhq_n_s16(v752, 19705); int16x8_t v754 = vsubq_s16(v172, v177); int16x8_t v755 = vqrdmulhq_n_s16(v754, 25746); int16x8_t v756 = vaddq_s16(v753, v755); int16x8_t v757 = vaddq_s16(v751, v756); int16x8_t v758 = vsubq_s16(v186, v193); int16x8_t v759 = vsubq_s16(v202, v210); int16x8_t v760_tmp = vqrdmulhq_n_s16(v759, 10045); int16x8_t v760 = vaddq_s16(v760_tmp, v759); int16x8_t v761 = vaddq_s16(v758, v760); int16x8_t v762 = vsubq_s16(v215, v218); int16x8_t v763 = vsubq_s16(v221, v227); int16x8_t v764_tmp = vqrdmulhq_n_s16(v763, 10045); int16x8_t v764 = vaddq_s16(v764_tmp, v763); int16x8_t v765 = vaddq_s16(v762, v764); int16x8_t v766 = vqrdmulhq_n_s16(v765, 19705); int16x8_t v767 = vaddq_s16(v761, v766); int16x8_t v768 = vqrdmulhq_n_s16(v767, 17121); int16x8_t v769 = vaddq_s16(v757, v768); int16x8_t v770 = vsubq_s16(v236, v239); int16x8_t v771 = vsubq_s16(v244, v248); int16x8_t v772_tmp = vqrdmulhq_n_s16(v771, 10045); int16x8_t v772 = vaddq_s16(v772_tmp, v771); int16x8_t v773 = vaddq_s16(v770, v772); int16x8_t v774 = vsubq_s16(v255, v262); int16x8_t v775 = vsubq_s16(v265, v270); int16x8_t v776_tmp = vqrdmulhq_n_s16(v775, 10045); int16x8_t v776 = vaddq_s16(v776_tmp, v775); int16x8_t v777 = vaddq_s16(v774, v776); int16x8_t v778 = vqrdmulhq_n_s16(v777, 19705); int16x8_t v779 = vaddq_s16(v773, v778); int16x8_t v780 = vsubq_s16(v277, v280); int16x8_t v781 = vsubq_s16(v285, v289); int16x8_t v782_tmp = vqrdmulhq_n_s16(v781, 10045); int16x8_t v782 = vaddq_s16(v782_tmp, v781); int16x8_t v783 = vaddq_s16(v780, v782); int16x8_t v784 = vsubq_s16(v294, v297); int16x8_t v785 = vsubq_s16(v300, v306); int16x8_t v786_tmp = vqrdmulhq_n_s16(v785, 10045); int16x8_t v786 = vaddq_s16(v786_tmp, v785); int16x8_t v787 = vaddq_s16(v784, v786); int16x8_t v788 = vqrdmulhq_n_s16(v787, 19705); int16x8_t v789 = vaddq_s16(v783, v788); int16x8_t v790 = vqrdmulhq_n_s16(v789, 17121); int16x8_t v791 = vaddq_s16(v779, v790); int16x8_t v792 = vqrdmulhq_n_s16(v791, 16563); int16x8_t v793 = vaddq_s16(v769, v792); int16x8_t v794 = vqrdmulhq_n_s16(v793, 16429); int16x8_t v795 = vaddq_s16(v747, v794); int16x8_t v796 = vsubq_s16(v319, v322); int16x8_t v797 = vsubq_s16(v327, v331); int16x8_t v798_tmp = vqrdmulhq_n_s16(v797, 10045); int16x8_t v798 = vaddq_s16(v798_tmp, v797); int16x8_t v799 = vaddq_s16(v796, v798); int16x8_t v800 = vsubq_s16(v338, v345); int16x8_t v801 = vsubq_s16(v348, v353); int16x8_t v802_tmp = vqrdmulhq_n_s16(v801, 10045); int16x8_t v802 = vaddq_s16(v802_tmp, v801); int16x8_t v803 = vaddq_s16(v800, v802); int16x8_t v804 = vqrdmulhq_n_s16(v803, 19705); int16x8_t v805 = vaddq_s16(v799, v804); int16x8_t v806 = vsubq_s16(v362, v369); int16x8_t v807 = vsubq_s16(v378, v386); int16x8_t v808_tmp = vqrdmulhq_n_s16(v807, 10045); int16x8_t v808 = vaddq_s16(v808_tmp, v807); int16x8_t v809 = vaddq_s16(v806, v808); int16x8_t v810 = vsubq_s16(v391, v394); int16x8_t v811 = vsubq_s16(v397, v403); int16x8_t v812_tmp = vqrdmulhq_n_s16(v811, 10045); int16x8_t v812 = vaddq_s16(v812_tmp, v811); int16x8_t v813 = vaddq_s16(v810, v812); int16x8_t v814 = vqrdmulhq_n_s16(v813, 19705); int16x8_t v815 = vaddq_s16(v809, v814); int16x8_t v816 = vqrdmulhq_n_s16(v815, 17121); int16x8_t v817 = vaddq_s16(v805, v816); int16x8_t v818 = vsubq_s16(v414, v421); int16x8_t v819 = vsubq_s16(v430, v438); int16x8_t v820_tmp = vqrdmulhq_n_s16(v819, 10045); int16x8_t v820 = vaddq_s16(v820_tmp, v819); int16x8_t v821 = vaddq_s16(v818, v820); int16x8_t v822 = vsubq_s16(v449, v464); int16x8_t v823 = vsubq_s16(v467, v476); int16x8_t v824_tmp = vqrdmulhq_n_s16(v823, 10045); int16x8_t v824 = vaddq_s16(v824_tmp, v823); int16x8_t v825 = vaddq_s16(v822, v824); int16x8_t v826 = vqrdmulhq_n_s16(v825, 19705); int16x8_t v827 = vaddq_s16(v821, v826); int16x8_t v828 = vsubq_s16(v483, v486); int16x8_t v829 = vsubq_s16(v491, v495); int16x8_t v830_tmp = vqrdmulhq_n_s16(v829, 10045); int16x8_t v830 = vaddq_s16(v830_tmp, v829); int16x8_t v831 = vaddq_s16(v828, v830); int16x8_t v832 = vsubq_s16(v500, v503); int16x8_t v833 = vsubq_s16(v506, v513); int16x8_t v834_tmp = vqrdmulhq_n_s16(v833, 10045); int16x8_t v834 = vaddq_s16(v834_tmp, v833); int16x8_t v835 = vaddq_s16(v832, v834); int16x8_t v836 = vqrdmulhq_n_s16(v835, 19705); int16x8_t v837 = vaddq_s16(v831, v836); int16x8_t v838 = vqrdmulhq_n_s16(v837, 17121); int16x8_t v839 = vaddq_s16(v827, v838); int16x8_t v840 = vqrdmulhq_n_s16(v839, 16563); int16x8_t v841 = vaddq_s16(v817, v840); int16x8_t v842 = vsubq_s16(v524, v527); int16x8_t v843 = vsubq_s16(v532, v536); int16x8_t v844_tmp = vqrdmulhq_n_s16(v843, 10045); int16x8_t v844 = vaddq_s16(v844_tmp, v843); int16x8_t v845 = vaddq_s16(v842, v844); int16x8_t v846 = vsubq_s16(v543, v550); int16x8_t v847 = vsubq_s16(v553, v558); int16x8_t v848_tmp = vqrdmulhq_n_s16(v847, 10045); int16x8_t v848 = vaddq_s16(v848_tmp, v847); int16x8_t v849 = vaddq_s16(v846, v848); int16x8_t v850 = vqrdmulhq_n_s16(v849, 19705); int16x8_t v851 = vaddq_s16(v845, v850); int16x8_t v852 = vsubq_s16(v567, v574); int16x8_t v853 = vsubq_s16(v583, v591); int16x8_t v854_tmp = vqrdmulhq_n_s16(v853, 10045); int16x8_t v854 = vaddq_s16(v854_tmp, v853); int16x8_t v855 = vaddq_s16(v852, v854); int16x8_t v856 = vsubq_s16(v596, v599); int16x8_t v857 = vsubq_s16(v602, v608); int16x8_t v858_tmp = vqrdmulhq_n_s16(v857, 10045); int16x8_t v858 = vaddq_s16(v858_tmp, v857); int16x8_t v859 = vaddq_s16(v856, v858); int16x8_t v860 = vqrdmulhq_n_s16(v859, 19705); int16x8_t v861 = vaddq_s16(v855, v860); int16x8_t v862 = vqrdmulhq_n_s16(v861, 17121); int16x8_t v863 = vaddq_s16(v851, v862); int16x8_t v864 = vsubq_s16(v617, v620); int16x8_t v865 = vsubq_s16(v625, v629); int16x8_t v866_tmp = vqrdmulhq_n_s16(v865, 10045); int16x8_t v866 = vaddq_s16(v866_tmp, v865); int16x8_t v867 = vaddq_s16(v864, v866); int16x8_t v868 = vsubq_s16(v636, v643); int16x8_t v869 = vsubq_s16(v646, v651); int16x8_t v870_tmp = vqrdmulhq_n_s16(v869, 10045); int16x8_t v870 = vaddq_s16(v870_tmp, v869); int16x8_t v871 = vaddq_s16(v868, v870); int16x8_t v872 = vqrdmulhq_n_s16(v871, 19705); int16x8_t v873 = vaddq_s16(v867, v872); int16x8_t v874 = vsubq_s16(v658, v661); int16x8_t v875 = vsubq_s16(v666, v670); int16x8_t v876_tmp = vqrdmulhq_n_s16(v875, 10045); int16x8_t v876 = vaddq_s16(v876_tmp, v875); int16x8_t v877 = vaddq_s16(v874, v876); int16x8_t v878 = vsubq_s16(v675, v678); int16x8_t v879 = vsubq_s16(v681, v688); int16x8_t v880_tmp = vqrdmulhq_n_s16(v879, 10045); int16x8_t v880 = vaddq_s16(v880_tmp, v879); int16x8_t v881 = vaddq_s16(v878, v880); int16x8_t v882 = vqrdmulhq_n_s16(v881, 19705); int16x8_t v883 = vaddq_s16(v877, v882); int16x8_t v884 = vqrdmulhq_n_s16(v883, 17121); int16x8_t v885 = vaddq_s16(v873, v884); int16x8_t v886 = vqrdmulhq_n_s16(v885, 16563); int16x8_t v887 = vaddq_s16(v863, v886); int16x8_t v888 = vqrdmulhq_n_s16(v887, 16429); int16x8_t v889 = vaddq_s16(v841, v888); int16x8_t v890 = vqrdmulhq_n_s16(v889, 16395); int16x8_t v891 = vaddq_s16(v795, v890); int16x8_t v892 = vsubq_s16(v702, v704); int16x8_t v893 = vsubq_s16(v706, v708); int16x8_t v894 = vqrdmulhq_n_s16(v893, 29490); int16x8_t v895 = vaddq_s16(v892, v894); int16x8_t v896 = vsubq_s16(v712, v714); int16x8_t v897 = vsubq_s16(v716, v718); int16x8_t v898 = vqrdmulhq_n_s16(v897, 29490); int16x8_t v899 = vaddq_s16(v896, v898); int16x8_t v900 = vqrdmulhq_n_s16(v899, 18578); int16x8_t v901 = vaddq_s16(v895, v900); int16x8_t v902 = vsubq_s16(v724, v726); int16x8_t v903 = vsubq_s16(v728, v730); int16x8_t v904 = vqrdmulhq_n_s16(v903, 29490); int16x8_t v905 = vaddq_s16(v902, v904); int16x8_t v906 = vsubq_s16(v734, v736); int16x8_t v907 = vsubq_s16(v738, v740); int16x8_t v908 = vqrdmulhq_n_s16(v907, 29490); int16x8_t v909 = vaddq_s16(v906, v908); int16x8_t v910 = vqrdmulhq_n_s16(v909, 18578); int16x8_t v911 = vaddq_s16(v905, v910); int16x8_t v912 = vqrdmulhq_n_s16(v911, 16890); int16x8_t v913 = vaddq_s16(v901, v912); int16x8_t v914 = vsubq_s16(v748, v750); int16x8_t v915_tmp = vqrdmulhq_n_s16(v754, 10045); int16x8_t v915 = vaddq_s16(v915_tmp, v754); int16x8_t v916 = vsubq_s16(v752, v915); int16x8_t v917 = vqrdmulhq_n_s16(v916, 29490); int16x8_t v918 = vaddq_s16(v914, v917); int16x8_t v919 = vsubq_s16(v758, v760); int16x8_t v920 = vsubq_s16(v762, v764); int16x8_t v921 = vqrdmulhq_n_s16(v920, 29490); int16x8_t v922 = vaddq_s16(v919, v921); int16x8_t v923 = vqrdmulhq_n_s16(v922, 18578); int16x8_t v924 = vaddq_s16(v918, v923); int16x8_t v925 = vsubq_s16(v770, v772); int16x8_t v926 = vsubq_s16(v774, v776); int16x8_t v927 = vqrdmulhq_n_s16(v926, 29490); int16x8_t v928 = vaddq_s16(v925, v927); int16x8_t v929 = vsubq_s16(v780, v782); int16x8_t v930 = vsubq_s16(v784, v786); int16x8_t v931 = vqrdmulhq_n_s16(v930, 29490); int16x8_t v932 = vaddq_s16(v929, v931); int16x8_t v933 = vqrdmulhq_n_s16(v932, 18578); int16x8_t v934 = vaddq_s16(v928, v933); int16x8_t v935 = vqrdmulhq_n_s16(v934, 16890); int16x8_t v936 = vaddq_s16(v924, v935); int16x8_t v937 = vqrdmulhq_n_s16(v936, 16508); int16x8_t v938 = vaddq_s16(v913, v937); int16x8_t v939 = vsubq_s16(v796, v798); int16x8_t v940 = vsubq_s16(v800, v802); int16x8_t v941 = vqrdmulhq_n_s16(v940, 29490); int16x8_t v942 = vaddq_s16(v939, v941); int16x8_t v943 = vsubq_s16(v806, v808); int16x8_t v944 = vsubq_s16(v810, v812); int16x8_t v945 = vqrdmulhq_n_s16(v944, 29490); int16x8_t v946 = vaddq_s16(v943, v945); int16x8_t v947 = vqrdmulhq_n_s16(v946, 18578); int16x8_t v948 = vaddq_s16(v942, v947); int16x8_t v949 = vsubq_s16(v818, v820); int16x8_t v950 = vsubq_s16(v822, v824); int16x8_t v951 = vqrdmulhq_n_s16(v950, 29490); int16x8_t v952 = vaddq_s16(v949, v951); int16x8_t v953 = vsubq_s16(v828, v830); int16x8_t v954 = vsubq_s16(v832, v834); int16x8_t v955 = vqrdmulhq_n_s16(v954, 29490); int16x8_t v956 = vaddq_s16(v953, v955); int16x8_t v957 = vqrdmulhq_n_s16(v956, 18578); int16x8_t v958 = vaddq_s16(v952, v957); int16x8_t v959 = vqrdmulhq_n_s16(v958, 16890); int16x8_t v960 = vaddq_s16(v948, v959); int16x8_t v961 = vsubq_s16(v842, v844); int16x8_t v962 = vsubq_s16(v846, v848); int16x8_t v963 = vqrdmulhq_n_s16(v962, 29490); int16x8_t v964 = vaddq_s16(v961, v963); int16x8_t v965 = vsubq_s16(v852, v854); int16x8_t v966 = vsubq_s16(v856, v858); int16x8_t v967 = vqrdmulhq_n_s16(v966, 29490); int16x8_t v968 = vaddq_s16(v965, v967); int16x8_t v969 = vqrdmulhq_n_s16(v968, 18578); int16x8_t v970 = vaddq_s16(v964, v969); int16x8_t v971 = vsubq_s16(v864, v866); int16x8_t v972 = vsubq_s16(v868, v870); int16x8_t v973 = vqrdmulhq_n_s16(v972, 29490); int16x8_t v974 = vaddq_s16(v971, v973); int16x8_t v975 = vsubq_s16(v874, v876); int16x8_t v976 = vsubq_s16(v878, v880); int16x8_t v977 = vqrdmulhq_n_s16(v976, 29490); int16x8_t v978 = vaddq_s16(v975, v977); int16x8_t v979 = vqrdmulhq_n_s16(v978, 18578); int16x8_t v980 = vaddq_s16(v974, v979); int16x8_t v981 = vqrdmulhq_n_s16(v980, 16890); int16x8_t v982 = vaddq_s16(v970, v981); int16x8_t v983 = vqrdmulhq_n_s16(v982, 16508); int16x8_t v984 = vaddq_s16(v960, v983); int16x8_t v985 = vqrdmulhq_n_s16(v984, 16415); int16x8_t v986 = vaddq_s16(v938, v985); int16x8_t v987 = vsubq_s16(v2, v8); int16x8_t v988 = vsubq_s16(v15, v22); int16x8_t v989_tmp = vqrdmulhq_n_s16(v988, 18446); int16x8_t v989 = vmlaq_n_s16(v989_tmp, v988, 2); int16x8_t v990 = vaddq_s16(v987, v989); int16x8_t v991 = vsubq_s16(v31, v41); int16x8_t v992 = vsubq_s16(v48, v56); int16x8_t v993_tmp = vqrdmulhq_n_s16(v992, 18446); int16x8_t v993 = vmlaq_n_s16(v993_tmp, v992, 2); int16x8_t v994 = vaddq_s16(v991, v993); int16x8_t v995 = vqrdmulhq_n_s16(v994, 21195); int16x8_t v996 = vaddq_s16(v990, v995); int16x8_t v997 = vsubq_s16(v67, v77); int16x8_t v998 = vsubq_s16(v90, v99); int16x8_t v999_tmp = vqrdmulhq_n_s16(v998, 18446); int16x8_t v999 = vmlaq_n_s16(v999_tmp, v998, 2); int16x8_t v1000 = vaddq_s16(v997, v999); int16x8_t v1001 = vsubq_s16(v108, v118); int16x8_t v1002 = vsubq_s16(v125, v134); int16x8_t v1003_tmp = vqrdmulhq_n_s16(v1002, 18446); int16x8_t v1003 = vmlaq_n_s16(v1003_tmp, v1002, 2); int16x8_t v1004 = vaddq_s16(v1001, v1003); int16x8_t v1005 = vqrdmulhq_n_s16(v1004, 21195); int16x8_t v1006 = vaddq_s16(v1000, v1005); int16x8_t v1007 = vqrdmulhq_n_s16(v1006, 17401); int16x8_t v1008 = vaddq_s16(v996, v1007); int16x8_t v1009 = vsubq_s16(v147, v157); int16x8_t v1010 = vsubq_s16(v170, v179); int16x8_t v1011_tmp = vqrdmulhq_n_s16(v1010, 18446); int16x8_t v1011 = vmlaq_n_s16(v1011_tmp, v1010, 2); int16x8_t v1012 = vaddq_s16(v1009, v1011); int16x8_t v1013 = vsubq_s16(v194, v212); int16x8_t v1014 = vsubq_s16(v219, v229); int16x8_t v1015_tmp = vqrdmulhq_n_s16(v1014, 18446); int16x8_t v1015 = vmlaq_n_s16(v1015_tmp, v1014, 2); int16x8_t v1016 = vaddq_s16(v1013, v1015); int16x8_t v1017 = vqrdmulhq_n_s16(v1016, 21195); int16x8_t v1018 = vaddq_s16(v1012, v1017); int16x8_t v1019 = vsubq_s16(v240, v250); int16x8_t v1020 = vsubq_s16(v263, v272); int16x8_t v1021_tmp = vqrdmulhq_n_s16(v1020, 18446); int16x8_t v1021 = vmlaq_n_s16(v1021_tmp, v1020, 2); int16x8_t v1022 = vaddq_s16(v1019, v1021); int16x8_t v1023 = vsubq_s16(v281, v291); int16x8_t v1024 = vsubq_s16(v298, v308); int16x8_t v1025_tmp = vqrdmulhq_n_s16(v1024, 18446); int16x8_t v1025 = vmlaq_n_s16(v1025_tmp, v1024, 2); int16x8_t v1026 = vaddq_s16(v1023, v1025); int16x8_t v1027 = vqrdmulhq_n_s16(v1026, 21195); int16x8_t v1028 = vaddq_s16(v1022, v1027); int16x8_t v1029 = vqrdmulhq_n_s16(v1028, 17401); int16x8_t v1030 = vaddq_s16(v1018, v1029); int16x8_t v1031 = vqrdmulhq_n_s16(v1030, 16629); int16x8_t v1032 = vaddq_s16(v1008, v1031); int16x8_t v1033 = vsubq_s16(v323, v333); int16x8_t v1034 = vsubq_s16(v346, v355); int16x8_t v1035_tmp = vqrdmulhq_n_s16(v1034, 18446); int16x8_t v1035 = vmlaq_n_s16(v1035_tmp, v1034, 2); int16x8_t v1036 = vaddq_s16(v1033, v1035); int16x8_t v1037 = vsubq_s16(v370, v388); int16x8_t v1038 = vsubq_s16(v395, v405); int16x8_t v1039_tmp = vqrdmulhq_n_s16(v1038, 18446); int16x8_t v1039 = vmlaq_n_s16(v1039_tmp, v1038, 2); int16x8_t v1040 = vaddq_s16(v1037, v1039); int16x8_t v1041 = vqrdmulhq_n_s16(v1040, 21195); int16x8_t v1042 = vaddq_s16(v1036, v1041); int16x8_t v1043 = vsubq_s16(v422, v440); int16x8_t v1044 = vsubq_s16(v465, v478); int16x8_t v1045_tmp = vqrdmulhq_n_s16(v1044, 18446); int16x8_t v1045 = vmlaq_n_s16(v1045_tmp, v1044, 2); int16x8_t v1046 = vaddq_s16(v1043, v1045); int16x8_t v1047 = vsubq_s16(v487, v497); int16x8_t v1048 = vsubq_s16(v504, v515); int16x8_t v1049_tmp = vqrdmulhq_n_s16(v1048, 18446); int16x8_t v1049 = vmlaq_n_s16(v1049_tmp, v1048, 2); int16x8_t v1050 = vaddq_s16(v1047, v1049); int16x8_t v1051 = vqrdmulhq_n_s16(v1050, 21195); int16x8_t v1052 = vaddq_s16(v1046, v1051); int16x8_t v1053 = vqrdmulhq_n_s16(v1052, 17401); int16x8_t v1054 = vaddq_s16(v1042, v1053); int16x8_t v1055 = vsubq_s16(v528, v538); int16x8_t v1056 = vsubq_s16(v551, v560); int16x8_t v1057_tmp = vqrdmulhq_n_s16(v1056, 18446); int16x8_t v1057 = vmlaq_n_s16(v1057_tmp, v1056, 2); int16x8_t v1058 = vaddq_s16(v1055, v1057); int16x8_t v1059 = vsubq_s16(v575, v593); int16x8_t v1060 = vsubq_s16(v600, v610); int16x8_t v1061_tmp = vqrdmulhq_n_s16(v1060, 18446); int16x8_t v1061 = vmlaq_n_s16(v1061_tmp, v1060, 2); int16x8_t v1062 = vaddq_s16(v1059, v1061); int16x8_t v1063 = vqrdmulhq_n_s16(v1062, 21195); int16x8_t v1064 = vaddq_s16(v1058, v1063); int16x8_t v1065 = vsubq_s16(v621, v631); int16x8_t v1066 = vsubq_s16(v644, v653); int16x8_t v1067_tmp = vqrdmulhq_n_s16(v1066, 18446); int16x8_t v1067 = vmlaq_n_s16(v1067_tmp, v1066, 2); int16x8_t v1068 = vaddq_s16(v1065, v1067); int16x8_t v1069 = vsubq_s16(v662, v672); int16x8_t v1070 = vsubq_s16(v679, v690); int16x8_t v1071_tmp = vqrdmulhq_n_s16(v1070, 18446); int16x8_t v1071 = vmlaq_n_s16(v1071_tmp, v1070, 2); int16x8_t v1072 = vaddq_s16(v1069, v1071); int16x8_t v1073 = vqrdmulhq_n_s16(v1072, 21195); int16x8_t v1074 = vaddq_s16(v1068, v1073); int16x8_t v1075 = vqrdmulhq_n_s16(v1074, 17401); int16x8_t v1076 = vaddq_s16(v1064, v1075); int16x8_t v1077 = vqrdmulhq_n_s16(v1076, 16629); int16x8_t v1078 = vaddq_s16(v1054, v1077); int16x8_t v1079 = vqrdmulhq_n_s16(v1078, 16445); int16x8_t v1080 = vaddq_s16(v1032, v1079); int16x8_t v1081 = vsubq_s16(v987, v989); int16x8_t v1082 = vsubq_s16(v991, v993); int16x8_t v1083 = vqrdmulhq_n_s16(v1082, 25826); int16x8_t v1084 = vaddq_s16(v1081, v1083); int16x8_t v1085 = vsubq_s16(v997, v999); int16x8_t v1086 = vsubq_s16(v1001, v1003); int16x8_t v1087 = vqrdmulhq_n_s16(v1086, 25826); int16x8_t v1088 = vaddq_s16(v1085, v1087); int16x8_t v1089 = vqrdmulhq_n_s16(v1088, 18124); int16x8_t v1090 = vaddq_s16(v1084, v1089); int16x8_t v1091 = vsubq_s16(v1009, v1011); int16x8_t v1092 = vsubq_s16(v1013, v1015); int16x8_t v1093 = vqrdmulhq_n_s16(v1092, 25826); int16x8_t v1094 = vaddq_s16(v1091, v1093); int16x8_t v1095 = vsubq_s16(v1019, v1021); int16x8_t v1096 = vsubq_s16(v1023, v1025); int16x8_t v1097 = vqrdmulhq_n_s16(v1096, 25826); int16x8_t v1098 = vaddq_s16(v1095, v1097); int16x8_t v1099 = vqrdmulhq_n_s16(v1098, 18124); int16x8_t v1100 = vaddq_s16(v1094, v1099); int16x8_t v1101 = vqrdmulhq_n_s16(v1100, 16792); int16x8_t v1102 = vaddq_s16(v1090, v1101); int16x8_t v1103 = vsubq_s16(v1033, v1035); int16x8_t v1104 = vsubq_s16(v1037, v1039); int16x8_t v1105 = vqrdmulhq_n_s16(v1104, 25826); int16x8_t v1106 = vaddq_s16(v1103, v1105); int16x8_t v1107 = vsubq_s16(v1043, v1045); int16x8_t v1108 = vsubq_s16(v1047, v1049); int16x8_t v1109 = vqrdmulhq_n_s16(v1108, 25826); int16x8_t v1110 = vaddq_s16(v1107, v1109); int16x8_t v1111 = vqrdmulhq_n_s16(v1110, 18124); int16x8_t v1112 = vaddq_s16(v1106, v1111); int16x8_t v1113 = vsubq_s16(v1055, v1057); int16x8_t v1114 = vsubq_s16(v1059, v1061); int16x8_t v1115 = vqrdmulhq_n_s16(v1114, 25826); int16x8_t v1116 = vaddq_s16(v1113, v1115); int16x8_t v1117 = vsubq_s16(v1065, v1067); int16x8_t v1118 = vsubq_s16(v1069, v1071); int16x8_t v1119 = vqrdmulhq_n_s16(v1118, 25826); int16x8_t v1120 = vaddq_s16(v1117, v1119); int16x8_t v1121 = vqrdmulhq_n_s16(v1120, 18124); int16x8_t v1122 = vaddq_s16(v1116, v1121); int16x8_t v1123 = vqrdmulhq_n_s16(v1122, 16792); int16x8_t v1124 = vaddq_s16(v1112, v1123); int16x8_t v1125 = vqrdmulhq_n_s16(v1124, 16484); int16x8_t v1126 = vaddq_s16(v1102, v1125); int16x8_t v1127 = vsubq_s16(v892, v894); int16x8_t v1128 = vsubq_s16(v896, v898); int16x8_t v1129_tmp = vqrdmulhq_n_s16(v1128, 1988); int16x8_t v1129 = vaddq_s16(v1129_tmp, v1128); int16x8_t v1130 = vaddq_s16(v1127, v1129); int16x8_t v1131 = vsubq_s16(v902, v904); int16x8_t v1132 = vsubq_s16(v906, v908); int16x8_t v1133_tmp = vqrdmulhq_n_s16(v1132, 1988); int16x8_t v1133 = vaddq_s16(v1133_tmp, v1132); int16x8_t v1134 = vaddq_s16(v1131, v1133); int16x8_t v1135 = vqrdmulhq_n_s16(v1134, 19102); int16x8_t v1136 = vaddq_s16(v1130, v1135); int16x8_t v1137 = vsubq_s16(v914, v917); int16x8_t v1138 = vsubq_s16(v919, v921); int16x8_t v1139_tmp = vqrdmulhq_n_s16(v1138, 1988); int16x8_t v1139 = vaddq_s16(v1139_tmp, v1138); int16x8_t v1140 = vaddq_s16(v1137, v1139); int16x8_t v1141 = vsubq_s16(v925, v927); int16x8_t v1142 = vsubq_s16(v929, v931); int16x8_t v1143_tmp = vqrdmulhq_n_s16(v1142, 1988); int16x8_t v1143 = vaddq_s16(v1143_tmp, v1142); int16x8_t v1144 = vaddq_s16(v1141, v1143); int16x8_t v1145 = vqrdmulhq_n_s16(v1144, 19102); int16x8_t v1146 = vaddq_s16(v1140, v1145); int16x8_t v1147 = vqrdmulhq_n_s16(v1146, 17000); int16x8_t v1148 = vaddq_s16(v1136, v1147); int16x8_t v1149 = vsubq_s16(v939, v941); int16x8_t v1150 = vsubq_s16(v943, v945); int16x8_t v1151_tmp = vqrdmulhq_n_s16(v1150, 1988); int16x8_t v1151 = vaddq_s16(v1151_tmp, v1150); int16x8_t v1152 = vaddq_s16(v1149, v1151); int16x8_t v1153 = vsubq_s16(v949, v951); int16x8_t v1154 = vsubq_s16(v953, v955); int16x8_t v1155_tmp = vqrdmulhq_n_s16(v1154, 1988); int16x8_t v1155 = vaddq_s16(v1155_tmp, v1154); int16x8_t v1156 = vaddq_s16(v1153, v1155); int16x8_t v1157 = vqrdmulhq_n_s16(v1156, 19102); int16x8_t v1158 = vaddq_s16(v1152, v1157); int16x8_t v1159 = vsubq_s16(v961, v963); int16x8_t v1160 = vsubq_s16(v965, v967); int16x8_t v1161_tmp = vqrdmulhq_n_s16(v1160, 1988); int16x8_t v1161 = vaddq_s16(v1161_tmp, v1160); int16x8_t v1162 = vaddq_s16(v1159, v1161); int16x8_t v1163 = vsubq_s16(v971, v973); int16x8_t v1164 = vsubq_s16(v975, v977); int16x8_t v1165_tmp = vqrdmulhq_n_s16(v1164, 1988); int16x8_t v1165 = vaddq_s16(v1165_tmp, v1164); int16x8_t v1166 = vaddq_s16(v1163, v1165); int16x8_t v1167 = vqrdmulhq_n_s16(v1166, 19102); int16x8_t v1168 = vaddq_s16(v1162, v1167); int16x8_t v1169 = vqrdmulhq_n_s16(v1168, 17000); int16x8_t v1170 = vaddq_s16(v1158, v1169); int16x8_t v1171 = vqrdmulhq_n_s16(v1170, 16534); int16x8_t v1172 = vaddq_s16(v1148, v1171); int16x8_t v1173 = vsubq_s16(v705, v710); int16x8_t v1174 = vsubq_s16(v715, v720); int16x8_t v1175_tmp = vqrdmulhq_n_s16(v1174, 23673); int16x8_t v1175 = vaddq_s16(v1175_tmp, v1174); int16x8_t v1176 = vaddq_s16(v1173, v1175); int16x8_t v1177 = vsubq_s16(v727, v732); int16x8_t v1178 = vsubq_s16(v737, v742); int16x8_t v1179_tmp = vqrdmulhq_n_s16(v1178, 23673); int16x8_t v1179 = vaddq_s16(v1179_tmp, v1178); int16x8_t v1180 = vaddq_s16(v1177, v1179); int16x8_t v1181 = vqrdmulhq_n_s16(v1180, 20398); int16x8_t v1182 = vaddq_s16(v1176, v1181); int16x8_t v1183 = vsubq_s16(v751, v756); int16x8_t v1184 = vsubq_s16(v761, v766); int16x8_t v1185_tmp = vqrdmulhq_n_s16(v1184, 23673); int16x8_t v1185 = vaddq_s16(v1185_tmp, v1184); int16x8_t v1186 = vaddq_s16(v1183, v1185); int16x8_t v1187 = vsubq_s16(v773, v778); int16x8_t v1188 = vsubq_s16(v783, v788); int16x8_t v1189_tmp = vqrdmulhq_n_s16(v1188, 23673); int16x8_t v1189 = vaddq_s16(v1189_tmp, v1188); int16x8_t v1190 = vaddq_s16(v1187, v1189); int16x8_t v1191 = vqrdmulhq_n_s16(v1190, 20398); int16x8_t v1192 = vaddq_s16(v1186, v1191); int16x8_t v1193 = vqrdmulhq_n_s16(v1192, 17255); int16x8_t v1194 = vaddq_s16(v1182, v1193); int16x8_t v1195 = vsubq_s16(v799, v804); int16x8_t v1196 = vsubq_s16(v809, v814); int16x8_t v1197_tmp = vqrdmulhq_n_s16(v1196, 23673); int16x8_t v1197 = vaddq_s16(v1197_tmp, v1196); int16x8_t v1198 = vaddq_s16(v1195, v1197); int16x8_t v1199 = vsubq_s16(v821, v826); int16x8_t v1200 = vsubq_s16(v831, v836); int16x8_t v1201_tmp = vqrdmulhq_n_s16(v1200, 23673); int16x8_t v1201 = vaddq_s16(v1201_tmp, v1200); int16x8_t v1202 = vaddq_s16(v1199, v1201); int16x8_t v1203 = vqrdmulhq_n_s16(v1202, 20398); int16x8_t v1204 = vaddq_s16(v1198, v1203); int16x8_t v1205 = vsubq_s16(v845, v850); int16x8_t v1206 = vsubq_s16(v855, v860); int16x8_t v1207_tmp = vqrdmulhq_n_s16(v1206, 23673); int16x8_t v1207 = vaddq_s16(v1207_tmp, v1206); int16x8_t v1208 = vaddq_s16(v1205, v1207); int16x8_t v1209 = vsubq_s16(v867, v872); int16x8_t v1210 = vsubq_s16(v877, v882); int16x8_t v1211_tmp = vqrdmulhq_n_s16(v1210, 23673); int16x8_t v1211 = vaddq_s16(v1211_tmp, v1210); int16x8_t v1212 = vaddq_s16(v1209, v1211); int16x8_t v1213 = vqrdmulhq_n_s16(v1212, 20398); int16x8_t v1214 = vaddq_s16(v1208, v1213); int16x8_t v1215 = vqrdmulhq_n_s16(v1214, 17255); int16x8_t v1216 = vaddq_s16(v1204, v1215); int16x8_t v1217 = vqrdmulhq_n_s16(v1216, 16595); int16x8_t v1218 = vaddq_s16(v1194, v1217); int16x8_t v1219 = vsubq_s16(v9, v24); int16x8_t v1220 = vsubq_s16(v42, v58); int16x8_t v1221_tmp = vqrdmulhq_n_s16(v1220, 3314); int16x8_t v1221 = vmlaq_n_s16(v1221_tmp, v1220, 5); int16x8_t v1222 = vaddq_s16(v1219, v1221); int16x8_t v1223 = vsubq_s16(v78, v101); int16x8_t v1224 = vsubq_s16(v119, v136); int16x8_t v1225_tmp = vqrdmulhq_n_s16(v1224, 3314); int16x8_t v1225 = vmlaq_n_s16(v1225_tmp, v1224, 5); int16x8_t v1226 = vaddq_s16(v1223, v1225); int16x8_t v1227 = vqrdmulhq_n_s16(v1226, 22112); int16x8_t v1228 = vaddq_s16(v1222, v1227); int16x8_t v1229 = vsubq_s16(v158, v181); int16x8_t v1230 = vsubq_s16(v213, v231); int16x8_t v1231_tmp = vqrdmulhq_n_s16(v1230, 3314); int16x8_t v1231 = vmlaq_n_s16(v1231_tmp, v1230, 5); int16x8_t v1232 = vaddq_s16(v1229, v1231); int16x8_t v1233 = vsubq_s16(v251, v274); int16x8_t v1234 = vsubq_s16(v292, v310); int16x8_t v1235_tmp = vqrdmulhq_n_s16(v1234, 3314); int16x8_t v1235 = vmlaq_n_s16(v1235_tmp, v1234, 5); int16x8_t v1236 = vaddq_s16(v1233, v1235); int16x8_t v1237 = vqrdmulhq_n_s16(v1236, 22112); int16x8_t v1238 = vaddq_s16(v1232, v1237); int16x8_t v1239 = vqrdmulhq_n_s16(v1238, 17561); int16x8_t v1240 = vaddq_s16(v1228, v1239); int16x8_t v1241 = vsubq_s16(v334, v357); int16x8_t v1242 = vsubq_s16(v389, v407); int16x8_t v1243_tmp = vqrdmulhq_n_s16(v1242, 3314); int16x8_t v1243 = vmlaq_n_s16(v1243_tmp, v1242, 5); int16x8_t v1244 = vaddq_s16(v1241, v1243); int16x8_t v1245 = vsubq_s16(v441, v480); int16x8_t v1246 = vsubq_s16(v498, v517); int16x8_t v1247_tmp = vqrdmulhq_n_s16(v1246, 3314); int16x8_t v1247 = vmlaq_n_s16(v1247_tmp, v1246, 5); int16x8_t v1248 = vaddq_s16(v1245, v1247); int16x8_t v1249 = vqrdmulhq_n_s16(v1248, 22112); int16x8_t v1250 = vaddq_s16(v1244, v1249); int16x8_t v1251 = vsubq_s16(v539, v562); int16x8_t v1252 = vsubq_s16(v594, v612); int16x8_t v1253_tmp = vqrdmulhq_n_s16(v1252, 3314); int16x8_t v1253 = vmlaq_n_s16(v1253_tmp, v1252, 5); int16x8_t v1254 = vaddq_s16(v1251, v1253); int16x8_t v1255 = vsubq_s16(v632, v655); int16x8_t v1256 = vsubq_s16(v673, v692); int16x8_t v1257_tmp = vqrdmulhq_n_s16(v1256, 3314); int16x8_t v1257 = vmlaq_n_s16(v1257_tmp, v1256, 5); int16x8_t v1258 = vaddq_s16(v1255, v1257); int16x8_t v1259 = vqrdmulhq_n_s16(v1258, 22112); int16x8_t v1260 = vaddq_s16(v1254, v1259); int16x8_t v1261 = vqrdmulhq_n_s16(v1260, 17561); int16x8_t v1262 = vaddq_s16(v1250, v1261); int16x8_t v1263 = vqrdmulhq_n_s16(v1262, 16666); int16x8_t v1264 = vaddq_s16(v1240, v1263); int16x8_t v1265 = vsubq_s16(v1219, v1221); int16x8_t v1266 = vsubq_s16(v1223, v1225); int16x8_t v1267 = vqrdmulhq_n_s16(v1266, 24397); int16x8_t v1268 = vaddq_s16(v1265, v1267); int16x8_t v1269 = vsubq_s16(v1229, v1231); int16x8_t v1270 = vsubq_s16(v1233, v1235); int16x8_t v1271 = vqrdmulhq_n_s16(v1270, 24397); int16x8_t v1272 = vaddq_s16(v1269, v1271); int16x8_t v1273 = vqrdmulhq_n_s16(v1272, 17921); int16x8_t v1274 = vaddq_s16(v1268, v1273); int16x8_t v1275 = vsubq_s16(v1241, v1243); int16x8_t v1276 = vsubq_s16(v1245, v1247); int16x8_t v1277 = vqrdmulhq_n_s16(v1276, 24397); int16x8_t v1278 = vaddq_s16(v1275, v1277); int16x8_t v1279 = vsubq_s16(v1251, v1253); int16x8_t v1280 = vsubq_s16(v1255, v1257); int16x8_t v1281 = vqrdmulhq_n_s16(v1280, 24397); int16x8_t v1282 = vaddq_s16(v1279, v1281); int16x8_t v1283 = vqrdmulhq_n_s16(v1282, 17921); int16x8_t v1284 = vaddq_s16(v1278, v1283); int16x8_t v1285 = vqrdmulhq_n_s16(v1284, 16747); int16x8_t v1286 = vaddq_s16(v1274, v1285); int16x8_t v1287 = vsubq_s16(v1173, v1175); int16x8_t v1288 = vsubq_s16(v1177, v1179); int16x8_t v1289 = vqrdmulhq_n_s16(v1288, 27504); int16x8_t v1290 = vaddq_s16(v1287, v1289); int16x8_t v1291 = vsubq_s16(v1183, v1185); int16x8_t v1292 = vsubq_s16(v1187, v1189); int16x8_t v1293 = vqrdmulhq_n_s16(v1292, 27504); int16x8_t v1294 = vaddq_s16(v1291, v1293); int16x8_t v1295 = vqrdmulhq_n_s16(v1294, 18343); int16x8_t v1296 = vaddq_s16(v1290, v1295); int16x8_t v1297 = vsubq_s16(v1195, v1197); int16x8_t v1298 = vsubq_s16(v1199, v1201); int16x8_t v1299 = vqrdmulhq_n_s16(v1298, 27504); int16x8_t v1300 = vaddq_s16(v1297, v1299); int16x8_t v1301 = vsubq_s16(v1205, v1207); int16x8_t v1302 = vsubq_s16(v1209, v1211); int16x8_t v1303 = vqrdmulhq_n_s16(v1302, 27504); int16x8_t v1304 = vaddq_s16(v1301, v1303); int16x8_t v1305 = vqrdmulhq_n_s16(v1304, 18343); int16x8_t v1306 = vaddq_s16(v1300, v1305); int16x8_t v1307 = vqrdmulhq_n_s16(v1306, 16840); int16x8_t v1308 = vaddq_s16(v1296, v1307); int16x8_t v1309 = vsubq_s16(v1127, v1129); int16x8_t v1310 = vsubq_s16(v1131, v1133); int16x8_t v1311 = vqrdmulhq_n_s16(v1310, 31869); int16x8_t v1312 = vaddq_s16(v1309, v1311); int16x8_t v1313 = vsubq_s16(v1137, v1139); int16x8_t v1314 = vsubq_s16(v1141, v1143); int16x8_t v1315 = vqrdmulhq_n_s16(v1314, 31869); int16x8_t v1316 = vaddq_s16(v1313, v1315); int16x8_t v1317 = vqrdmulhq_n_s16(v1316, 18830); int16x8_t v1318 = vaddq_s16(v1312, v1317); int16x8_t v1319 = vsubq_s16(v1149, v1151); int16x8_t v1320 = vsubq_s16(v1153, v1155); int16x8_t v1321 = vqrdmulhq_n_s16(v1320, 31869); int16x8_t v1322 = vaddq_s16(v1319, v1321); int16x8_t v1323 = vsubq_s16(v1159, v1161); int16x8_t v1324 = vsubq_s16(v1163, v1165); int16x8_t v1325 = vqrdmulhq_n_s16(v1324, 31869); int16x8_t v1326 = vaddq_s16(v1323, v1325); int16x8_t v1327 = vqrdmulhq_n_s16(v1326, 18830); int16x8_t v1328 = vaddq_s16(v1322, v1327); int16x8_t v1329 = vqrdmulhq_n_s16(v1328, 16944); int16x8_t v1330 = vaddq_s16(v1318, v1329); int16x8_t v1331 = vsubq_s16(v1081, v1083); int16x8_t v1332 = vsubq_s16(v1085, v1087); int16x8_t v1333_tmp = vqrdmulhq_n_s16(v1332, 5552); int16x8_t v1333 = vaddq_s16(v1333_tmp, v1332); int16x8_t v1334 = vaddq_s16(v1331, v1333); int16x8_t v1335 = vsubq_s16(v1091, v1093); int16x8_t v1336 = vsubq_s16(v1095, v1097); int16x8_t v1337_tmp = vqrdmulhq_n_s16(v1336, 5552); int16x8_t v1337 = vaddq_s16(v1337_tmp, v1336); int16x8_t v1338 = vaddq_s16(v1335, v1337); int16x8_t v1339 = vqrdmulhq_n_s16(v1338, 19393); int16x8_t v1340 = vaddq_s16(v1334, v1339); int16x8_t v1341 = vsubq_s16(v1103, v1105); int16x8_t v1342 = vsubq_s16(v1107, v1109); int16x8_t v1343_tmp = vqrdmulhq_n_s16(v1342, 5552); int16x8_t v1343 = vaddq_s16(v1343_tmp, v1342); int16x8_t v1344 = vaddq_s16(v1341, v1343); int16x8_t v1345 = vsubq_s16(v1113, v1115); int16x8_t v1346 = vsubq_s16(v1117, v1119); int16x8_t v1347_tmp = vqrdmulhq_n_s16(v1346, 5552); int16x8_t v1347 = vaddq_s16(v1347_tmp, v1346); int16x8_t v1348 = vaddq_s16(v1345, v1347); int16x8_t v1349 = vqrdmulhq_n_s16(v1348, 19393); int16x8_t v1350 = vaddq_s16(v1344, v1349); int16x8_t v1351 = vqrdmulhq_n_s16(v1350, 17059); int16x8_t v1352 = vaddq_s16(v1340, v1351); int16x8_t v1353 = vsubq_s16(v990, v995); int16x8_t v1354 = vsubq_s16(v1000, v1005); int16x8_t v1355_tmp = vqrdmulhq_n_s16(v1354, 15865); int16x8_t v1355 = vaddq_s16(v1355_tmp, v1354); int16x8_t v1356 = vaddq_s16(v1353, v1355); int16x8_t v1357 = vsubq_s16(v1012, v1017); int16x8_t v1358 = vsubq_s16(v1022, v1027); int16x8_t v1359_tmp = vqrdmulhq_n_s16(v1358, 15865); int16x8_t v1359 = vaddq_s16(v1359_tmp, v1358); int16x8_t v1360 = vaddq_s16(v1357, v1359); int16x8_t v1361 = vqrdmulhq_n_s16(v1360, 20040); int16x8_t v1362 = vaddq_s16(v1356, v1361); int16x8_t v1363 = vsubq_s16(v1036, v1041); int16x8_t v1364 = vsubq_s16(v1046, v1051); int16x8_t v1365_tmp = vqrdmulhq_n_s16(v1364, 15865); int16x8_t v1365 = vaddq_s16(v1365_tmp, v1364); int16x8_t v1366 = vaddq_s16(v1363, v1365); int16x8_t v1367 = vsubq_s16(v1058, v1063); int16x8_t v1368 = vsubq_s16(v1068, v1073); int16x8_t v1369_tmp = vqrdmulhq_n_s16(v1368, 15865); int16x8_t v1369 = vaddq_s16(v1369_tmp, v1368); int16x8_t v1370 = vaddq_s16(v1367, v1369); int16x8_t v1371 = vqrdmulhq_n_s16(v1370, 20040); int16x8_t v1372 = vaddq_s16(v1366, v1371); int16x8_t v1373 = vqrdmulhq_n_s16(v1372, 17187); int16x8_t v1374 = vaddq_s16(v1362, v1373); int16x8_t v1375 = vsubq_s16(v895, v900); int16x8_t v1376 = vsubq_s16(v905, v910); int16x8_t v1377_tmp = vqrdmulhq_n_s16(v1376, 1893); int16x8_t v1377 = vmlaq_n_s16(v1377_tmp, v1376, 2); int16x8_t v1378 = vaddq_s16(v1375, v1377); int16x8_t v1379 = vsubq_s16(v918, v923); int16x8_t v1380 = vsubq_s16(v928, v933); int16x8_t v1381_tmp = vqrdmulhq_n_s16(v1380, 1893); int16x8_t v1381 = vmlaq_n_s16(v1381_tmp, v1380, 2); int16x8_t v1382 = vaddq_s16(v1379, v1381); int16x8_t v1383 = vqrdmulhq_n_s16(v1382, 20783); int16x8_t v1384 = vaddq_s16(v1378, v1383); int16x8_t v1385 = vsubq_s16(v942, v947); int16x8_t v1386 = vsubq_s16(v952, v957); int16x8_t v1387_tmp = vqrdmulhq_n_s16(v1386, 1893); int16x8_t v1387 = vmlaq_n_s16(v1387_tmp, v1386, 2); int16x8_t v1388 = vaddq_s16(v1385, v1387); int16x8_t v1389 = vsubq_s16(v964, v969); int16x8_t v1390 = vsubq_s16(v974, v979); int16x8_t v1391_tmp = vqrdmulhq_n_s16(v1390, 1893); int16x8_t v1391 = vmlaq_n_s16(v1391_tmp, v1390, 2); int16x8_t v1392 = vaddq_s16(v1389, v1391); int16x8_t v1393 = vqrdmulhq_n_s16(v1392, 20783); int16x8_t v1394 = vaddq_s16(v1388, v1393); int16x8_t v1395 = vqrdmulhq_n_s16(v1394, 17326); int16x8_t v1396 = vaddq_s16(v1384, v1395); int16x8_t v1397 = vsubq_s16(v711, v722); int16x8_t v1398 = vsubq_s16(v733, v744); int16x8_t v1399_tmp = vqrdmulhq_n_s16(v1398, 13357); int16x8_t v1399 = vmlaq_n_s16(v1399_tmp, v1398, 3); int16x8_t v1400 = vaddq_s16(v1397, v1399); int16x8_t v1401 = vsubq_s16(v757, v768); int16x8_t v1402 = vsubq_s16(v779, v790); int16x8_t v1403_tmp = vqrdmulhq_n_s16(v1402, 13357); int16x8_t v1403 = vmlaq_n_s16(v1403_tmp, v1402, 3); int16x8_t v1404 = vaddq_s16(v1401, v1403); int16x8_t v1405 = vqrdmulhq_n_s16(v1404, 21637); int16x8_t v1406 = vaddq_s16(v1400, v1405); int16x8_t v1407 = vsubq_s16(v805, v816); int16x8_t v1408 = vsubq_s16(v827, v838); int16x8_t v1409_tmp = vqrdmulhq_n_s16(v1408, 13357); int16x8_t v1409 = vmlaq_n_s16(v1409_tmp, v1408, 3); int16x8_t v1410 = vaddq_s16(v1407, v1409); int16x8_t v1411 = vsubq_s16(v851, v862); int16x8_t v1412 = vsubq_s16(v873, v884); int16x8_t v1413_tmp = vqrdmulhq_n_s16(v1412, 13357); int16x8_t v1413 = vmlaq_n_s16(v1413_tmp, v1412, 3); int16x8_t v1414 = vaddq_s16(v1411, v1413); int16x8_t v1415 = vqrdmulhq_n_s16(v1414, 21637); int16x8_t v1416 = vaddq_s16(v1410, v1415); int16x8_t v1417 = vqrdmulhq_n_s16(v1416, 17479); int16x8_t v1418 = vaddq_s16(v1406, v1417); int16x8_t v1419 = vsubq_s16(v25, v60); int16x8_t v1420 = vsubq_s16(v102, v138); int16x8_t v1421_tmp = vqrdmulhq_n_s16(v1420, 6226); int16x8_t v1421 = vmlaq_n_s16(v1421_tmp, v1420, 10); int16x8_t v1422 = vaddq_s16(v1419, v1421); int16x8_t v1423 = vsubq_s16(v182, v233); int16x8_t v1424 = vsubq_s16(v275, v312); int16x8_t v1425_tmp = vqrdmulhq_n_s16(v1424, 6226); int16x8_t v1425 = vmlaq_n_s16(v1425_tmp, v1424, 10); int16x8_t v1426 = vaddq_s16(v1423, v1425); int16x8_t v1427 = vqrdmulhq_n_s16(v1426, 22622); int16x8_t v1428 = vaddq_s16(v1422, v1427); int16x8_t v1429 = vsubq_s16(v358, v409); int16x8_t v1430 = vsubq_s16(v481, v519); int16x8_t v1431_tmp = vqrdmulhq_n_s16(v1430, 6226); int16x8_t v1431 = vmlaq_n_s16(v1431_tmp, v1430, 10); int16x8_t v1432 = vaddq_s16(v1429, v1431); int16x8_t v1433 = vsubq_s16(v563, v614); int16x8_t v1434 = vsubq_s16(v656, v694); int16x8_t v1435_tmp = vqrdmulhq_n_s16(v1434, 6226); int16x8_t v1435 = vmlaq_n_s16(v1435_tmp, v1434, 10); int16x8_t v1436 = vaddq_s16(v1433, v1435); int16x8_t v1437 = vqrdmulhq_n_s16(v1436, 22622); int16x8_t v1438 = vaddq_s16(v1432, v1437); int16x8_t v1439 = vqrdmulhq_n_s16(v1438, 17646); int16x8_t v1440 = vaddq_s16(v1428, v1439); int16x8_t v1441 = vsubq_s16(v1419, v1421); int16x8_t v1442 = vsubq_s16(v1423, v1425); int16x8_t v1443 = vqrdmulhq_n_s16(v1442, 23761); int16x8_t v1444 = vaddq_s16(v1441, v1443); int16x8_t v1445 = vsubq_s16(v1429, v1431); int16x8_t v1446 = vsubq_s16(v1433, v1435); int16x8_t v1447 = vqrdmulhq_n_s16(v1446, 23761); int16x8_t v1448 = vaddq_s16(v1445, v1447); int16x8_t v1449 = vqrdmulhq_n_s16(v1448, 17826); int16x8_t v1450 = vaddq_s16(v1444, v1449); int16x8_t v1451 = vsubq_s16(v1397, v1399); int16x8_t v1452 = vsubq_s16(v1401, v1403); int16x8_t v1453 = vqrdmulhq_n_s16(v1452, 25084); int16x8_t v1454 = vaddq_s16(v1451, v1453); int16x8_t v1455 = vsubq_s16(v1407, v1409); int16x8_t v1456 = vsubq_s16(v1411, v1413); int16x8_t v1457 = vqrdmulhq_n_s16(v1456, 25084); int16x8_t v1458 = vaddq_s16(v1455, v1457); int16x8_t v1459 = vqrdmulhq_n_s16(v1458, 18021); int16x8_t v1460 = vaddq_s16(v1454, v1459); int16x8_t v1461 = vsubq_s16(v1375, v1377); int16x8_t v1462 = vsubq_s16(v1379, v1381); int16x8_t v1463 = vqrdmulhq_n_s16(v1462, 26631); int16x8_t v1464 = vaddq_s16(v1461, v1463); int16x8_t v1465 = vsubq_s16(v1385, v1387); int16x8_t v1466 = vsubq_s16(v1389, v1391); int16x8_t v1467 = vqrdmulhq_n_s16(v1466, 26631); int16x8_t v1468 = vaddq_s16(v1465, v1467); int16x8_t v1469 = vqrdmulhq_n_s16(v1468, 18231); int16x8_t v1470 = vaddq_s16(v1464, v1469); int16x8_t v1471 = vsubq_s16(v1353, v1355); int16x8_t v1472 = vsubq_s16(v1357, v1359); int16x8_t v1473 = vqrdmulhq_n_s16(v1472, 28454); int16x8_t v1474 = vaddq_s16(v1471, v1473); int16x8_t v1475 = vsubq_s16(v1363, v1365); int16x8_t v1476 = vsubq_s16(v1367, v1369); int16x8_t v1477 = vqrdmulhq_n_s16(v1476, 28454); int16x8_t v1478 = vaddq_s16(v1475, v1477); int16x8_t v1479 = vqrdmulhq_n_s16(v1478, 18458); int16x8_t v1480 = vaddq_s16(v1474, v1479); int16x8_t v1481 = vsubq_s16(v1331, v1333); int16x8_t v1482 = vsubq_s16(v1335, v1337); int16x8_t v1483 = vqrdmulhq_n_s16(v1482, 30624); int16x8_t v1484 = vaddq_s16(v1481, v1483); int16x8_t v1485 = vsubq_s16(v1341, v1343); int16x8_t v1486 = vsubq_s16(v1345, v1347); int16x8_t v1487 = vqrdmulhq_n_s16(v1486, 30624); int16x8_t v1488 = vaddq_s16(v1485, v1487); int16x8_t v1489 = vqrdmulhq_n_s16(v1488, 18702); int16x8_t v1490 = vaddq_s16(v1484, v1489); int16x8_t v1491 = vsubq_s16(v1309, v1311); int16x8_t v1492 = vsubq_s16(v1313, v1315); int16x8_t v1493_tmp = vqrdmulhq_n_s16(v1492, 472); int16x8_t v1493 = vaddq_s16(v1493_tmp, v1492); int16x8_t v1494 = vaddq_s16(v1491, v1493); int16x8_t v1495 = vsubq_s16(v1319, v1321); int16x8_t v1496 = vsubq_s16(v1323, v1325); int16x8_t v1497_tmp = vqrdmulhq_n_s16(v1496, 472); int16x8_t v1497 = vaddq_s16(v1497_tmp, v1496); int16x8_t v1498 = vaddq_s16(v1495, v1497); int16x8_t v1499 = vqrdmulhq_n_s16(v1498, 18964); int16x8_t v1500 = vaddq_s16(v1494, v1499); int16x8_t v1501 = vsubq_s16(v1287, v1289); int16x8_t v1502 = vsubq_s16(v1291, v1293); int16x8_t v1503_tmp = vqrdmulhq_n_s16(v1502, 3672); int16x8_t v1503 = vaddq_s16(v1503_tmp, v1502); int16x8_t v1504 = vaddq_s16(v1501, v1503); int16x8_t v1505 = vsubq_s16(v1297, v1299); int16x8_t v1506 = vsubq_s16(v1301, v1303); int16x8_t v1507_tmp = vqrdmulhq_n_s16(v1506, 3672); int16x8_t v1507 = vaddq_s16(v1507_tmp, v1506); int16x8_t v1508 = vaddq_s16(v1505, v1507); int16x8_t v1509 = vqrdmulhq_n_s16(v1508, 19245); int16x8_t v1510 = vaddq_s16(v1504, v1509); int16x8_t v1511 = vsubq_s16(v1265, v1267); int16x8_t v1512 = vsubq_s16(v1269, v1271); int16x8_t v1513_tmp = vqrdmulhq_n_s16(v1512, 7662); int16x8_t v1513 = vaddq_s16(v1513_tmp, v1512); int16x8_t v1514 = vaddq_s16(v1511, v1513); int16x8_t v1515 = vsubq_s16(v1275, v1277); int16x8_t v1516 = vsubq_s16(v1279, v1281); int16x8_t v1517_tmp = vqrdmulhq_n_s16(v1516, 7662); int16x8_t v1517 = vaddq_s16(v1517_tmp, v1516); int16x8_t v1518 = vaddq_s16(v1515, v1517); int16x8_t v1519 = vqrdmulhq_n_s16(v1518, 19546); int16x8_t v1520 = vaddq_s16(v1514, v1519); int16x8_t v1521 = vsubq_s16(v1222, v1227); int16x8_t v1522 = vsubq_s16(v1232, v1237); int16x8_t v1523_tmp = vqrdmulhq_n_s16(v1522, 12756); int16x8_t v1523 = vaddq_s16(v1523_tmp, v1522); int16x8_t v1524 = vaddq_s16(v1521, v1523); int16x8_t v1525 = vsubq_s16(v1244, v1249); int16x8_t v1526 = vsubq_s16(v1254, v1259); int16x8_t v1527_tmp = vqrdmulhq_n_s16(v1526, 12756); int16x8_t v1527 = vaddq_s16(v1527_tmp, v1526); int16x8_t v1528 = vaddq_s16(v1525, v1527); int16x8_t v1529 = vqrdmulhq_n_s16(v1528, 19869); int16x8_t v1530 = vaddq_s16(v1524, v1529); int16x8_t v1531 = vsubq_s16(v1176, v1181); int16x8_t v1532 = vsubq_s16(v1186, v1191); int16x8_t v1533_tmp = vqrdmulhq_n_s16(v1532, 19463); int16x8_t v1533 = vaddq_s16(v1533_tmp, v1532); int16x8_t v1534 = vaddq_s16(v1531, v1533); int16x8_t v1535 = vsubq_s16(v1198, v1203); int16x8_t v1536 = vsubq_s16(v1208, v1213); int16x8_t v1537_tmp = vqrdmulhq_n_s16(v1536, 19463); int16x8_t v1537 = vaddq_s16(v1537_tmp, v1536); int16x8_t v1538 = vaddq_s16(v1535, v1537); int16x8_t v1539 = vqrdmulhq_n_s16(v1538, 20216); int16x8_t v1540 = vaddq_s16(v1534, v1539); int16x8_t v1541 = vsubq_s16(v1130, v1135); int16x8_t v1542 = vsubq_s16(v1140, v1145); int16x8_t v1543_tmp = vqrdmulhq_n_s16(v1542, 28661); int16x8_t v1543 = vaddq_s16(v1543_tmp, v1542); int16x8_t v1544 = vaddq_s16(v1541, v1543); int16x8_t v1545 = vsubq_s16(v1152, v1157); int16x8_t v1546 = vsubq_s16(v1162, v1167); int16x8_t v1547_tmp = vqrdmulhq_n_s16(v1546, 28661); int16x8_t v1547 = vaddq_s16(v1547_tmp, v1546); int16x8_t v1548 = vaddq_s16(v1545, v1547); int16x8_t v1549 = vqrdmulhq_n_s16(v1548, 20587); int16x8_t v1550 = vaddq_s16(v1544, v1549); int16x8_t v1551 = vsubq_s16(v1084, v1089); int16x8_t v1552 = vsubq_s16(v1094, v1099); int16x8_t v1553_tmp = vqrdmulhq_n_s16(v1552, 9242); int16x8_t v1553 = vmlaq_n_s16(v1553_tmp, v1552, 2); int16x8_t v1554 = vaddq_s16(v1551, v1553); int16x8_t v1555 = vsubq_s16(v1106, v1111); int16x8_t v1556 = vsubq_s16(v1116, v1121); int16x8_t v1557_tmp = vqrdmulhq_n_s16(v1556, 9242); int16x8_t v1557 = vmlaq_n_s16(v1557_tmp, v1556, 2); int16x8_t v1558 = vaddq_s16(v1555, v1557); int16x8_t v1559 = vqrdmulhq_n_s16(v1558, 20985); int16x8_t v1560 = vaddq_s16(v1554, v1559); int16x8_t v1561 = vsubq_s16(v996, v1007); int16x8_t v1562 = vsubq_s16(v1018, v1029); int16x8_t v1563_tmp = vqrdmulhq_n_s16(v1562, 30298); int16x8_t v1563 = vmlaq_n_s16(v1563_tmp, v1562, 2); int16x8_t v1564 = vaddq_s16(v1561, v1563); int16x8_t v1565 = vsubq_s16(v1042, v1053); int16x8_t v1566 = vsubq_s16(v1064, v1075); int16x8_t v1567_tmp = vqrdmulhq_n_s16(v1566, 30298); int16x8_t v1567 = vmlaq_n_s16(v1567_tmp, v1566, 2); int16x8_t v1568 = vaddq_s16(v1565, v1567); int16x8_t v1569 = vqrdmulhq_n_s16(v1568, 21412); int16x8_t v1570 = vaddq_s16(v1564, v1569); int16x8_t v1571 = vsubq_s16(v901, v912); int16x8_t v1572 = vsubq_s16(v924, v935); int16x8_t v1573_tmp = vqrdmulhq_n_s16(v1572, 2773); int16x8_t v1573 = vmlaq_n_s16(v1573_tmp, v1572, 4); int16x8_t v1574 = vaddq_s16(v1571, v1573); int16x8_t v1575 = vsubq_s16(v948, v959); int16x8_t v1576 = vsubq_s16(v970, v981); int16x8_t v1577_tmp = vqrdmulhq_n_s16(v1576, 2773); int16x8_t v1577 = vmlaq_n_s16(v1577_tmp, v1576, 4); int16x8_t v1578 = vaddq_s16(v1575, v1577); int16x8_t v1579 = vqrdmulhq_n_s16(v1578, 21871); int16x8_t v1580 = vaddq_s16(v1574, v1579); int16x8_t v1581 = vsubq_s16(v723, v746); int16x8_t v1582 = vsubq_s16(v769, v792); int16x8_t v1583_tmp = vqrdmulhq_n_s16(v1582, 26108); int16x8_t v1583 = vmlaq_n_s16(v1583_tmp, v1582, 6); int16x8_t v1584 = vaddq_s16(v1581, v1583); int16x8_t v1585 = vsubq_s16(v817, v840); int16x8_t v1586 = vsubq_s16(v863, v886); int16x8_t v1587_tmp = vqrdmulhq_n_s16(v1586, 26108); int16x8_t v1587 = vmlaq_n_s16(v1587_tmp, v1586, 6); int16x8_t v1588 = vaddq_s16(v1585, v1587); int16x8_t v1589 = vqrdmulhq_n_s16(v1588, 22363); int16x8_t v1590 = vaddq_s16(v1584, v1589); int16x8_t v1591 = vsubq_s16(v61, v140); int16x8_t v1592 = vsubq_s16(v234, v314); int16x8_t v1593_tmp = vqrdmulhq_n_s16(v1592, 12251); int16x8_t v1593 = vmlaq_n_s16(v1593_tmp, v1592, 20); int16x8_t v1594 = vaddq_s16(v1591, v1593); int16x8_t v1595 = vsubq_s16(v410, v521); int16x8_t v1596 = vsubq_s16(v615, v696); int16x8_t v1597_tmp = vqrdmulhq_n_s16(v1596, 12251); int16x8_t v1597 = vmlaq_n_s16(v1597_tmp, v1596, 20); int16x8_t v1598 = vaddq_s16(v1595, v1597); int16x8_t v1599 = vqrdmulhq_n_s16(v1598, 22891); int16x8_t v1600 = vaddq_s16(v1594, v1599); int16x8_t v1601 = vsubq_s16(v1591, v1593); int16x8_t v1602 = vsubq_s16(v1595, v1597); int16x8_t v1603 = vqrdmulhq_n_s16(v1602, 23460); int16x8_t v1604 = vaddq_s16(v1601, v1603); int16x8_t v1605 = vsubq_s16(v1581, v1583); int16x8_t v1606 = vsubq_s16(v1585, v1587); int16x8_t v1607 = vqrdmulhq_n_s16(v1606, 24073); int16x8_t v1608 = vaddq_s16(v1605, v1607); int16x8_t v1609 = vsubq_s16(v1571, v1573); int16x8_t v1610 = vsubq_s16(v1575, v1577); int16x8_t v1611 = vqrdmulhq_n_s16(v1610, 24734); int16x8_t v1612 = vaddq_s16(v1609, v1611); int16x8_t v1613 = vsubq_s16(v1561, v1563); int16x8_t v1614 = vsubq_s16(v1565, v1567); int16x8_t v1615 = vqrdmulhq_n_s16(v1614, 25448); int16x8_t v1616 = vaddq_s16(v1613, v1615); int16x8_t v1617 = vsubq_s16(v1551, v1553); int16x8_t v1618 = vsubq_s16(v1555, v1557); int16x8_t v1619 = vqrdmulhq_n_s16(v1618, 26220); int16x8_t v1620 = vaddq_s16(v1617, v1619); int16x8_t v1621 = vsubq_s16(v1541, v1543); int16x8_t v1622 = vsubq_s16(v1545, v1547); int16x8_t v1623 = vqrdmulhq_n_s16(v1622, 27058); int16x8_t v1624 = vaddq_s16(v1621, v1623); int16x8_t v1625 = vsubq_s16(v1531, v1533); int16x8_t v1626 = vsubq_s16(v1535, v1537); int16x8_t v1627 = vqrdmulhq_n_s16(v1626, 27969); int16x8_t v1628 = vaddq_s16(v1625, v1627); int16x8_t v1629 = vsubq_s16(v1521, v1523); int16x8_t v1630 = vsubq_s16(v1525, v1527); int16x8_t v1631 = vqrdmulhq_n_s16(v1630, 28961); int16x8_t v1632 = vaddq_s16(v1629, v1631); int16x8_t v1633 = vsubq_s16(v1511, v1513); int16x8_t v1634 = vsubq_s16(v1515, v1517); int16x8_t v1635 = vqrdmulhq_n_s16(v1634, 30044); int16x8_t v1636 = vaddq_s16(v1633, v1635); int16x8_t v1637 = vsubq_s16(v1501, v1503); int16x8_t v1638 = vsubq_s16(v1505, v1507); int16x8_t v1639 = vqrdmulhq_n_s16(v1638, 31232); int16x8_t v1640 = vaddq_s16(v1637, v1639); int16x8_t v1641 = vsubq_s16(v1491, v1493); int16x8_t v1642 = vsubq_s16(v1495, v1497); int16x8_t v1643 = vqrdmulhq_n_s16(v1642, 32538); int16x8_t v1644 = vaddq_s16(v1641, v1643); int16x8_t v1645 = vsubq_s16(v1481, v1483); int16x8_t v1646 = vsubq_s16(v1485, v1487); int16x8_t v1647_tmp = vqrdmulhq_n_s16(v1646, 1211); int16x8_t v1647 = vaddq_s16(v1647_tmp, v1646); int16x8_t v1648 = vaddq_s16(v1645, v1647); int16x8_t v1649 = vsubq_s16(v1471, v1473); int16x8_t v1650 = vsubq_s16(v1475, v1477); int16x8_t v1651_tmp = vqrdmulhq_n_s16(v1650, 2808); int16x8_t v1651 = vaddq_s16(v1651_tmp, v1650); int16x8_t v1652 = vaddq_s16(v1649, v1651); int16x8_t v1653 = vsubq_s16(v1461, v1463); int16x8_t v1654 = vsubq_s16(v1465, v1467); int16x8_t v1655_tmp = vqrdmulhq_n_s16(v1654, 4586); int16x8_t v1655 = vaddq_s16(v1655_tmp, v1654); int16x8_t v1656 = vaddq_s16(v1653, v1655); int16x8_t v1657 = vsubq_s16(v1451, v1453); int16x8_t v1658 = vsubq_s16(v1455, v1457); int16x8_t v1659_tmp = vqrdmulhq_n_s16(v1658, 6576); int16x8_t v1659 = vaddq_s16(v1659_tmp, v1658); int16x8_t v1660 = vaddq_s16(v1657, v1659); int16x8_t v1661 = vsubq_s16(v1441, v1443); int16x8_t v1662 = vsubq_s16(v1445, v1447); int16x8_t v1663_tmp = vqrdmulhq_n_s16(v1662, 8817); int16x8_t v1663 = vaddq_s16(v1663_tmp, v1662); int16x8_t v1664 = vaddq_s16(v1661, v1663); int16x8_t v1665 = vsubq_s16(v1422, v1427); int16x8_t v1666 = vsubq_s16(v1432, v1437); int16x8_t v1667_tmp = vqrdmulhq_n_s16(v1666, 11356); int16x8_t v1667 = vaddq_s16(v1667_tmp, v1666); int16x8_t v1668 = vaddq_s16(v1665, v1667); int16x8_t v1669 = vsubq_s16(v1400, v1405); int16x8_t v1670 = vsubq_s16(v1410, v1415); int16x8_t v1671_tmp = vqrdmulhq_n_s16(v1670, 14256); int16x8_t v1671 = vaddq_s16(v1671_tmp, v1670); int16x8_t v1672 = vaddq_s16(v1669, v1671); int16x8_t v1673 = vsubq_s16(v1378, v1383); int16x8_t v1674 = vsubq_s16(v1388, v1393); int16x8_t v1675_tmp = vqrdmulhq_n_s16(v1674, 17596); int16x8_t v1675 = vaddq_s16(v1675_tmp, v1674); int16x8_t v1676 = vaddq_s16(v1673, v1675); int16x8_t v1677 = vsubq_s16(v1356, v1361); int16x8_t v1678 = vsubq_s16(v1366, v1371); int16x8_t v1679_tmp = vqrdmulhq_n_s16(v1678, 21483); int16x8_t v1679 = vaddq_s16(v1679_tmp, v1678); int16x8_t v1680 = vaddq_s16(v1677, v1679); int16x8_t v1681 = vsubq_s16(v1334, v1339); int16x8_t v1682 = vsubq_s16(v1344, v1349); int16x8_t v1683_tmp = vqrdmulhq_n_s16(v1682, 26057); int16x8_t v1683 = vaddq_s16(v1683_tmp, v1682); int16x8_t v1684 = vaddq_s16(v1681, v1683); int16x8_t v1685 = vsubq_s16(v1312, v1317); int16x8_t v1686 = vsubq_s16(v1322, v1327); int16x8_t v1687_tmp = vqrdmulhq_n_s16(v1686, 31517); int16x8_t v1687 = vaddq_s16(v1687_tmp, v1686); int16x8_t v1688 = vaddq_s16(v1685, v1687); int16x8_t v1689 = vsubq_s16(v1290, v1295); int16x8_t v1690 = vsubq_s16(v1300, v1305); int16x8_t v1691_tmp = vqrdmulhq_n_s16(v1690, 5373); int16x8_t v1691 = vmlaq_n_s16(v1691_tmp, v1690, 2); int16x8_t v1692 = vaddq_s16(v1689, v1691); int16x8_t v1693 = vsubq_s16(v1268, v1273); int16x8_t v1694 = vsubq_s16(v1278, v1283); int16x8_t v1695_tmp = vqrdmulhq_n_s16(v1694, 13571); int16x8_t v1695 = vmlaq_n_s16(v1695_tmp, v1694, 2); int16x8_t v1696 = vaddq_s16(v1693, v1695); int16x8_t v1697 = vsubq_s16(v1228, v1239); int16x8_t v1698 = vsubq_s16(v1250, v1261); int16x8_t v1699_tmp = vqrdmulhq_n_s16(v1698, 23975); int16x8_t v1699 = vmlaq_n_s16(v1699_tmp, v1698, 2); int16x8_t v1700 = vaddq_s16(v1697, v1699); int16x8_t v1701 = vsubq_s16(v1182, v1193); int16x8_t v1702 = vsubq_s16(v1204, v1215); int16x8_t v1703_tmp = vqrdmulhq_n_s16(v1702, 4832); int16x8_t v1703 = vmlaq_n_s16(v1703_tmp, v1702, 3); int16x8_t v1704 = vaddq_s16(v1701, v1703); int16x8_t v1705 = vsubq_s16(v1136, v1147); int16x8_t v1706 = vsubq_s16(v1158, v1169); int16x8_t v1707_tmp = vqrdmulhq_n_s16(v1706, 23437); int16x8_t v1707 = vmlaq_n_s16(v1707_tmp, v1706, 3); int16x8_t v1708 = vaddq_s16(v1705, v1707); int16x8_t v1709 = vsubq_s16(v1090, v1101); int16x8_t v1710 = vsubq_s16(v1112, v1123); int16x8_t v1711_tmp = vqrdmulhq_n_s16(v1710, 17573); int16x8_t v1711 = vmlaq_n_s16(v1711_tmp, v1710, 4); int16x8_t v1712 = vaddq_s16(v1709, v1711); int16x8_t v1713 = vsubq_s16(v1008, v1031); int16x8_t v1714 = vsubq_s16(v1054, v1077); int16x8_t v1715_tmp = vqrdmulhq_n_s16(v1714, 27122); int16x8_t v1715 = vmlaq_n_s16(v1715_tmp, v1714, 5); int16x8_t v1716 = vaddq_s16(v1713, v1715); int16x8_t v1717 = vsubq_s16(v913, v937); int16x8_t v1718 = vsubq_s16(v960, v983); int16x8_t v1719_tmp = vqrdmulhq_n_s16(v1718, 5041); int16x8_t v1719 = vmlaq_n_s16(v1719_tmp, v1718, 8); int16x8_t v1720 = vaddq_s16(v1717, v1719); int16x8_t v1721 = vsubq_s16(v747, v794); int16x8_t v1722 = vsubq_s16(v841, v888); int16x8_t v1723_tmp = vqrdmulhq_n_s16(v1722, 19146); int16x8_t v1723 = vmlaq_n_s16(v1723_tmp, v1722, 13); int16x8_t v1724 = vaddq_s16(v1721, v1723); int16x8_t v1725 = vsubq_s16(v141, v316); int16x8_t v1726 = vsubq_s16(v522, v698); int16x8_t v1727_tmp = vqrdmulhq_n_s16(v1726, 24402); int16x8_t v1727 = vmlaq_n_s16(v1727_tmp, v1726, 40); int16x8_t v1728 = vaddq_s16(v1725, v1727); int16x8_t v1729 = vsubq_s16(v1725, v1727); int16x8_t v1730 = vsubq_s16(v1721, v1723); int16x8_t v1731 = vsubq_s16(v1717, v1719); int16x8_t v1732 = vsubq_s16(v1713, v1715); int16x8_t v1733 = vsubq_s16(v1709, v1711); int16x8_t v1734 = vsubq_s16(v1705, v1707); int16x8_t v1735 = vsubq_s16(v1701, v1703); int16x8_t v1736 = vsubq_s16(v1697, v1699); int16x8_t v1737 = vsubq_s16(v1693, v1695); int16x8_t v1738 = vsubq_s16(v1689, v1691); int16x8_t v1739 = vsubq_s16(v1685, v1687); int16x8_t v1740 = vsubq_s16(v1681, v1683); int16x8_t v1741 = vsubq_s16(v1677, v1679); int16x8_t v1742 = vsubq_s16(v1673, v1675); int16x8_t v1743 = vsubq_s16(v1669, v1671); int16x8_t v1744 = vsubq_s16(v1665, v1667); int16x8_t v1745 = vsubq_s16(v1661, v1663); int16x8_t v1746 = vsubq_s16(v1657, v1659); int16x8_t v1747 = vsubq_s16(v1653, v1655); int16x8_t v1748 = vsubq_s16(v1649, v1651); int16x8_t v1749 = vsubq_s16(v1645, v1647); int16x8_t v1750 = vsubq_s16(v1641, v1643); int16x8_t v1751 = vsubq_s16(v1637, v1639); int16x8_t v1752 = vsubq_s16(v1633, v1635); int16x8_t v1753 = vsubq_s16(v1629, v1631); int16x8_t v1754 = vsubq_s16(v1625, v1627); int16x8_t v1755 = vsubq_s16(v1621, v1623); int16x8_t v1756 = vsubq_s16(v1617, v1619); int16x8_t v1757 = vsubq_s16(v1613, v1615); int16x8_t v1758 = vsubq_s16(v1609, v1611); int16x8_t v1759 = vsubq_s16(v1605, v1607); int16x8_t v1760 = vsubq_s16(v1601, v1603); int16x8_t v1761 = vsubq_s16(v1594, v1599); int16x8_t v1762 = vsubq_s16(v1584, v1589); int16x8_t v1763 = vsubq_s16(v1574, v1579); int16x8_t v1764 = vsubq_s16(v1564, v1569); int16x8_t v1765 = vsubq_s16(v1554, v1559); int16x8_t v1766 = vsubq_s16(v1544, v1549); int16x8_t v1767 = vsubq_s16(v1534, v1539); int16x8_t v1768 = vsubq_s16(v1524, v1529); int16x8_t v1769 = vsubq_s16(v1514, v1519); int16x8_t v1770 = vsubq_s16(v1504, v1509); int16x8_t v1771 = vsubq_s16(v1494, v1499); int16x8_t v1772 = vsubq_s16(v1484, v1489); int16x8_t v1773 = vsubq_s16(v1474, v1479); int16x8_t v1774 = vsubq_s16(v1464, v1469); int16x8_t v1775 = vsubq_s16(v1454, v1459); int16x8_t v1776 = vsubq_s16(v1444, v1449); int16x8_t v1777 = vsubq_s16(v1428, v1439); int16x8_t v1778 = vsubq_s16(v1406, v1417); int16x8_t v1779 = vsubq_s16(v1384, v1395); int16x8_t v1780 = vsubq_s16(v1362, v1373); int16x8_t v1781 = vsubq_s16(v1340, v1351); int16x8_t v1782 = vsubq_s16(v1318, v1329); int16x8_t v1783 = vsubq_s16(v1296, v1307); int16x8_t v1784 = vsubq_s16(v1274, v1285); int16x8_t v1785 = vsubq_s16(v1240, v1263); int16x8_t v1786 = vsubq_s16(v1194, v1217); int16x8_t v1787 = vsubq_s16(v1148, v1171); int16x8_t v1788 = vsubq_s16(v1102, v1125); int16x8_t v1789 = vsubq_s16(v1032, v1079); int16x8_t v1790 = vsubq_s16(v938, v985); int16x8_t v1791 = vsubq_s16(v795, v890); int16x8_t v1792 = vsubq_s16(v317, v700); vst1q_s16(out + out_stride * 0 + i, v701); vst1q_s16(out + out_stride * 1 + i, v891); vst1q_s16(out + out_stride * 2 + i, v986); vst1q_s16(out + out_stride * 3 + i, v1080); vst1q_s16(out + out_stride * 4 + i, v1126); vst1q_s16(out + out_stride * 5 + i, v1172); vst1q_s16(out + out_stride * 6 + i, v1218); vst1q_s16(out + out_stride * 7 + i, v1264); vst1q_s16(out + out_stride * 8 + i, v1286); vst1q_s16(out + out_stride * 9 + i, v1308); vst1q_s16(out + out_stride * 10 + i, v1330); vst1q_s16(out + out_stride * 11 + i, v1352); vst1q_s16(out + out_stride * 12 + i, v1374); vst1q_s16(out + out_stride * 13 + i, v1396); vst1q_s16(out + out_stride * 14 + i, v1418); vst1q_s16(out + out_stride * 15 + i, v1440); vst1q_s16(out + out_stride * 16 + i, v1450); vst1q_s16(out + out_stride * 17 + i, v1460); vst1q_s16(out + out_stride * 18 + i, v1470); vst1q_s16(out + out_stride * 19 + i, v1480); vst1q_s16(out + out_stride * 20 + i, v1490); vst1q_s16(out + out_stride * 21 + i, v1500); vst1q_s16(out + out_stride * 22 + i, v1510); vst1q_s16(out + out_stride * 23 + i, v1520); vst1q_s16(out + out_stride * 24 + i, v1530); vst1q_s16(out + out_stride * 25 + i, v1540); vst1q_s16(out + out_stride * 26 + i, v1550); vst1q_s16(out + out_stride * 27 + i, v1560); vst1q_s16(out + out_stride * 28 + i, v1570); vst1q_s16(out + out_stride * 29 + i, v1580); vst1q_s16(out + out_stride * 30 + i, v1590); vst1q_s16(out + out_stride * 31 + i, v1600); vst1q_s16(out + out_stride * 32 + i, v1604); vst1q_s16(out + out_stride * 33 + i, v1608); vst1q_s16(out + out_stride * 34 + i, v1612); vst1q_s16(out + out_stride * 35 + i, v1616); vst1q_s16(out + out_stride * 36 + i, v1620); vst1q_s16(out + out_stride * 37 + i, v1624); vst1q_s16(out + out_stride * 38 + i, v1628); vst1q_s16(out + out_stride * 39 + i, v1632); vst1q_s16(out + out_stride * 40 + i, v1636); vst1q_s16(out + out_stride * 41 + i, v1640); vst1q_s16(out + out_stride * 42 + i, v1644); vst1q_s16(out + out_stride * 43 + i, v1648); vst1q_s16(out + out_stride * 44 + i, v1652); vst1q_s16(out + out_stride * 45 + i, v1656); vst1q_s16(out + out_stride * 46 + i, v1660); vst1q_s16(out + out_stride * 47 + i, v1664); vst1q_s16(out + out_stride * 48 + i, v1668); vst1q_s16(out + out_stride * 49 + i, v1672); vst1q_s16(out + out_stride * 50 + i, v1676); vst1q_s16(out + out_stride * 51 + i, v1680); vst1q_s16(out + out_stride * 52 + i, v1684); vst1q_s16(out + out_stride * 53 + i, v1688); vst1q_s16(out + out_stride * 54 + i, v1692); vst1q_s16(out + out_stride * 55 + i, v1696); vst1q_s16(out + out_stride * 56 + i, v1700); vst1q_s16(out + out_stride * 57 + i, v1704); vst1q_s16(out + out_stride * 58 + i, v1708); vst1q_s16(out + out_stride * 59 + i, v1712); vst1q_s16(out + out_stride * 60 + i, v1716); vst1q_s16(out + out_stride * 61 + i, v1720); vst1q_s16(out + out_stride * 62 + i, v1724); vst1q_s16(out + out_stride * 63 + i, v1728); vst1q_s16(out + out_stride * 64 + i, v1729); vst1q_s16(out + out_stride * 65 + i, v1730); vst1q_s16(out + out_stride * 66 + i, v1731); vst1q_s16(out + out_stride * 67 + i, v1732); vst1q_s16(out + out_stride * 68 + i, v1733); vst1q_s16(out + out_stride * 69 + i, v1734); vst1q_s16(out + out_stride * 70 + i, v1735); vst1q_s16(out + out_stride * 71 + i, v1736); vst1q_s16(out + out_stride * 72 + i, v1737); vst1q_s16(out + out_stride * 73 + i, v1738); vst1q_s16(out + out_stride * 74 + i, v1739); vst1q_s16(out + out_stride * 75 + i, v1740); vst1q_s16(out + out_stride * 76 + i, v1741); vst1q_s16(out + out_stride * 77 + i, v1742); vst1q_s16(out + out_stride * 78 + i, v1743); vst1q_s16(out + out_stride * 79 + i, v1744); vst1q_s16(out + out_stride * 80 + i, v1745); vst1q_s16(out + out_stride * 81 + i, v1746); vst1q_s16(out + out_stride * 82 + i, v1747); vst1q_s16(out + out_stride * 83 + i, v1748); vst1q_s16(out + out_stride * 84 + i, v1749); vst1q_s16(out + out_stride * 85 + i, v1750); vst1q_s16(out + out_stride * 86 + i, v1751); vst1q_s16(out + out_stride * 87 + i, v1752); vst1q_s16(out + out_stride * 88 + i, v1753); vst1q_s16(out + out_stride * 89 + i, v1754); vst1q_s16(out + out_stride * 90 + i, v1755); vst1q_s16(out + out_stride * 91 + i, v1756); vst1q_s16(out + out_stride * 92 + i, v1757); vst1q_s16(out + out_stride * 93 + i, v1758); vst1q_s16(out + out_stride * 94 + i, v1759); vst1q_s16(out + out_stride * 95 + i, v1760); vst1q_s16(out + out_stride * 96 + i, v1761); vst1q_s16(out + out_stride * 97 + i, v1762); vst1q_s16(out + out_stride * 98 + i, v1763); vst1q_s16(out + out_stride * 99 + i, v1764); vst1q_s16(out + out_stride * 100 + i, v1765); vst1q_s16(out + out_stride * 101 + i, v1766); vst1q_s16(out + out_stride * 102 + i, v1767); vst1q_s16(out + out_stride * 103 + i, v1768); vst1q_s16(out + out_stride * 104 + i, v1769); vst1q_s16(out + out_stride * 105 + i, v1770); vst1q_s16(out + out_stride * 106 + i, v1771); vst1q_s16(out + out_stride * 107 + i, v1772); vst1q_s16(out + out_stride * 108 + i, v1773); vst1q_s16(out + out_stride * 109 + i, v1774); vst1q_s16(out + out_stride * 110 + i, v1775); vst1q_s16(out + out_stride * 111 + i, v1776); vst1q_s16(out + out_stride * 112 + i, v1777); vst1q_s16(out + out_stride * 113 + i, v1778); vst1q_s16(out + out_stride * 114 + i, v1779); vst1q_s16(out + out_stride * 115 + i, v1780); vst1q_s16(out + out_stride * 116 + i, v1781); vst1q_s16(out + out_stride * 117 + i, v1782); vst1q_s16(out + out_stride * 118 + i, v1783); vst1q_s16(out + out_stride * 119 + i, v1784); vst1q_s16(out + out_stride * 120 + i, v1785); vst1q_s16(out + out_stride * 121 + i, v1786); vst1q_s16(out + out_stride * 122 + i, v1787); vst1q_s16(out + out_stride * 123 + i, v1788); vst1q_s16(out + out_stride * 124 + i, v1789); vst1q_s16(out + out_stride * 125 + i, v1790); vst1q_s16(out + out_stride * 126 + i, v1791); vst1q_s16(out + out_stride * 127 + i, v1792); } }