summary refs log tree commit diff stats
path: root/third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h')
-rw-r--r-- third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h 2137
1 files changed, 2137 insertions, 0 deletions
diff --git a/third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h b/third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h
new file mode 100644
index 0000000000..1a94d3ee92
--- /dev/null
+++ b/third_party/jpeg-xl/lib/jxl/fast_dct128-inl.h
@@ -0,0 +1,2137 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/* This file is automatically generated. Do not modify it directly. */
+#if HWY_TARGET != HWY_NEON  // Code below uses NEON intrinsics (vld1q_s16, vqrdmulhq_n_s16, ...); fail the build for any other target.
+#error "only include this file from fast_dct-inl.h"
+#endif
+
+constexpr size_t FastIDCTIntegerBits(FastDCTTag<128>) { return 2; }  // Overload selected by FastDCTTag<128>; always 2. NOTE(review): presumably the fixed-point integer-bit count for FastIDCT<128> -- confirm against fast_dct-inl.h.
+
+void FastIDCT(FastDCTTag<128>, const int16_t* in, size_t in_stride,
+ int16_t* out, size_t out_stride, size_t count) {
+ JXL_ASSERT(count % 8 == 0);
+ for (size_t i = 0; i < count; i += 8) {
+ int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i);
+ int16x8_t v1 = vld1q_s16(in + in_stride * 64 + i);
+ int16x8_t v2 = vaddq_s16(v0, v1);
+ int16x8_t v3 = vld1q_s16(in + in_stride * 32 + i);
+ int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573);
+ int16x8_t v4 = vaddq_s16(v4_tmp, v3);
+ int16x8_t v5 = vld1q_s16(in + in_stride * 96 + i);
+ int16x8_t v6 = vaddq_s16(v5, v3);
+ int16x8_t v7 = vaddq_s16(v4, v6);
+ int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734);
+ int16x8_t v9 = vaddq_s16(v2, v8);
+ int16x8_t v10 = vld1q_s16(in + in_stride * 16 + i);
+ int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573);
+ int16x8_t v11 = vaddq_s16(v11_tmp, v10);
+ int16x8_t v12 = vld1q_s16(in + in_stride * 80 + i);
+ int16x8_t v13 = vld1q_s16(in + in_stride * 48 + i);
+ int16x8_t v14 = vaddq_s16(v12, v13);
+ int16x8_t v15 = vaddq_s16(v11, v14);
+ int16x8_t v16 = vaddq_s16(v13, v10);
+ int16x8_t v17_tmp = vqrdmulhq_n_s16(v16, 13573);
+ int16x8_t v17 = vaddq_s16(v17_tmp, v16);
+ int16x8_t v18 = vld1q_s16(in + in_stride * 112 + i);
+ int16x8_t v19 = vaddq_s16(v18, v12);
+ int16x8_t v20 = vaddq_s16(v19, v16);
+ int16x8_t v21 = vaddq_s16(v17, v20);
+ int16x8_t v22 = vqrdmulhq_n_s16(v21, 17734);
+ int16x8_t v23 = vaddq_s16(v15, v22);
+ int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705);
+ int16x8_t v25 = vaddq_s16(v9, v24);
+ int16x8_t v26 = vld1q_s16(in + in_stride * 8 + i);
+ int16x8_t v27_tmp = vqrdmulhq_n_s16(v26, 13573);
+ int16x8_t v27 = vaddq_s16(v27_tmp, v26);
+ int16x8_t v28 = vld1q_s16(in + in_stride * 72 + i);
+ int16x8_t v29 = vld1q_s16(in + in_stride * 56 + i);
+ int16x8_t v30 = vaddq_s16(v28, v29);
+ int16x8_t v31 = vaddq_s16(v27, v30);
+ int16x8_t v32 = vld1q_s16(in + in_stride * 40 + i);
+ int16x8_t v33 = vld1q_s16(in + in_stride * 24 + i);
+ int16x8_t v34 = vaddq_s16(v32, v33);
+ int16x8_t v35_tmp = vqrdmulhq_n_s16(v34, 13573);
+ int16x8_t v35 = vaddq_s16(v35_tmp, v34);
+ int16x8_t v36 = vld1q_s16(in + in_stride * 104 + i);
+ int16x8_t v37 = vld1q_s16(in + in_stride * 88 + i);
+ int16x8_t v38 = vaddq_s16(v36, v37);
+ int16x8_t v39 = vaddq_s16(v38, v34);
+ int16x8_t v40 = vaddq_s16(v35, v39);
+ int16x8_t v41 = vqrdmulhq_n_s16(v40, 17734);
+ int16x8_t v42 = vaddq_s16(v31, v41);
+ int16x8_t v43 = vaddq_s16(v33, v26);
+ int16x8_t v44_tmp = vqrdmulhq_n_s16(v43, 13573);
+ int16x8_t v44 = vaddq_s16(v44_tmp, v43);
+ int16x8_t v45 = vaddq_s16(v37, v28);
+ int16x8_t v46 = vaddq_s16(v29, v32);
+ int16x8_t v47 = vaddq_s16(v45, v46);
+ int16x8_t v48 = vaddq_s16(v44, v47);
+ int16x8_t v49 = vaddq_s16(v46, v43);
+ int16x8_t v50_tmp = vqrdmulhq_n_s16(v49, 13573);
+ int16x8_t v50 = vaddq_s16(v50_tmp, v49);
+ int16x8_t v51 = vld1q_s16(in + in_stride * 120 + i);
+ int16x8_t v52 = vaddq_s16(v51, v36);
+ int16x8_t v53 = vaddq_s16(v52, v45);
+ int16x8_t v54 = vaddq_s16(v53, v49);
+ int16x8_t v55 = vaddq_s16(v50, v54);
+ int16x8_t v56 = vqrdmulhq_n_s16(v55, 17734);
+ int16x8_t v57 = vaddq_s16(v48, v56);
+ int16x8_t v58 = vqrdmulhq_n_s16(v57, 16705);
+ int16x8_t v59 = vaddq_s16(v42, v58);
+ int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463);
+ int16x8_t v61 = vaddq_s16(v25, v60);
+ int16x8_t v62 = vld1q_s16(in + in_stride * 4 + i);
+ int16x8_t v63_tmp = vqrdmulhq_n_s16(v62, 13573);
+ int16x8_t v63 = vaddq_s16(v63_tmp, v62);
+ int16x8_t v64 = vld1q_s16(in + in_stride * 68 + i);
+ int16x8_t v65 = vld1q_s16(in + in_stride * 60 + i);
+ int16x8_t v66 = vaddq_s16(v64, v65);
+ int16x8_t v67 = vaddq_s16(v63, v66);
+ int16x8_t v68 = vld1q_s16(in + in_stride * 36 + i);
+ int16x8_t v69 = vld1q_s16(in + in_stride * 28 + i);
+ int16x8_t v70 = vaddq_s16(v68, v69);
+ int16x8_t v71_tmp = vqrdmulhq_n_s16(v70, 13573);
+ int16x8_t v71 = vaddq_s16(v71_tmp, v70);
+ int16x8_t v72 = vld1q_s16(in + in_stride * 100 + i);
+ int16x8_t v73 = vld1q_s16(in + in_stride * 92 + i);
+ int16x8_t v74 = vaddq_s16(v72, v73);
+ int16x8_t v75 = vaddq_s16(v74, v70);
+ int16x8_t v76 = vaddq_s16(v71, v75);
+ int16x8_t v77 = vqrdmulhq_n_s16(v76, 17734);
+ int16x8_t v78 = vaddq_s16(v67, v77);
+ int16x8_t v79 = vld1q_s16(in + in_stride * 20 + i);
+ int16x8_t v80 = vld1q_s16(in + in_stride * 12 + i);
+ int16x8_t v81 = vaddq_s16(v79, v80);
+ int16x8_t v82_tmp = vqrdmulhq_n_s16(v81, 13573);
+ int16x8_t v82 = vaddq_s16(v82_tmp, v81);
+ int16x8_t v83 = vld1q_s16(in + in_stride * 84 + i);
+ int16x8_t v84 = vld1q_s16(in + in_stride * 76 + i);
+ int16x8_t v85 = vaddq_s16(v83, v84);
+ int16x8_t v86 = vld1q_s16(in + in_stride * 52 + i);
+ int16x8_t v87 = vld1q_s16(in + in_stride * 44 + i);
+ int16x8_t v88 = vaddq_s16(v86, v87);
+ int16x8_t v89 = vaddq_s16(v85, v88);
+ int16x8_t v90 = vaddq_s16(v82, v89);
+ int16x8_t v91 = vaddq_s16(v88, v81);
+ int16x8_t v92_tmp = vqrdmulhq_n_s16(v91, 13573);
+ int16x8_t v92 = vaddq_s16(v92_tmp, v91);
+ int16x8_t v93 = vld1q_s16(in + in_stride * 116 + i);
+ int16x8_t v94 = vld1q_s16(in + in_stride * 108 + i);
+ int16x8_t v95 = vaddq_s16(v93, v94);
+ int16x8_t v96 = vaddq_s16(v95, v85);
+ int16x8_t v97 = vaddq_s16(v96, v91);
+ int16x8_t v98 = vaddq_s16(v92, v97);
+ int16x8_t v99 = vqrdmulhq_n_s16(v98, 17734);
+ int16x8_t v100 = vaddq_s16(v90, v99);
+ int16x8_t v101 = vqrdmulhq_n_s16(v100, 16705);
+ int16x8_t v102 = vaddq_s16(v78, v101);
+ int16x8_t v103 = vaddq_s16(v80, v62);
+ int16x8_t v104_tmp = vqrdmulhq_n_s16(v103, 13573);
+ int16x8_t v104 = vaddq_s16(v104_tmp, v103);
+ int16x8_t v105 = vaddq_s16(v84, v64);
+ int16x8_t v106 = vaddq_s16(v65, v86);
+ int16x8_t v107 = vaddq_s16(v105, v106);
+ int16x8_t v108 = vaddq_s16(v104, v107);
+ int16x8_t v109 = vaddq_s16(v87, v68);
+ int16x8_t v110 = vaddq_s16(v69, v79);
+ int16x8_t v111 = vaddq_s16(v109, v110);
+ int16x8_t v112_tmp = vqrdmulhq_n_s16(v111, 13573);
+ int16x8_t v112 = vaddq_s16(v112_tmp, v111);
+ int16x8_t v113 = vaddq_s16(v94, v72);
+ int16x8_t v114 = vaddq_s16(v73, v83);
+ int16x8_t v115 = vaddq_s16(v113, v114);
+ int16x8_t v116 = vaddq_s16(v115, v111);
+ int16x8_t v117 = vaddq_s16(v112, v116);
+ int16x8_t v118 = vqrdmulhq_n_s16(v117, 17734);
+ int16x8_t v119 = vaddq_s16(v108, v118);
+ int16x8_t v120 = vaddq_s16(v110, v103);
+ int16x8_t v121_tmp = vqrdmulhq_n_s16(v120, 13573);
+ int16x8_t v121 = vaddq_s16(v121_tmp, v120);
+ int16x8_t v122 = vaddq_s16(v114, v105);
+ int16x8_t v123 = vaddq_s16(v106, v109);
+ int16x8_t v124 = vaddq_s16(v122, v123);
+ int16x8_t v125 = vaddq_s16(v121, v124);
+ int16x8_t v126 = vaddq_s16(v123, v120);
+ int16x8_t v127_tmp = vqrdmulhq_n_s16(v126, 13573);
+ int16x8_t v127 = vaddq_s16(v127_tmp, v126);
+ int16x8_t v128 = vld1q_s16(in + in_stride * 124 + i);
+ int16x8_t v129 = vaddq_s16(v128, v93);
+ int16x8_t v130 = vaddq_s16(v129, v113);
+ int16x8_t v131 = vaddq_s16(v130, v122);
+ int16x8_t v132 = vaddq_s16(v131, v126);
+ int16x8_t v133 = vaddq_s16(v127, v132);
+ int16x8_t v134 = vqrdmulhq_n_s16(v133, 17734);
+ int16x8_t v135 = vaddq_s16(v125, v134);
+ int16x8_t v136 = vqrdmulhq_n_s16(v135, 16705);
+ int16x8_t v137 = vaddq_s16(v119, v136);
+ int16x8_t v138 = vqrdmulhq_n_s16(v137, 16463);
+ int16x8_t v139 = vaddq_s16(v102, v138);
+ int16x8_t v140 = vqrdmulhq_n_s16(v139, 16404);
+ int16x8_t v141 = vaddq_s16(v61, v140);
+ int16x8_t v142 = vld1q_s16(in + in_stride * 2 + i);
+ int16x8_t v143_tmp = vqrdmulhq_n_s16(v142, 13573);
+ int16x8_t v143 = vaddq_s16(v143_tmp, v142);
+ int16x8_t v144 = vld1q_s16(in + in_stride * 66 + i);
+ int16x8_t v145 = vld1q_s16(in + in_stride * 62 + i);
+ int16x8_t v146 = vaddq_s16(v144, v145);
+ int16x8_t v147 = vaddq_s16(v143, v146);
+ int16x8_t v148 = vld1q_s16(in + in_stride * 34 + i);
+ int16x8_t v149 = vld1q_s16(in + in_stride * 30 + i);
+ int16x8_t v150 = vaddq_s16(v148, v149);
+ int16x8_t v151_tmp = vqrdmulhq_n_s16(v150, 13573);
+ int16x8_t v151 = vaddq_s16(v151_tmp, v150);
+ int16x8_t v152 = vld1q_s16(in + in_stride * 98 + i);
+ int16x8_t v153 = vld1q_s16(in + in_stride * 94 + i);
+ int16x8_t v154 = vaddq_s16(v152, v153);
+ int16x8_t v155 = vaddq_s16(v154, v150);
+ int16x8_t v156 = vaddq_s16(v151, v155);
+ int16x8_t v157 = vqrdmulhq_n_s16(v156, 17734);
+ int16x8_t v158 = vaddq_s16(v147, v157);
+ int16x8_t v159 = vld1q_s16(in + in_stride * 18 + i);
+ int16x8_t v160 = vld1q_s16(in + in_stride * 14 + i);
+ int16x8_t v161 = vaddq_s16(v159, v160);
+ int16x8_t v162_tmp = vqrdmulhq_n_s16(v161, 13573);
+ int16x8_t v162 = vaddq_s16(v162_tmp, v161);
+ int16x8_t v163 = vld1q_s16(in + in_stride * 82 + i);
+ int16x8_t v164 = vld1q_s16(in + in_stride * 78 + i);
+ int16x8_t v165 = vaddq_s16(v163, v164);
+ int16x8_t v166 = vld1q_s16(in + in_stride * 50 + i);
+ int16x8_t v167 = vld1q_s16(in + in_stride * 46 + i);
+ int16x8_t v168 = vaddq_s16(v166, v167);
+ int16x8_t v169 = vaddq_s16(v165, v168);
+ int16x8_t v170 = vaddq_s16(v162, v169);
+ int16x8_t v171 = vaddq_s16(v168, v161);
+ int16x8_t v172_tmp = vqrdmulhq_n_s16(v171, 13573);
+ int16x8_t v172 = vaddq_s16(v172_tmp, v171);
+ int16x8_t v173 = vld1q_s16(in + in_stride * 114 + i);
+ int16x8_t v174 = vld1q_s16(in + in_stride * 110 + i);
+ int16x8_t v175 = vaddq_s16(v173, v174);
+ int16x8_t v176 = vaddq_s16(v175, v165);
+ int16x8_t v177 = vaddq_s16(v176, v171);
+ int16x8_t v178 = vaddq_s16(v172, v177);
+ int16x8_t v179 = vqrdmulhq_n_s16(v178, 17734);
+ int16x8_t v180 = vaddq_s16(v170, v179);
+ int16x8_t v181 = vqrdmulhq_n_s16(v180, 16705);
+ int16x8_t v182 = vaddq_s16(v158, v181);
+ int16x8_t v183 = vld1q_s16(in + in_stride * 10 + i);
+ int16x8_t v184 = vld1q_s16(in + in_stride * 6 + i);
+ int16x8_t v185 = vaddq_s16(v183, v184);
+ int16x8_t v186_tmp = vqrdmulhq_n_s16(v185, 13573);
+ int16x8_t v186 = vaddq_s16(v186_tmp, v185);
+ int16x8_t v187 = vld1q_s16(in + in_stride * 74 + i);
+ int16x8_t v188 = vld1q_s16(in + in_stride * 70 + i);
+ int16x8_t v189 = vaddq_s16(v187, v188);
+ int16x8_t v190 = vld1q_s16(in + in_stride * 58 + i);
+ int16x8_t v191 = vld1q_s16(in + in_stride * 54 + i);
+ int16x8_t v192 = vaddq_s16(v190, v191);
+ int16x8_t v193 = vaddq_s16(v189, v192);
+ int16x8_t v194 = vaddq_s16(v186, v193);
+ int16x8_t v195 = vld1q_s16(in + in_stride * 42 + i);
+ int16x8_t v196 = vld1q_s16(in + in_stride * 38 + i);
+ int16x8_t v197 = vaddq_s16(v195, v196);
+ int16x8_t v198 = vld1q_s16(in + in_stride * 26 + i);
+ int16x8_t v199 = vld1q_s16(in + in_stride * 22 + i);
+ int16x8_t v200 = vaddq_s16(v198, v199);
+ int16x8_t v201 = vaddq_s16(v197, v200);
+ int16x8_t v202_tmp = vqrdmulhq_n_s16(v201, 13573);
+ int16x8_t v202 = vaddq_s16(v202_tmp, v201);
+ int16x8_t v203 = vld1q_s16(in + in_stride * 106 + i);
+ int16x8_t v204 = vld1q_s16(in + in_stride * 102 + i);
+ int16x8_t v205 = vaddq_s16(v203, v204);
+ int16x8_t v206 = vld1q_s16(in + in_stride * 90 + i);
+ int16x8_t v207 = vld1q_s16(in + in_stride * 86 + i);
+ int16x8_t v208 = vaddq_s16(v206, v207);
+ int16x8_t v209 = vaddq_s16(v205, v208);
+ int16x8_t v210 = vaddq_s16(v209, v201);
+ int16x8_t v211 = vaddq_s16(v202, v210);
+ int16x8_t v212 = vqrdmulhq_n_s16(v211, 17734);
+ int16x8_t v213 = vaddq_s16(v194, v212);
+ int16x8_t v214 = vaddq_s16(v200, v185);
+ int16x8_t v215_tmp = vqrdmulhq_n_s16(v214, 13573);
+ int16x8_t v215 = vaddq_s16(v215_tmp, v214);
+ int16x8_t v216 = vaddq_s16(v208, v189);
+ int16x8_t v217 = vaddq_s16(v192, v197);
+ int16x8_t v218 = vaddq_s16(v216, v217);
+ int16x8_t v219 = vaddq_s16(v215, v218);
+ int16x8_t v220 = vaddq_s16(v217, v214);
+ int16x8_t v221_tmp = vqrdmulhq_n_s16(v220, 13573);
+ int16x8_t v221 = vaddq_s16(v221_tmp, v220);
+ int16x8_t v222 = vld1q_s16(in + in_stride * 122 + i);
+ int16x8_t v223 = vld1q_s16(in + in_stride * 118 + i);
+ int16x8_t v224 = vaddq_s16(v222, v223);
+ int16x8_t v225 = vaddq_s16(v224, v205);
+ int16x8_t v226 = vaddq_s16(v225, v216);
+ int16x8_t v227 = vaddq_s16(v226, v220);
+ int16x8_t v228 = vaddq_s16(v221, v227);
+ int16x8_t v229 = vqrdmulhq_n_s16(v228, 17734);
+ int16x8_t v230 = vaddq_s16(v219, v229);
+ int16x8_t v231 = vqrdmulhq_n_s16(v230, 16705);
+ int16x8_t v232 = vaddq_s16(v213, v231);
+ int16x8_t v233 = vqrdmulhq_n_s16(v232, 16463);
+ int16x8_t v234 = vaddq_s16(v182, v233);
+ int16x8_t v235 = vaddq_s16(v184, v142);
+ int16x8_t v236_tmp = vqrdmulhq_n_s16(v235, 13573);
+ int16x8_t v236 = vaddq_s16(v236_tmp, v235);
+ int16x8_t v237 = vaddq_s16(v188, v144);
+ int16x8_t v238 = vaddq_s16(v145, v190);
+ int16x8_t v239 = vaddq_s16(v237, v238);
+ int16x8_t v240 = vaddq_s16(v236, v239);
+ int16x8_t v241 = vaddq_s16(v196, v148);
+ int16x8_t v242 = vaddq_s16(v149, v198);
+ int16x8_t v243 = vaddq_s16(v241, v242);
+ int16x8_t v244_tmp = vqrdmulhq_n_s16(v243, 13573);
+ int16x8_t v244 = vaddq_s16(v244_tmp, v243);
+ int16x8_t v245 = vaddq_s16(v204, v152);
+ int16x8_t v246 = vaddq_s16(v153, v206);
+ int16x8_t v247 = vaddq_s16(v245, v246);
+ int16x8_t v248 = vaddq_s16(v247, v243);
+ int16x8_t v249 = vaddq_s16(v244, v248);
+ int16x8_t v250 = vqrdmulhq_n_s16(v249, 17734);
+ int16x8_t v251 = vaddq_s16(v240, v250);
+ int16x8_t v252 = vaddq_s16(v199, v159);
+ int16x8_t v253 = vaddq_s16(v160, v183);
+ int16x8_t v254 = vaddq_s16(v252, v253);
+ int16x8_t v255_tmp = vqrdmulhq_n_s16(v254, 13573);
+ int16x8_t v255 = vaddq_s16(v255_tmp, v254);
+ int16x8_t v256 = vaddq_s16(v207, v163);
+ int16x8_t v257 = vaddq_s16(v164, v187);
+ int16x8_t v258 = vaddq_s16(v256, v257);
+ int16x8_t v259 = vaddq_s16(v191, v166);
+ int16x8_t v260 = vaddq_s16(v167, v195);
+ int16x8_t v261 = vaddq_s16(v259, v260);
+ int16x8_t v262 = vaddq_s16(v258, v261);
+ int16x8_t v263 = vaddq_s16(v255, v262);
+ int16x8_t v264 = vaddq_s16(v261, v254);
+ int16x8_t v265_tmp = vqrdmulhq_n_s16(v264, 13573);
+ int16x8_t v265 = vaddq_s16(v265_tmp, v264);
+ int16x8_t v266 = vaddq_s16(v223, v173);
+ int16x8_t v267 = vaddq_s16(v174, v203);
+ int16x8_t v268 = vaddq_s16(v266, v267);
+ int16x8_t v269 = vaddq_s16(v268, v258);
+ int16x8_t v270 = vaddq_s16(v269, v264);
+ int16x8_t v271 = vaddq_s16(v265, v270);
+ int16x8_t v272 = vqrdmulhq_n_s16(v271, 17734);
+ int16x8_t v273 = vaddq_s16(v263, v272);
+ int16x8_t v274 = vqrdmulhq_n_s16(v273, 16705);
+ int16x8_t v275 = vaddq_s16(v251, v274);
+ int16x8_t v276 = vaddq_s16(v253, v235);
+ int16x8_t v277_tmp = vqrdmulhq_n_s16(v276, 13573);
+ int16x8_t v277 = vaddq_s16(v277_tmp, v276);
+ int16x8_t v278 = vaddq_s16(v257, v237);
+ int16x8_t v279 = vaddq_s16(v238, v259);
+ int16x8_t v280 = vaddq_s16(v278, v279);
+ int16x8_t v281 = vaddq_s16(v277, v280);
+ int16x8_t v282 = vaddq_s16(v260, v241);
+ int16x8_t v283 = vaddq_s16(v242, v252);
+ int16x8_t v284 = vaddq_s16(v282, v283);
+ int16x8_t v285_tmp = vqrdmulhq_n_s16(v284, 13573);
+ int16x8_t v285 = vaddq_s16(v285_tmp, v284);
+ int16x8_t v286 = vaddq_s16(v267, v245);
+ int16x8_t v287 = vaddq_s16(v246, v256);
+ int16x8_t v288 = vaddq_s16(v286, v287);
+ int16x8_t v289 = vaddq_s16(v288, v284);
+ int16x8_t v290 = vaddq_s16(v285, v289);
+ int16x8_t v291 = vqrdmulhq_n_s16(v290, 17734);
+ int16x8_t v292 = vaddq_s16(v281, v291);
+ int16x8_t v293 = vaddq_s16(v283, v276);
+ int16x8_t v294_tmp = vqrdmulhq_n_s16(v293, 13573);
+ int16x8_t v294 = vaddq_s16(v294_tmp, v293);
+ int16x8_t v295 = vaddq_s16(v287, v278);
+ int16x8_t v296 = vaddq_s16(v279, v282);
+ int16x8_t v297 = vaddq_s16(v295, v296);
+ int16x8_t v298 = vaddq_s16(v294, v297);
+ int16x8_t v299 = vaddq_s16(v296, v293);
+ int16x8_t v300_tmp = vqrdmulhq_n_s16(v299, 13573);
+ int16x8_t v300 = vaddq_s16(v300_tmp, v299);
+ int16x8_t v301 = vld1q_s16(in + in_stride * 126 + i);
+ int16x8_t v302 = vaddq_s16(v301, v222);
+ int16x8_t v303 = vaddq_s16(v302, v266);
+ int16x8_t v304 = vaddq_s16(v303, v286);
+ int16x8_t v305 = vaddq_s16(v304, v295);
+ int16x8_t v306 = vaddq_s16(v305, v299);
+ int16x8_t v307 = vaddq_s16(v300, v306);
+ int16x8_t v308 = vqrdmulhq_n_s16(v307, 17734);
+ int16x8_t v309 = vaddq_s16(v298, v308);
+ int16x8_t v310 = vqrdmulhq_n_s16(v309, 16705);
+ int16x8_t v311 = vaddq_s16(v292, v310);
+ int16x8_t v312 = vqrdmulhq_n_s16(v311, 16463);
+ int16x8_t v313 = vaddq_s16(v275, v312);
+ int16x8_t v314 = vqrdmulhq_n_s16(v313, 16404);
+ int16x8_t v315 = vaddq_s16(v234, v314);
+ int16x8_t v316 = vqrdmulhq_n_s16(v315, 16389);
+ int16x8_t v317 = vaddq_s16(v141, v316);
+ int16x8_t v318 = vld1q_s16(in + in_stride * 1 + i);
+ int16x8_t v319_tmp = vqrdmulhq_n_s16(v318, 13573);
+ int16x8_t v319 = vaddq_s16(v319_tmp, v318);
+ int16x8_t v320 = vld1q_s16(in + in_stride * 65 + i);
+ int16x8_t v321 = vld1q_s16(in + in_stride * 63 + i);
+ int16x8_t v322 = vaddq_s16(v320, v321);
+ int16x8_t v323 = vaddq_s16(v319, v322);
+ int16x8_t v324 = vld1q_s16(in + in_stride * 33 + i);
+ int16x8_t v325 = vld1q_s16(in + in_stride * 31 + i);
+ int16x8_t v326 = vaddq_s16(v324, v325);
+ int16x8_t v327_tmp = vqrdmulhq_n_s16(v326, 13573);
+ int16x8_t v327 = vaddq_s16(v327_tmp, v326);
+ int16x8_t v328 = vld1q_s16(in + in_stride * 97 + i);
+ int16x8_t v329 = vld1q_s16(in + in_stride * 95 + i);
+ int16x8_t v330 = vaddq_s16(v328, v329);
+ int16x8_t v331 = vaddq_s16(v330, v326);
+ int16x8_t v332 = vaddq_s16(v327, v331);
+ int16x8_t v333 = vqrdmulhq_n_s16(v332, 17734);
+ int16x8_t v334 = vaddq_s16(v323, v333);
+ int16x8_t v335 = vld1q_s16(in + in_stride * 17 + i);
+ int16x8_t v336 = vld1q_s16(in + in_stride * 15 + i);
+ int16x8_t v337 = vaddq_s16(v335, v336);
+ int16x8_t v338_tmp = vqrdmulhq_n_s16(v337, 13573);
+ int16x8_t v338 = vaddq_s16(v338_tmp, v337);
+ int16x8_t v339 = vld1q_s16(in + in_stride * 81 + i);
+ int16x8_t v340 = vld1q_s16(in + in_stride * 79 + i);
+ int16x8_t v341 = vaddq_s16(v339, v340);
+ int16x8_t v342 = vld1q_s16(in + in_stride * 49 + i);
+ int16x8_t v343 = vld1q_s16(in + in_stride * 47 + i);
+ int16x8_t v344 = vaddq_s16(v342, v343);
+ int16x8_t v345 = vaddq_s16(v341, v344);
+ int16x8_t v346 = vaddq_s16(v338, v345);
+ int16x8_t v347 = vaddq_s16(v344, v337);
+ int16x8_t v348_tmp = vqrdmulhq_n_s16(v347, 13573);
+ int16x8_t v348 = vaddq_s16(v348_tmp, v347);
+ int16x8_t v349 = vld1q_s16(in + in_stride * 113 + i);
+ int16x8_t v350 = vld1q_s16(in + in_stride * 111 + i);
+ int16x8_t v351 = vaddq_s16(v349, v350);
+ int16x8_t v352 = vaddq_s16(v351, v341);
+ int16x8_t v353 = vaddq_s16(v352, v347);
+ int16x8_t v354 = vaddq_s16(v348, v353);
+ int16x8_t v355 = vqrdmulhq_n_s16(v354, 17734);
+ int16x8_t v356 = vaddq_s16(v346, v355);
+ int16x8_t v357 = vqrdmulhq_n_s16(v356, 16705);
+ int16x8_t v358 = vaddq_s16(v334, v357);
+ int16x8_t v359 = vld1q_s16(in + in_stride * 9 + i);
+ int16x8_t v360 = vld1q_s16(in + in_stride * 7 + i);
+ int16x8_t v361 = vaddq_s16(v359, v360);
+ int16x8_t v362_tmp = vqrdmulhq_n_s16(v361, 13573);
+ int16x8_t v362 = vaddq_s16(v362_tmp, v361);
+ int16x8_t v363 = vld1q_s16(in + in_stride * 73 + i);
+ int16x8_t v364 = vld1q_s16(in + in_stride * 71 + i);
+ int16x8_t v365 = vaddq_s16(v363, v364);
+ int16x8_t v366 = vld1q_s16(in + in_stride * 57 + i);
+ int16x8_t v367 = vld1q_s16(in + in_stride * 55 + i);
+ int16x8_t v368 = vaddq_s16(v366, v367);
+ int16x8_t v369 = vaddq_s16(v365, v368);
+ int16x8_t v370 = vaddq_s16(v362, v369);
+ int16x8_t v371 = vld1q_s16(in + in_stride * 41 + i);
+ int16x8_t v372 = vld1q_s16(in + in_stride * 39 + i);
+ int16x8_t v373 = vaddq_s16(v371, v372);
+ int16x8_t v374 = vld1q_s16(in + in_stride * 25 + i);
+ int16x8_t v375 = vld1q_s16(in + in_stride * 23 + i);
+ int16x8_t v376 = vaddq_s16(v374, v375);
+ int16x8_t v377 = vaddq_s16(v373, v376);
+ int16x8_t v378_tmp = vqrdmulhq_n_s16(v377, 13573);
+ int16x8_t v378 = vaddq_s16(v378_tmp, v377);
+ int16x8_t v379 = vld1q_s16(in + in_stride * 105 + i);
+ int16x8_t v380 = vld1q_s16(in + in_stride * 103 + i);
+ int16x8_t v381 = vaddq_s16(v379, v380);
+ int16x8_t v382 = vld1q_s16(in + in_stride * 89 + i);
+ int16x8_t v383 = vld1q_s16(in + in_stride * 87 + i);
+ int16x8_t v384 = vaddq_s16(v382, v383);
+ int16x8_t v385 = vaddq_s16(v381, v384);
+ int16x8_t v386 = vaddq_s16(v385, v377);
+ int16x8_t v387 = vaddq_s16(v378, v386);
+ int16x8_t v388 = vqrdmulhq_n_s16(v387, 17734);
+ int16x8_t v389 = vaddq_s16(v370, v388);
+ int16x8_t v390 = vaddq_s16(v376, v361);
+ int16x8_t v391_tmp = vqrdmulhq_n_s16(v390, 13573);
+ int16x8_t v391 = vaddq_s16(v391_tmp, v390);
+ int16x8_t v392 = vaddq_s16(v384, v365);
+ int16x8_t v393 = vaddq_s16(v368, v373);
+ int16x8_t v394 = vaddq_s16(v392, v393);
+ int16x8_t v395 = vaddq_s16(v391, v394);
+ int16x8_t v396 = vaddq_s16(v393, v390);
+ int16x8_t v397_tmp = vqrdmulhq_n_s16(v396, 13573);
+ int16x8_t v397 = vaddq_s16(v397_tmp, v396);
+ int16x8_t v398 = vld1q_s16(in + in_stride * 121 + i);
+ int16x8_t v399 = vld1q_s16(in + in_stride * 119 + i);
+ int16x8_t v400 = vaddq_s16(v398, v399);
+ int16x8_t v401 = vaddq_s16(v400, v381);
+ int16x8_t v402 = vaddq_s16(v401, v392);
+ int16x8_t v403 = vaddq_s16(v402, v396);
+ int16x8_t v404 = vaddq_s16(v397, v403);
+ int16x8_t v405 = vqrdmulhq_n_s16(v404, 17734);
+ int16x8_t v406 = vaddq_s16(v395, v405);
+ int16x8_t v407 = vqrdmulhq_n_s16(v406, 16705);
+ int16x8_t v408 = vaddq_s16(v389, v407);
+ int16x8_t v409 = vqrdmulhq_n_s16(v408, 16463);
+ int16x8_t v410 = vaddq_s16(v358, v409);
+ int16x8_t v411 = vld1q_s16(in + in_stride * 5 + i);
+ int16x8_t v412 = vld1q_s16(in + in_stride * 3 + i);
+ int16x8_t v413 = vaddq_s16(v411, v412);
+ int16x8_t v414_tmp = vqrdmulhq_n_s16(v413, 13573);
+ int16x8_t v414 = vaddq_s16(v414_tmp, v413);
+ int16x8_t v415 = vld1q_s16(in + in_stride * 69 + i);
+ int16x8_t v416 = vld1q_s16(in + in_stride * 67 + i);
+ int16x8_t v417 = vaddq_s16(v415, v416);
+ int16x8_t v418 = vld1q_s16(in + in_stride * 61 + i);
+ int16x8_t v419 = vld1q_s16(in + in_stride * 59 + i);
+ int16x8_t v420 = vaddq_s16(v418, v419);
+ int16x8_t v421 = vaddq_s16(v417, v420);
+ int16x8_t v422 = vaddq_s16(v414, v421);
+ int16x8_t v423 = vld1q_s16(in + in_stride * 37 + i);
+ int16x8_t v424 = vld1q_s16(in + in_stride * 35 + i);
+ int16x8_t v425 = vaddq_s16(v423, v424);
+ int16x8_t v426 = vld1q_s16(in + in_stride * 29 + i);
+ int16x8_t v427 = vld1q_s16(in + in_stride * 27 + i);
+ int16x8_t v428 = vaddq_s16(v426, v427);
+ int16x8_t v429 = vaddq_s16(v425, v428);
+ int16x8_t v430_tmp = vqrdmulhq_n_s16(v429, 13573);
+ int16x8_t v430 = vaddq_s16(v430_tmp, v429);
+ int16x8_t v431 = vld1q_s16(in + in_stride * 101 + i);
+ int16x8_t v432 = vld1q_s16(in + in_stride * 99 + i);
+ int16x8_t v433 = vaddq_s16(v431, v432);
+ int16x8_t v434 = vld1q_s16(in + in_stride * 93 + i);
+ int16x8_t v435 = vld1q_s16(in + in_stride * 91 + i);
+ int16x8_t v436 = vaddq_s16(v434, v435);
+ int16x8_t v437 = vaddq_s16(v433, v436);
+ int16x8_t v438 = vaddq_s16(v437, v429);
+ int16x8_t v439 = vaddq_s16(v430, v438);
+ int16x8_t v440 = vqrdmulhq_n_s16(v439, 17734);
+ int16x8_t v441 = vaddq_s16(v422, v440);
+ int16x8_t v442 = vld1q_s16(in + in_stride * 21 + i);
+ int16x8_t v443 = vld1q_s16(in + in_stride * 19 + i);
+ int16x8_t v444 = vaddq_s16(v442, v443);
+ int16x8_t v445 = vld1q_s16(in + in_stride * 13 + i);
+ int16x8_t v446 = vld1q_s16(in + in_stride * 11 + i);
+ int16x8_t v447 = vaddq_s16(v445, v446);
+ int16x8_t v448 = vaddq_s16(v444, v447);
+ int16x8_t v449_tmp = vqrdmulhq_n_s16(v448, 13573);
+ int16x8_t v449 = vaddq_s16(v449_tmp, v448);
+ int16x8_t v450 = vld1q_s16(in + in_stride * 85 + i);
+ int16x8_t v451 = vld1q_s16(in + in_stride * 83 + i);
+ int16x8_t v452 = vaddq_s16(v450, v451);
+ int16x8_t v453 = vld1q_s16(in + in_stride * 77 + i);
+ int16x8_t v454 = vld1q_s16(in + in_stride * 75 + i);
+ int16x8_t v455 = vaddq_s16(v453, v454);
+ int16x8_t v456 = vaddq_s16(v452, v455);
+ int16x8_t v457 = vld1q_s16(in + in_stride * 53 + i);
+ int16x8_t v458 = vld1q_s16(in + in_stride * 51 + i);
+ int16x8_t v459 = vaddq_s16(v457, v458);
+ int16x8_t v460 = vld1q_s16(in + in_stride * 45 + i);
+ int16x8_t v461 = vld1q_s16(in + in_stride * 43 + i);
+ int16x8_t v462 = vaddq_s16(v460, v461);
+ int16x8_t v463 = vaddq_s16(v459, v462);
+ int16x8_t v464 = vaddq_s16(v456, v463);
+ int16x8_t v465 = vaddq_s16(v449, v464);
+ int16x8_t v466 = vaddq_s16(v463, v448);
+ int16x8_t v467_tmp = vqrdmulhq_n_s16(v466, 13573);
+ int16x8_t v467 = vaddq_s16(v467_tmp, v466);
+ int16x8_t v468 = vld1q_s16(in + in_stride * 117 + i);
+ int16x8_t v469 = vld1q_s16(in + in_stride * 115 + i);
+ int16x8_t v470 = vaddq_s16(v468, v469);
+ int16x8_t v471 = vld1q_s16(in + in_stride * 109 + i);
+ int16x8_t v472 = vld1q_s16(in + in_stride * 107 + i);
+ int16x8_t v473 = vaddq_s16(v471, v472);
+ int16x8_t v474 = vaddq_s16(v470, v473);
+ int16x8_t v475 = vaddq_s16(v474, v456);
+ int16x8_t v476 = vaddq_s16(v475, v466);
+ int16x8_t v477 = vaddq_s16(v467, v476);
+ int16x8_t v478 = vqrdmulhq_n_s16(v477, 17734);
+ int16x8_t v479 = vaddq_s16(v465, v478);
+ int16x8_t v480 = vqrdmulhq_n_s16(v479, 16705);
+ int16x8_t v481 = vaddq_s16(v441, v480);
+ int16x8_t v482 = vaddq_s16(v447, v413);
+ int16x8_t v483_tmp = vqrdmulhq_n_s16(v482, 13573);
+ int16x8_t v483 = vaddq_s16(v483_tmp, v482);
+ int16x8_t v484 = vaddq_s16(v455, v417);
+ int16x8_t v485 = vaddq_s16(v420, v459);
+ int16x8_t v486 = vaddq_s16(v484, v485);
+ int16x8_t v487 = vaddq_s16(v483, v486);
+ int16x8_t v488 = vaddq_s16(v462, v425);
+ int16x8_t v489 = vaddq_s16(v428, v444);
+ int16x8_t v490 = vaddq_s16(v488, v489);
+ int16x8_t v491_tmp = vqrdmulhq_n_s16(v490, 13573);
+ int16x8_t v491 = vaddq_s16(v491_tmp, v490);
+ int16x8_t v492 = vaddq_s16(v473, v433);
+ int16x8_t v493 = vaddq_s16(v436, v452);
+ int16x8_t v494 = vaddq_s16(v492, v493);
+ int16x8_t v495 = vaddq_s16(v494, v490);
+ int16x8_t v496 = vaddq_s16(v491, v495);
+ int16x8_t v497 = vqrdmulhq_n_s16(v496, 17734);
+ int16x8_t v498 = vaddq_s16(v487, v497);
+ int16x8_t v499 = vaddq_s16(v489, v482);
+ int16x8_t v500_tmp = vqrdmulhq_n_s16(v499, 13573);
+ int16x8_t v500 = vaddq_s16(v500_tmp, v499);
+ int16x8_t v501 = vaddq_s16(v493, v484);
+ int16x8_t v502 = vaddq_s16(v485, v488);
+ int16x8_t v503 = vaddq_s16(v501, v502);
+ int16x8_t v504 = vaddq_s16(v500, v503);
+ int16x8_t v505 = vaddq_s16(v502, v499);
+ int16x8_t v506_tmp = vqrdmulhq_n_s16(v505, 13573);
+ int16x8_t v506 = vaddq_s16(v506_tmp, v505);
+ int16x8_t v507 = vld1q_s16(in + in_stride * 125 + i);
+ int16x8_t v508 = vld1q_s16(in + in_stride * 123 + i);
+ int16x8_t v509 = vaddq_s16(v507, v508);
+ int16x8_t v510 = vaddq_s16(v509, v470);
+ int16x8_t v511 = vaddq_s16(v510, v492);
+ int16x8_t v512 = vaddq_s16(v511, v501);
+ int16x8_t v513 = vaddq_s16(v512, v505);
+ int16x8_t v514 = vaddq_s16(v506, v513);
+ int16x8_t v515 = vqrdmulhq_n_s16(v514, 17734);
+ int16x8_t v516 = vaddq_s16(v504, v515);
+ int16x8_t v517 = vqrdmulhq_n_s16(v516, 16705);
+ int16x8_t v518 = vaddq_s16(v498, v517);
+ int16x8_t v519 = vqrdmulhq_n_s16(v518, 16463);
+ int16x8_t v520 = vaddq_s16(v481, v519);
+ int16x8_t v521 = vqrdmulhq_n_s16(v520, 16404);
+ int16x8_t v522 = vaddq_s16(v410, v521);
+ int16x8_t v523 = vaddq_s16(v412, v318);
+ int16x8_t v524_tmp = vqrdmulhq_n_s16(v523, 13573);
+ int16x8_t v524 = vaddq_s16(v524_tmp, v523);
+ int16x8_t v525 = vaddq_s16(v416, v320);
+ int16x8_t v526 = vaddq_s16(v321, v418);
+ int16x8_t v527 = vaddq_s16(v525, v526);
+ int16x8_t v528 = vaddq_s16(v524, v527);
+ int16x8_t v529 = vaddq_s16(v424, v324);
+ int16x8_t v530 = vaddq_s16(v325, v426);
+ int16x8_t v531 = vaddq_s16(v529, v530);
+ int16x8_t v532_tmp = vqrdmulhq_n_s16(v531, 13573);
+ int16x8_t v532 = vaddq_s16(v532_tmp, v531);
+ int16x8_t v533 = vaddq_s16(v432, v328);
+ int16x8_t v534 = vaddq_s16(v329, v434);
+ int16x8_t v535 = vaddq_s16(v533, v534);
+ int16x8_t v536 = vaddq_s16(v535, v531);
+ int16x8_t v537 = vaddq_s16(v532, v536);
+ int16x8_t v538 = vqrdmulhq_n_s16(v537, 17734);
+ int16x8_t v539 = vaddq_s16(v528, v538);
+ int16x8_t v540 = vaddq_s16(v443, v335);
+ int16x8_t v541 = vaddq_s16(v336, v445);
+ int16x8_t v542 = vaddq_s16(v540, v541);
+ int16x8_t v543_tmp = vqrdmulhq_n_s16(v542, 13573);
+ int16x8_t v543 = vaddq_s16(v543_tmp, v542);
+ int16x8_t v544 = vaddq_s16(v451, v339);
+ int16x8_t v545 = vaddq_s16(v340, v453);
+ int16x8_t v546 = vaddq_s16(v544, v545);
+ int16x8_t v547 = vaddq_s16(v458, v342);
+ int16x8_t v548 = vaddq_s16(v343, v460);
+ int16x8_t v549 = vaddq_s16(v547, v548);
+ int16x8_t v550 = vaddq_s16(v546, v549);
+ int16x8_t v551 = vaddq_s16(v543, v550);
+ int16x8_t v552 = vaddq_s16(v549, v542);
+ int16x8_t v553_tmp = vqrdmulhq_n_s16(v552, 13573);
+ int16x8_t v553 = vaddq_s16(v553_tmp, v552);
+ int16x8_t v554 = vaddq_s16(v469, v349);
+ int16x8_t v555 = vaddq_s16(v350, v471);
+ int16x8_t v556 = vaddq_s16(v554, v555);
+ int16x8_t v557 = vaddq_s16(v556, v546);
+ int16x8_t v558 = vaddq_s16(v557, v552);
+ int16x8_t v559 = vaddq_s16(v553, v558);
+ int16x8_t v560 = vqrdmulhq_n_s16(v559, 17734);
+ int16x8_t v561 = vaddq_s16(v551, v560);
+ int16x8_t v562 = vqrdmulhq_n_s16(v561, 16705);
+ int16x8_t v563 = vaddq_s16(v539, v562);
+ int16x8_t v564 = vaddq_s16(v446, v359);
+ int16x8_t v565 = vaddq_s16(v360, v411);
+ int16x8_t v566 = vaddq_s16(v564, v565);
+ int16x8_t v567_tmp = vqrdmulhq_n_s16(v566, 13573);
+ int16x8_t v567 = vaddq_s16(v567_tmp, v566);
+ int16x8_t v568 = vaddq_s16(v454, v363);
+ int16x8_t v569 = vaddq_s16(v364, v415);
+ int16x8_t v570 = vaddq_s16(v568, v569);
+ int16x8_t v571 = vaddq_s16(v419, v366);
+ int16x8_t v572 = vaddq_s16(v367, v457);
+ int16x8_t v573 = vaddq_s16(v571, v572);
+ int16x8_t v574 = vaddq_s16(v570, v573);
+ int16x8_t v575 = vaddq_s16(v567, v574);
+ int16x8_t v576 = vaddq_s16(v461, v371);
+ int16x8_t v577 = vaddq_s16(v372, v423);
+ int16x8_t v578 = vaddq_s16(v576, v577);
+ int16x8_t v579 = vaddq_s16(v427, v374);
+ int16x8_t v580 = vaddq_s16(v375, v442);
+ int16x8_t v581 = vaddq_s16(v579, v580);
+ int16x8_t v582 = vaddq_s16(v578, v581);
+ int16x8_t v583_tmp = vqrdmulhq_n_s16(v582, 13573);
+ int16x8_t v583 = vaddq_s16(v583_tmp, v582);
+ int16x8_t v584 = vaddq_s16(v472, v379);
+ int16x8_t v585 = vaddq_s16(v380, v431);
+ int16x8_t v586 = vaddq_s16(v584, v585);
+ int16x8_t v587 = vaddq_s16(v435, v382);
+ int16x8_t v588 = vaddq_s16(v383, v450);
+ int16x8_t v589 = vaddq_s16(v587, v588);
+ int16x8_t v590 = vaddq_s16(v586, v589);
+ int16x8_t v591 = vaddq_s16(v590, v582);
+ int16x8_t v592 = vaddq_s16(v583, v591);
+ int16x8_t v593 = vqrdmulhq_n_s16(v592, 17734);
+ int16x8_t v594 = vaddq_s16(v575, v593);
+ int16x8_t v595 = vaddq_s16(v581, v566);
+ int16x8_t v596_tmp = vqrdmulhq_n_s16(v595, 13573);
+ int16x8_t v596 = vaddq_s16(v596_tmp, v595);
+ int16x8_t v597 = vaddq_s16(v589, v570);
+ int16x8_t v598 = vaddq_s16(v573, v578);
+ int16x8_t v599 = vaddq_s16(v597, v598);
+ int16x8_t v600 = vaddq_s16(v596, v599);
+ int16x8_t v601 = vaddq_s16(v598, v595);
+ int16x8_t v602_tmp = vqrdmulhq_n_s16(v601, 13573);
+ int16x8_t v602 = vaddq_s16(v602_tmp, v601);
+ int16x8_t v603 = vaddq_s16(v508, v398);
+ int16x8_t v604 = vaddq_s16(v399, v468);
+ int16x8_t v605 = vaddq_s16(v603, v604);
+ int16x8_t v606 = vaddq_s16(v605, v586);
+ int16x8_t v607 = vaddq_s16(v606, v597);
+ int16x8_t v608 = vaddq_s16(v607, v601);
+ int16x8_t v609 = vaddq_s16(v602, v608);
+ int16x8_t v610 = vqrdmulhq_n_s16(v609, 17734);
+ int16x8_t v611 = vaddq_s16(v600, v610);
+ int16x8_t v612 = vqrdmulhq_n_s16(v611, 16705);
+ int16x8_t v613 = vaddq_s16(v594, v612);
+ int16x8_t v614 = vqrdmulhq_n_s16(v613, 16463);
+ int16x8_t v615 = vaddq_s16(v563, v614);
+ int16x8_t v616 = vaddq_s16(v565, v523);
+ int16x8_t v617_tmp = vqrdmulhq_n_s16(v616, 13573);
+ int16x8_t v617 = vaddq_s16(v617_tmp, v616);
+ int16x8_t v618 = vaddq_s16(v569, v525);
+ int16x8_t v619 = vaddq_s16(v526, v571);
+ int16x8_t v620 = vaddq_s16(v618, v619);
+ int16x8_t v621 = vaddq_s16(v617, v620);
+ int16x8_t v622 = vaddq_s16(v577, v529);
+ int16x8_t v623 = vaddq_s16(v530, v579);
+ int16x8_t v624 = vaddq_s16(v622, v623);
+ int16x8_t v625_tmp = vqrdmulhq_n_s16(v624, 13573);
+ int16x8_t v625 = vaddq_s16(v625_tmp, v624);
+ int16x8_t v626 = vaddq_s16(v585, v533);
+ int16x8_t v627 = vaddq_s16(v534, v587);
+ int16x8_t v628 = vaddq_s16(v626, v627);
+ int16x8_t v629 = vaddq_s16(v628, v624);
+ int16x8_t v630 = vaddq_s16(v625, v629);
+ int16x8_t v631 = vqrdmulhq_n_s16(v630, 17734);
+ int16x8_t v632 = vaddq_s16(v621, v631);
+ int16x8_t v633 = vaddq_s16(v580, v540);
+ int16x8_t v634 = vaddq_s16(v541, v564);
+ int16x8_t v635 = vaddq_s16(v633, v634);
+ int16x8_t v636_tmp = vqrdmulhq_n_s16(v635, 13573);
+ int16x8_t v636 = vaddq_s16(v636_tmp, v635);
+ int16x8_t v637 = vaddq_s16(v588, v544);
+ int16x8_t v638 = vaddq_s16(v545, v568);
+ int16x8_t v639 = vaddq_s16(v637, v638);
+ int16x8_t v640 = vaddq_s16(v572, v547);
+ int16x8_t v641 = vaddq_s16(v548, v576);
+ int16x8_t v642 = vaddq_s16(v640, v641);
+ int16x8_t v643 = vaddq_s16(v639, v642);
+ int16x8_t v644 = vaddq_s16(v636, v643);
+ int16x8_t v645 = vaddq_s16(v642, v635);
+ int16x8_t v646_tmp = vqrdmulhq_n_s16(v645, 13573);
+ int16x8_t v646 = vaddq_s16(v646_tmp, v645);
+ int16x8_t v647 = vaddq_s16(v604, v554);
+ int16x8_t v648 = vaddq_s16(v555, v584);
+ int16x8_t v649 = vaddq_s16(v647, v648);
+ int16x8_t v650 = vaddq_s16(v649, v639);
+ int16x8_t v651 = vaddq_s16(v650, v645);
+ int16x8_t v652 = vaddq_s16(v646, v651);
+ int16x8_t v653 = vqrdmulhq_n_s16(v652, 17734);
+ int16x8_t v654 = vaddq_s16(v644, v653);
+ int16x8_t v655 = vqrdmulhq_n_s16(v654, 16705);
+ int16x8_t v656 = vaddq_s16(v632, v655);
+ int16x8_t v657 = vaddq_s16(v634, v616);
+ int16x8_t v658_tmp = vqrdmulhq_n_s16(v657, 13573);
+ int16x8_t v658 = vaddq_s16(v658_tmp, v657);
+ int16x8_t v659 = vaddq_s16(v638, v618);
+ int16x8_t v660 = vaddq_s16(v619, v640);
+ int16x8_t v661 = vaddq_s16(v659, v660);
+ int16x8_t v662 = vaddq_s16(v658, v661);
+ int16x8_t v663 = vaddq_s16(v641, v622);
+ int16x8_t v664 = vaddq_s16(v623, v633);
+ int16x8_t v665 = vaddq_s16(v663, v664);
+ int16x8_t v666_tmp = vqrdmulhq_n_s16(v665, 13573);
+ int16x8_t v666 = vaddq_s16(v666_tmp, v665);
+ int16x8_t v667 = vaddq_s16(v648, v626);
+ int16x8_t v668 = vaddq_s16(v627, v637);
+ int16x8_t v669 = vaddq_s16(v667, v668);
+ int16x8_t v670 = vaddq_s16(v669, v665);
+ int16x8_t v671 = vaddq_s16(v666, v670);
+ int16x8_t v672 = vqrdmulhq_n_s16(v671, 17734);
+ int16x8_t v673 = vaddq_s16(v662, v672);
+ int16x8_t v674 = vaddq_s16(v664, v657);
+ int16x8_t v675_tmp = vqrdmulhq_n_s16(v674, 13573);
+ int16x8_t v675 = vaddq_s16(v675_tmp, v674);
+ int16x8_t v676 = vaddq_s16(v668, v659);
+ int16x8_t v677 = vaddq_s16(v660, v663);
+ int16x8_t v678 = vaddq_s16(v676, v677);
+ int16x8_t v679 = vaddq_s16(v675, v678);
+ int16x8_t v680 = vaddq_s16(v677, v674);
+ int16x8_t v681_tmp = vqrdmulhq_n_s16(v680, 13573);
+ int16x8_t v681 = vaddq_s16(v681_tmp, v680);
+ int16x8_t v682 = vld1q_s16(in + in_stride * 127 + i);
+ int16x8_t v683 = vaddq_s16(v682, v507);
+ int16x8_t v684 = vaddq_s16(v683, v603);
+ int16x8_t v685 = vaddq_s16(v684, v647);
+ int16x8_t v686 = vaddq_s16(v685, v667);
+ int16x8_t v687 = vaddq_s16(v686, v676);
+ int16x8_t v688 = vaddq_s16(v687, v680);
+ int16x8_t v689 = vaddq_s16(v681, v688);
+ int16x8_t v690 = vqrdmulhq_n_s16(v689, 17734);
+ int16x8_t v691 = vaddq_s16(v679, v690);
+ int16x8_t v692 = vqrdmulhq_n_s16(v691, 16705);
+ int16x8_t v693 = vaddq_s16(v673, v692);
+ int16x8_t v694 = vqrdmulhq_n_s16(v693, 16463);
+ int16x8_t v695 = vaddq_s16(v656, v694);
+ int16x8_t v696 = vqrdmulhq_n_s16(v695, 16404);
+ int16x8_t v697 = vaddq_s16(v615, v696);
+ int16x8_t v698 = vqrdmulhq_n_s16(v697, 16389);
+ int16x8_t v699 = vaddq_s16(v522, v698);
+ int16x8_t v700 = vqrdmulhq_n_s16(v699, 16385);
+ int16x8_t v701 = vaddq_s16(v317, v700);
+ int16x8_t v702 = vsubq_s16(v0, v1);
+ int16x8_t v703 = vsubq_s16(v4, v6);
+ int16x8_t v704_tmp = vqrdmulhq_n_s16(v703, 10045);
+ int16x8_t v704 = vaddq_s16(v704_tmp, v703);
+ int16x8_t v705 = vaddq_s16(v702, v704);
+ int16x8_t v706 = vsubq_s16(v11, v14);
+ int16x8_t v707 = vsubq_s16(v17, v20);
+ int16x8_t v708_tmp = vqrdmulhq_n_s16(v707, 10045);
+ int16x8_t v708 = vaddq_s16(v708_tmp, v707);
+ int16x8_t v709 = vaddq_s16(v706, v708);
+ int16x8_t v710 = vqrdmulhq_n_s16(v709, 19705);
+ int16x8_t v711 = vaddq_s16(v705, v710);
+ int16x8_t v712 = vsubq_s16(v27, v30);
+ int16x8_t v713 = vsubq_s16(v35, v39);
+ int16x8_t v714_tmp = vqrdmulhq_n_s16(v713, 10045);
+ int16x8_t v714 = vaddq_s16(v714_tmp, v713);
+ int16x8_t v715 = vaddq_s16(v712, v714);
+ int16x8_t v716 = vsubq_s16(v44, v47);
+ int16x8_t v717 = vsubq_s16(v50, v54);
+ int16x8_t v718_tmp = vqrdmulhq_n_s16(v717, 10045);
+ int16x8_t v718 = vaddq_s16(v718_tmp, v717);
+ int16x8_t v719 = vaddq_s16(v716, v718);
+ int16x8_t v720 = vqrdmulhq_n_s16(v719, 19705);
+ int16x8_t v721 = vaddq_s16(v715, v720);
+ int16x8_t v722 = vqrdmulhq_n_s16(v721, 17121);
+ int16x8_t v723 = vaddq_s16(v711, v722);
+ int16x8_t v724 = vsubq_s16(v63, v66);
+ int16x8_t v725 = vsubq_s16(v71, v75);
+ int16x8_t v726_tmp = vqrdmulhq_n_s16(v725, 10045);
+ int16x8_t v726 = vaddq_s16(v726_tmp, v725);
+ int16x8_t v727 = vaddq_s16(v724, v726);
+ int16x8_t v728 = vsubq_s16(v82, v89);
+ int16x8_t v729 = vsubq_s16(v92, v97);
+ int16x8_t v730_tmp = vqrdmulhq_n_s16(v729, 10045);
+ int16x8_t v730 = vaddq_s16(v730_tmp, v729);
+ int16x8_t v731 = vaddq_s16(v728, v730);
+ int16x8_t v732 = vqrdmulhq_n_s16(v731, 19705);
+ int16x8_t v733 = vaddq_s16(v727, v732);
+ int16x8_t v734 = vsubq_s16(v104, v107);
+ int16x8_t v735 = vsubq_s16(v112, v116);
+ int16x8_t v736_tmp = vqrdmulhq_n_s16(v735, 10045);
+ int16x8_t v736 = vaddq_s16(v736_tmp, v735);
+ int16x8_t v737 = vaddq_s16(v734, v736);
+ int16x8_t v738 = vsubq_s16(v121, v124);
+ int16x8_t v739 = vsubq_s16(v127, v132);
+ int16x8_t v740_tmp = vqrdmulhq_n_s16(v739, 10045);
+ int16x8_t v740 = vaddq_s16(v740_tmp, v739);
+ int16x8_t v741 = vaddq_s16(v738, v740);
+ int16x8_t v742 = vqrdmulhq_n_s16(v741, 19705);
+ int16x8_t v743 = vaddq_s16(v737, v742);
+ int16x8_t v744 = vqrdmulhq_n_s16(v743, 17121);
+ int16x8_t v745 = vaddq_s16(v733, v744);
+ int16x8_t v746 = vqrdmulhq_n_s16(v745, 16563);
+ int16x8_t v747 = vaddq_s16(v723, v746);
+ int16x8_t v748 = vsubq_s16(v143, v146);
+ int16x8_t v749 = vsubq_s16(v151, v155);
+ int16x8_t v750_tmp = vqrdmulhq_n_s16(v749, 10045);
+ int16x8_t v750 = vaddq_s16(v750_tmp, v749);
+ int16x8_t v751 = vaddq_s16(v748, v750);
+ int16x8_t v752 = vsubq_s16(v162, v169);
+ int16x8_t v753 = vqrdmulhq_n_s16(v752, 19705);
+ int16x8_t v754 = vsubq_s16(v172, v177);
+ int16x8_t v755 = vqrdmulhq_n_s16(v754, 25746);
+ int16x8_t v756 = vaddq_s16(v753, v755);
+ int16x8_t v757 = vaddq_s16(v751, v756);
+ int16x8_t v758 = vsubq_s16(v186, v193);
+ int16x8_t v759 = vsubq_s16(v202, v210);
+ int16x8_t v760_tmp = vqrdmulhq_n_s16(v759, 10045);
+ int16x8_t v760 = vaddq_s16(v760_tmp, v759);
+ int16x8_t v761 = vaddq_s16(v758, v760);
+ int16x8_t v762 = vsubq_s16(v215, v218);
+ int16x8_t v763 = vsubq_s16(v221, v227);
+ int16x8_t v764_tmp = vqrdmulhq_n_s16(v763, 10045);
+ int16x8_t v764 = vaddq_s16(v764_tmp, v763);
+ int16x8_t v765 = vaddq_s16(v762, v764);
+ int16x8_t v766 = vqrdmulhq_n_s16(v765, 19705);
+ int16x8_t v767 = vaddq_s16(v761, v766);
+ int16x8_t v768 = vqrdmulhq_n_s16(v767, 17121);
+ int16x8_t v769 = vaddq_s16(v757, v768);
+ int16x8_t v770 = vsubq_s16(v236, v239);
+ int16x8_t v771 = vsubq_s16(v244, v248);
+ int16x8_t v772_tmp = vqrdmulhq_n_s16(v771, 10045);
+ int16x8_t v772 = vaddq_s16(v772_tmp, v771);
+ int16x8_t v773 = vaddq_s16(v770, v772);
+ int16x8_t v774 = vsubq_s16(v255, v262);
+ int16x8_t v775 = vsubq_s16(v265, v270);
+ int16x8_t v776_tmp = vqrdmulhq_n_s16(v775, 10045);
+ int16x8_t v776 = vaddq_s16(v776_tmp, v775);
+ int16x8_t v777 = vaddq_s16(v774, v776);
+ int16x8_t v778 = vqrdmulhq_n_s16(v777, 19705);
+ int16x8_t v779 = vaddq_s16(v773, v778);
+ int16x8_t v780 = vsubq_s16(v277, v280);
+ int16x8_t v781 = vsubq_s16(v285, v289);
+ int16x8_t v782_tmp = vqrdmulhq_n_s16(v781, 10045);
+ int16x8_t v782 = vaddq_s16(v782_tmp, v781);
+ int16x8_t v783 = vaddq_s16(v780, v782);
+ int16x8_t v784 = vsubq_s16(v294, v297);
+ int16x8_t v785 = vsubq_s16(v300, v306);
+ int16x8_t v786_tmp = vqrdmulhq_n_s16(v785, 10045);
+ int16x8_t v786 = vaddq_s16(v786_tmp, v785);
+ int16x8_t v787 = vaddq_s16(v784, v786);
+ int16x8_t v788 = vqrdmulhq_n_s16(v787, 19705);
+ int16x8_t v789 = vaddq_s16(v783, v788);
+ int16x8_t v790 = vqrdmulhq_n_s16(v789, 17121);
+ int16x8_t v791 = vaddq_s16(v779, v790);
+ int16x8_t v792 = vqrdmulhq_n_s16(v791, 16563);
+ int16x8_t v793 = vaddq_s16(v769, v792);
+ int16x8_t v794 = vqrdmulhq_n_s16(v793, 16429);
+ int16x8_t v795 = vaddq_s16(v747, v794);
+ int16x8_t v796 = vsubq_s16(v319, v322);
+ int16x8_t v797 = vsubq_s16(v327, v331);
+ int16x8_t v798_tmp = vqrdmulhq_n_s16(v797, 10045);
+ int16x8_t v798 = vaddq_s16(v798_tmp, v797);
+ int16x8_t v799 = vaddq_s16(v796, v798);
+ int16x8_t v800 = vsubq_s16(v338, v345);
+ int16x8_t v801 = vsubq_s16(v348, v353);
+ int16x8_t v802_tmp = vqrdmulhq_n_s16(v801, 10045);
+ int16x8_t v802 = vaddq_s16(v802_tmp, v801);
+ int16x8_t v803 = vaddq_s16(v800, v802);
+ int16x8_t v804 = vqrdmulhq_n_s16(v803, 19705);
+ int16x8_t v805 = vaddq_s16(v799, v804);
+ int16x8_t v806 = vsubq_s16(v362, v369);
+ int16x8_t v807 = vsubq_s16(v378, v386);
+ int16x8_t v808_tmp = vqrdmulhq_n_s16(v807, 10045);
+ int16x8_t v808 = vaddq_s16(v808_tmp, v807);
+ int16x8_t v809 = vaddq_s16(v806, v808);
+ int16x8_t v810 = vsubq_s16(v391, v394);
+ int16x8_t v811 = vsubq_s16(v397, v403);
+ int16x8_t v812_tmp = vqrdmulhq_n_s16(v811, 10045);
+ int16x8_t v812 = vaddq_s16(v812_tmp, v811);
+ int16x8_t v813 = vaddq_s16(v810, v812);
+ int16x8_t v814 = vqrdmulhq_n_s16(v813, 19705);
+ int16x8_t v815 = vaddq_s16(v809, v814);
+ int16x8_t v816 = vqrdmulhq_n_s16(v815, 17121);
+ int16x8_t v817 = vaddq_s16(v805, v816);
+ int16x8_t v818 = vsubq_s16(v414, v421);
+ int16x8_t v819 = vsubq_s16(v430, v438);
+ int16x8_t v820_tmp = vqrdmulhq_n_s16(v819, 10045);
+ int16x8_t v820 = vaddq_s16(v820_tmp, v819);
+ int16x8_t v821 = vaddq_s16(v818, v820);
+ int16x8_t v822 = vsubq_s16(v449, v464);
+ int16x8_t v823 = vsubq_s16(v467, v476);
+ int16x8_t v824_tmp = vqrdmulhq_n_s16(v823, 10045);
+ int16x8_t v824 = vaddq_s16(v824_tmp, v823);
+ int16x8_t v825 = vaddq_s16(v822, v824);
+ int16x8_t v826 = vqrdmulhq_n_s16(v825, 19705);
+ int16x8_t v827 = vaddq_s16(v821, v826);
+ int16x8_t v828 = vsubq_s16(v483, v486);
+ int16x8_t v829 = vsubq_s16(v491, v495);
+ int16x8_t v830_tmp = vqrdmulhq_n_s16(v829, 10045);
+ int16x8_t v830 = vaddq_s16(v830_tmp, v829);
+ int16x8_t v831 = vaddq_s16(v828, v830);
+ int16x8_t v832 = vsubq_s16(v500, v503);
+ int16x8_t v833 = vsubq_s16(v506, v513);
+ int16x8_t v834_tmp = vqrdmulhq_n_s16(v833, 10045);
+ int16x8_t v834 = vaddq_s16(v834_tmp, v833);
+ int16x8_t v835 = vaddq_s16(v832, v834);
+ int16x8_t v836 = vqrdmulhq_n_s16(v835, 19705);
+ int16x8_t v837 = vaddq_s16(v831, v836);
+ int16x8_t v838 = vqrdmulhq_n_s16(v837, 17121);
+ int16x8_t v839 = vaddq_s16(v827, v838);
+ int16x8_t v840 = vqrdmulhq_n_s16(v839, 16563);
+ int16x8_t v841 = vaddq_s16(v817, v840);
+ int16x8_t v842 = vsubq_s16(v524, v527);
+ int16x8_t v843 = vsubq_s16(v532, v536);
+ int16x8_t v844_tmp = vqrdmulhq_n_s16(v843, 10045);
+ int16x8_t v844 = vaddq_s16(v844_tmp, v843);
+ int16x8_t v845 = vaddq_s16(v842, v844);
+ int16x8_t v846 = vsubq_s16(v543, v550);
+ int16x8_t v847 = vsubq_s16(v553, v558);
+ int16x8_t v848_tmp = vqrdmulhq_n_s16(v847, 10045);
+ int16x8_t v848 = vaddq_s16(v848_tmp, v847);
+ int16x8_t v849 = vaddq_s16(v846, v848);
+ int16x8_t v850 = vqrdmulhq_n_s16(v849, 19705);
+ int16x8_t v851 = vaddq_s16(v845, v850);
+ int16x8_t v852 = vsubq_s16(v567, v574);
+ int16x8_t v853 = vsubq_s16(v583, v591);
+ int16x8_t v854_tmp = vqrdmulhq_n_s16(v853, 10045);
+ int16x8_t v854 = vaddq_s16(v854_tmp, v853);
+ int16x8_t v855 = vaddq_s16(v852, v854);
+ int16x8_t v856 = vsubq_s16(v596, v599);
+ int16x8_t v857 = vsubq_s16(v602, v608);
+ int16x8_t v858_tmp = vqrdmulhq_n_s16(v857, 10045);
+ int16x8_t v858 = vaddq_s16(v858_tmp, v857);
+ int16x8_t v859 = vaddq_s16(v856, v858);
+ int16x8_t v860 = vqrdmulhq_n_s16(v859, 19705);
+ int16x8_t v861 = vaddq_s16(v855, v860);
+ int16x8_t v862 = vqrdmulhq_n_s16(v861, 17121);
+ int16x8_t v863 = vaddq_s16(v851, v862);
+ int16x8_t v864 = vsubq_s16(v617, v620);
+ int16x8_t v865 = vsubq_s16(v625, v629);
+ int16x8_t v866_tmp = vqrdmulhq_n_s16(v865, 10045);
+ int16x8_t v866 = vaddq_s16(v866_tmp, v865);
+ int16x8_t v867 = vaddq_s16(v864, v866);
+ int16x8_t v868 = vsubq_s16(v636, v643);
+ int16x8_t v869 = vsubq_s16(v646, v651);
+ int16x8_t v870_tmp = vqrdmulhq_n_s16(v869, 10045);
+ int16x8_t v870 = vaddq_s16(v870_tmp, v869);
+ int16x8_t v871 = vaddq_s16(v868, v870);
+ int16x8_t v872 = vqrdmulhq_n_s16(v871, 19705);
+ int16x8_t v873 = vaddq_s16(v867, v872);
+ int16x8_t v874 = vsubq_s16(v658, v661);
+ int16x8_t v875 = vsubq_s16(v666, v670);
+ int16x8_t v876_tmp = vqrdmulhq_n_s16(v875, 10045);
+ int16x8_t v876 = vaddq_s16(v876_tmp, v875);
+ int16x8_t v877 = vaddq_s16(v874, v876);
+ int16x8_t v878 = vsubq_s16(v675, v678);
+ int16x8_t v879 = vsubq_s16(v681, v688);
+ int16x8_t v880_tmp = vqrdmulhq_n_s16(v879, 10045);
+ int16x8_t v880 = vaddq_s16(v880_tmp, v879);
+ int16x8_t v881 = vaddq_s16(v878, v880);
+ int16x8_t v882 = vqrdmulhq_n_s16(v881, 19705);
+ int16x8_t v883 = vaddq_s16(v877, v882);
+ int16x8_t v884 = vqrdmulhq_n_s16(v883, 17121);
+ int16x8_t v885 = vaddq_s16(v873, v884);
+ int16x8_t v886 = vqrdmulhq_n_s16(v885, 16563);
+ int16x8_t v887 = vaddq_s16(v863, v886);
+ int16x8_t v888 = vqrdmulhq_n_s16(v887, 16429);
+ int16x8_t v889 = vaddq_s16(v841, v888);
+ int16x8_t v890 = vqrdmulhq_n_s16(v889, 16395);
+ int16x8_t v891 = vaddq_s16(v795, v890);
+ int16x8_t v892 = vsubq_s16(v702, v704);
+ int16x8_t v893 = vsubq_s16(v706, v708);
+ int16x8_t v894 = vqrdmulhq_n_s16(v893, 29490);
+ int16x8_t v895 = vaddq_s16(v892, v894);
+ int16x8_t v896 = vsubq_s16(v712, v714);
+ int16x8_t v897 = vsubq_s16(v716, v718);
+ int16x8_t v898 = vqrdmulhq_n_s16(v897, 29490);
+ int16x8_t v899 = vaddq_s16(v896, v898);
+ int16x8_t v900 = vqrdmulhq_n_s16(v899, 18578);
+ int16x8_t v901 = vaddq_s16(v895, v900);
+ int16x8_t v902 = vsubq_s16(v724, v726);
+ int16x8_t v903 = vsubq_s16(v728, v730);
+ int16x8_t v904 = vqrdmulhq_n_s16(v903, 29490);
+ int16x8_t v905 = vaddq_s16(v902, v904);
+ int16x8_t v906 = vsubq_s16(v734, v736);
+ int16x8_t v907 = vsubq_s16(v738, v740);
+ int16x8_t v908 = vqrdmulhq_n_s16(v907, 29490);
+ int16x8_t v909 = vaddq_s16(v906, v908);
+ int16x8_t v910 = vqrdmulhq_n_s16(v909, 18578);
+ int16x8_t v911 = vaddq_s16(v905, v910);
+ int16x8_t v912 = vqrdmulhq_n_s16(v911, 16890);
+ int16x8_t v913 = vaddq_s16(v901, v912);
+ int16x8_t v914 = vsubq_s16(v748, v750);
+ int16x8_t v915_tmp = vqrdmulhq_n_s16(v754, 10045);
+ int16x8_t v915 = vaddq_s16(v915_tmp, v754);
+ int16x8_t v916 = vsubq_s16(v752, v915);
+ int16x8_t v917 = vqrdmulhq_n_s16(v916, 29490);
+ int16x8_t v918 = vaddq_s16(v914, v917);
+ int16x8_t v919 = vsubq_s16(v758, v760);
+ int16x8_t v920 = vsubq_s16(v762, v764);
+ int16x8_t v921 = vqrdmulhq_n_s16(v920, 29490);
+ int16x8_t v922 = vaddq_s16(v919, v921);
+ int16x8_t v923 = vqrdmulhq_n_s16(v922, 18578);
+ int16x8_t v924 = vaddq_s16(v918, v923);
+ int16x8_t v925 = vsubq_s16(v770, v772);
+ int16x8_t v926 = vsubq_s16(v774, v776);
+ int16x8_t v927 = vqrdmulhq_n_s16(v926, 29490);
+ int16x8_t v928 = vaddq_s16(v925, v927);
+ int16x8_t v929 = vsubq_s16(v780, v782);
+ int16x8_t v930 = vsubq_s16(v784, v786);
+ int16x8_t v931 = vqrdmulhq_n_s16(v930, 29490);
+ int16x8_t v932 = vaddq_s16(v929, v931);
+ int16x8_t v933 = vqrdmulhq_n_s16(v932, 18578);
+ int16x8_t v934 = vaddq_s16(v928, v933);
+ int16x8_t v935 = vqrdmulhq_n_s16(v934, 16890);
+ int16x8_t v936 = vaddq_s16(v924, v935);
+ int16x8_t v937 = vqrdmulhq_n_s16(v936, 16508);
+ int16x8_t v938 = vaddq_s16(v913, v937);
+ int16x8_t v939 = vsubq_s16(v796, v798);
+ int16x8_t v940 = vsubq_s16(v800, v802);
+ int16x8_t v941 = vqrdmulhq_n_s16(v940, 29490);
+ int16x8_t v942 = vaddq_s16(v939, v941);
+ int16x8_t v943 = vsubq_s16(v806, v808);
+ int16x8_t v944 = vsubq_s16(v810, v812);
+ int16x8_t v945 = vqrdmulhq_n_s16(v944, 29490);
+ int16x8_t v946 = vaddq_s16(v943, v945);
+ int16x8_t v947 = vqrdmulhq_n_s16(v946, 18578);
+ int16x8_t v948 = vaddq_s16(v942, v947);
+ int16x8_t v949 = vsubq_s16(v818, v820);
+ int16x8_t v950 = vsubq_s16(v822, v824);
+ int16x8_t v951 = vqrdmulhq_n_s16(v950, 29490);
+ int16x8_t v952 = vaddq_s16(v949, v951);
+ int16x8_t v953 = vsubq_s16(v828, v830);
+ int16x8_t v954 = vsubq_s16(v832, v834);
+ int16x8_t v955 = vqrdmulhq_n_s16(v954, 29490);
+ int16x8_t v956 = vaddq_s16(v953, v955);
+ int16x8_t v957 = vqrdmulhq_n_s16(v956, 18578);
+ int16x8_t v958 = vaddq_s16(v952, v957);
+ int16x8_t v959 = vqrdmulhq_n_s16(v958, 16890);
+ int16x8_t v960 = vaddq_s16(v948, v959);
+ int16x8_t v961 = vsubq_s16(v842, v844);
+ int16x8_t v962 = vsubq_s16(v846, v848);
+ int16x8_t v963 = vqrdmulhq_n_s16(v962, 29490);
+ int16x8_t v964 = vaddq_s16(v961, v963);
+ int16x8_t v965 = vsubq_s16(v852, v854);
+ int16x8_t v966 = vsubq_s16(v856, v858);
+ int16x8_t v967 = vqrdmulhq_n_s16(v966, 29490);
+ int16x8_t v968 = vaddq_s16(v965, v967);
+ int16x8_t v969 = vqrdmulhq_n_s16(v968, 18578);
+ int16x8_t v970 = vaddq_s16(v964, v969);
+ int16x8_t v971 = vsubq_s16(v864, v866);
+ int16x8_t v972 = vsubq_s16(v868, v870);
+ int16x8_t v973 = vqrdmulhq_n_s16(v972, 29490);
+ int16x8_t v974 = vaddq_s16(v971, v973);
+ int16x8_t v975 = vsubq_s16(v874, v876);
+ int16x8_t v976 = vsubq_s16(v878, v880);
+ int16x8_t v977 = vqrdmulhq_n_s16(v976, 29490);
+ int16x8_t v978 = vaddq_s16(v975, v977);
+ int16x8_t v979 = vqrdmulhq_n_s16(v978, 18578);
+ int16x8_t v980 = vaddq_s16(v974, v979);
+ int16x8_t v981 = vqrdmulhq_n_s16(v980, 16890);
+ int16x8_t v982 = vaddq_s16(v970, v981);
+ int16x8_t v983 = vqrdmulhq_n_s16(v982, 16508);
+ int16x8_t v984 = vaddq_s16(v960, v983);
+ int16x8_t v985 = vqrdmulhq_n_s16(v984, 16415);
+ int16x8_t v986 = vaddq_s16(v938, v985);
+ int16x8_t v987 = vsubq_s16(v2, v8);
+ int16x8_t v988 = vsubq_s16(v15, v22);
+ int16x8_t v989_tmp = vqrdmulhq_n_s16(v988, 18446);
+ int16x8_t v989 = vmlaq_n_s16(v989_tmp, v988, 2);
+ int16x8_t v990 = vaddq_s16(v987, v989);
+ int16x8_t v991 = vsubq_s16(v31, v41);
+ int16x8_t v992 = vsubq_s16(v48, v56);
+ int16x8_t v993_tmp = vqrdmulhq_n_s16(v992, 18446);
+ int16x8_t v993 = vmlaq_n_s16(v993_tmp, v992, 2);
+ int16x8_t v994 = vaddq_s16(v991, v993);
+ int16x8_t v995 = vqrdmulhq_n_s16(v994, 21195);
+ int16x8_t v996 = vaddq_s16(v990, v995);
+ int16x8_t v997 = vsubq_s16(v67, v77);
+ int16x8_t v998 = vsubq_s16(v90, v99);
+ int16x8_t v999_tmp = vqrdmulhq_n_s16(v998, 18446);
+ int16x8_t v999 = vmlaq_n_s16(v999_tmp, v998, 2);
+ int16x8_t v1000 = vaddq_s16(v997, v999);
+ int16x8_t v1001 = vsubq_s16(v108, v118);
+ int16x8_t v1002 = vsubq_s16(v125, v134);
+ int16x8_t v1003_tmp = vqrdmulhq_n_s16(v1002, 18446);
+ int16x8_t v1003 = vmlaq_n_s16(v1003_tmp, v1002, 2);
+ int16x8_t v1004 = vaddq_s16(v1001, v1003);
+ int16x8_t v1005 = vqrdmulhq_n_s16(v1004, 21195);
+ int16x8_t v1006 = vaddq_s16(v1000, v1005);
+ int16x8_t v1007 = vqrdmulhq_n_s16(v1006, 17401);
+ int16x8_t v1008 = vaddq_s16(v996, v1007);
+ int16x8_t v1009 = vsubq_s16(v147, v157);
+ int16x8_t v1010 = vsubq_s16(v170, v179);
+ int16x8_t v1011_tmp = vqrdmulhq_n_s16(v1010, 18446);
+ int16x8_t v1011 = vmlaq_n_s16(v1011_tmp, v1010, 2);
+ int16x8_t v1012 = vaddq_s16(v1009, v1011);
+ int16x8_t v1013 = vsubq_s16(v194, v212);
+ int16x8_t v1014 = vsubq_s16(v219, v229);
+ int16x8_t v1015_tmp = vqrdmulhq_n_s16(v1014, 18446);
+ int16x8_t v1015 = vmlaq_n_s16(v1015_tmp, v1014, 2);
+ int16x8_t v1016 = vaddq_s16(v1013, v1015);
+ int16x8_t v1017 = vqrdmulhq_n_s16(v1016, 21195);
+ int16x8_t v1018 = vaddq_s16(v1012, v1017);
+ int16x8_t v1019 = vsubq_s16(v240, v250);
+ int16x8_t v1020 = vsubq_s16(v263, v272);
+ int16x8_t v1021_tmp = vqrdmulhq_n_s16(v1020, 18446);
+ int16x8_t v1021 = vmlaq_n_s16(v1021_tmp, v1020, 2);
+ int16x8_t v1022 = vaddq_s16(v1019, v1021);
+ int16x8_t v1023 = vsubq_s16(v281, v291);
+ int16x8_t v1024 = vsubq_s16(v298, v308);
+ int16x8_t v1025_tmp = vqrdmulhq_n_s16(v1024, 18446);
+ int16x8_t v1025 = vmlaq_n_s16(v1025_tmp, v1024, 2);
+ int16x8_t v1026 = vaddq_s16(v1023, v1025);
+ int16x8_t v1027 = vqrdmulhq_n_s16(v1026, 21195);
+ int16x8_t v1028 = vaddq_s16(v1022, v1027);
+ int16x8_t v1029 = vqrdmulhq_n_s16(v1028, 17401);
+ int16x8_t v1030 = vaddq_s16(v1018, v1029);
+ int16x8_t v1031 = vqrdmulhq_n_s16(v1030, 16629);
+ int16x8_t v1032 = vaddq_s16(v1008, v1031);
+ int16x8_t v1033 = vsubq_s16(v323, v333);
+ int16x8_t v1034 = vsubq_s16(v346, v355);
+ int16x8_t v1035_tmp = vqrdmulhq_n_s16(v1034, 18446);
+ int16x8_t v1035 = vmlaq_n_s16(v1035_tmp, v1034, 2);
+ int16x8_t v1036 = vaddq_s16(v1033, v1035);
+ int16x8_t v1037 = vsubq_s16(v370, v388);
+ int16x8_t v1038 = vsubq_s16(v395, v405);
+ int16x8_t v1039_tmp = vqrdmulhq_n_s16(v1038, 18446);
+ int16x8_t v1039 = vmlaq_n_s16(v1039_tmp, v1038, 2);
+ int16x8_t v1040 = vaddq_s16(v1037, v1039);
+ int16x8_t v1041 = vqrdmulhq_n_s16(v1040, 21195);
+ int16x8_t v1042 = vaddq_s16(v1036, v1041);
+ int16x8_t v1043 = vsubq_s16(v422, v440);
+ int16x8_t v1044 = vsubq_s16(v465, v478);
+ int16x8_t v1045_tmp = vqrdmulhq_n_s16(v1044, 18446);
+ int16x8_t v1045 = vmlaq_n_s16(v1045_tmp, v1044, 2);
+ int16x8_t v1046 = vaddq_s16(v1043, v1045);
+ int16x8_t v1047 = vsubq_s16(v487, v497);
+ int16x8_t v1048 = vsubq_s16(v504, v515);
+ int16x8_t v1049_tmp = vqrdmulhq_n_s16(v1048, 18446);
+ int16x8_t v1049 = vmlaq_n_s16(v1049_tmp, v1048, 2);
+ int16x8_t v1050 = vaddq_s16(v1047, v1049);
+ int16x8_t v1051 = vqrdmulhq_n_s16(v1050, 21195);
+ int16x8_t v1052 = vaddq_s16(v1046, v1051);
+ int16x8_t v1053 = vqrdmulhq_n_s16(v1052, 17401);
+ int16x8_t v1054 = vaddq_s16(v1042, v1053);
+ int16x8_t v1055 = vsubq_s16(v528, v538);
+ int16x8_t v1056 = vsubq_s16(v551, v560);
+ int16x8_t v1057_tmp = vqrdmulhq_n_s16(v1056, 18446);
+ int16x8_t v1057 = vmlaq_n_s16(v1057_tmp, v1056, 2);
+ int16x8_t v1058 = vaddq_s16(v1055, v1057);
+ int16x8_t v1059 = vsubq_s16(v575, v593);
+ int16x8_t v1060 = vsubq_s16(v600, v610);
+ int16x8_t v1061_tmp = vqrdmulhq_n_s16(v1060, 18446);
+ int16x8_t v1061 = vmlaq_n_s16(v1061_tmp, v1060, 2);
+ int16x8_t v1062 = vaddq_s16(v1059, v1061);
+ int16x8_t v1063 = vqrdmulhq_n_s16(v1062, 21195);
+ int16x8_t v1064 = vaddq_s16(v1058, v1063);
+ int16x8_t v1065 = vsubq_s16(v621, v631);
+ int16x8_t v1066 = vsubq_s16(v644, v653);
+ int16x8_t v1067_tmp = vqrdmulhq_n_s16(v1066, 18446);
+ int16x8_t v1067 = vmlaq_n_s16(v1067_tmp, v1066, 2);
+ int16x8_t v1068 = vaddq_s16(v1065, v1067);
+ int16x8_t v1069 = vsubq_s16(v662, v672);
+ int16x8_t v1070 = vsubq_s16(v679, v690);
+ int16x8_t v1071_tmp = vqrdmulhq_n_s16(v1070, 18446);
+ int16x8_t v1071 = vmlaq_n_s16(v1071_tmp, v1070, 2);
+ int16x8_t v1072 = vaddq_s16(v1069, v1071);
+ int16x8_t v1073 = vqrdmulhq_n_s16(v1072, 21195);
+ int16x8_t v1074 = vaddq_s16(v1068, v1073);
+ int16x8_t v1075 = vqrdmulhq_n_s16(v1074, 17401);
+ int16x8_t v1076 = vaddq_s16(v1064, v1075);
+ int16x8_t v1077 = vqrdmulhq_n_s16(v1076, 16629);
+ int16x8_t v1078 = vaddq_s16(v1054, v1077);
+ int16x8_t v1079 = vqrdmulhq_n_s16(v1078, 16445);
+ int16x8_t v1080 = vaddq_s16(v1032, v1079);
+ int16x8_t v1081 = vsubq_s16(v987, v989);
+ int16x8_t v1082 = vsubq_s16(v991, v993);
+ int16x8_t v1083 = vqrdmulhq_n_s16(v1082, 25826);
+ int16x8_t v1084 = vaddq_s16(v1081, v1083);
+ int16x8_t v1085 = vsubq_s16(v997, v999);
+ int16x8_t v1086 = vsubq_s16(v1001, v1003);
+ int16x8_t v1087 = vqrdmulhq_n_s16(v1086, 25826);
+ int16x8_t v1088 = vaddq_s16(v1085, v1087);
+ int16x8_t v1089 = vqrdmulhq_n_s16(v1088, 18124);
+ int16x8_t v1090 = vaddq_s16(v1084, v1089);
+ int16x8_t v1091 = vsubq_s16(v1009, v1011);
+ int16x8_t v1092 = vsubq_s16(v1013, v1015);
+ int16x8_t v1093 = vqrdmulhq_n_s16(v1092, 25826);
+ int16x8_t v1094 = vaddq_s16(v1091, v1093);
+ int16x8_t v1095 = vsubq_s16(v1019, v1021);
+ int16x8_t v1096 = vsubq_s16(v1023, v1025);
+ int16x8_t v1097 = vqrdmulhq_n_s16(v1096, 25826);
+ int16x8_t v1098 = vaddq_s16(v1095, v1097);
+ int16x8_t v1099 = vqrdmulhq_n_s16(v1098, 18124);
+ int16x8_t v1100 = vaddq_s16(v1094, v1099);
+ int16x8_t v1101 = vqrdmulhq_n_s16(v1100, 16792);
+ int16x8_t v1102 = vaddq_s16(v1090, v1101);
+ int16x8_t v1103 = vsubq_s16(v1033, v1035);
+ int16x8_t v1104 = vsubq_s16(v1037, v1039);
+ int16x8_t v1105 = vqrdmulhq_n_s16(v1104, 25826);
+ int16x8_t v1106 = vaddq_s16(v1103, v1105);
+ int16x8_t v1107 = vsubq_s16(v1043, v1045);
+ int16x8_t v1108 = vsubq_s16(v1047, v1049);
+ int16x8_t v1109 = vqrdmulhq_n_s16(v1108, 25826);
+ int16x8_t v1110 = vaddq_s16(v1107, v1109);
+ int16x8_t v1111 = vqrdmulhq_n_s16(v1110, 18124);
+ int16x8_t v1112 = vaddq_s16(v1106, v1111);
+ int16x8_t v1113 = vsubq_s16(v1055, v1057);
+ int16x8_t v1114 = vsubq_s16(v1059, v1061);
+ int16x8_t v1115 = vqrdmulhq_n_s16(v1114, 25826);
+ int16x8_t v1116 = vaddq_s16(v1113, v1115);
+ int16x8_t v1117 = vsubq_s16(v1065, v1067);
+ int16x8_t v1118 = vsubq_s16(v1069, v1071);
+ int16x8_t v1119 = vqrdmulhq_n_s16(v1118, 25826);
+ int16x8_t v1120 = vaddq_s16(v1117, v1119);
+ int16x8_t v1121 = vqrdmulhq_n_s16(v1120, 18124);
+ int16x8_t v1122 = vaddq_s16(v1116, v1121);
+ int16x8_t v1123 = vqrdmulhq_n_s16(v1122, 16792);
+ int16x8_t v1124 = vaddq_s16(v1112, v1123);
+ int16x8_t v1125 = vqrdmulhq_n_s16(v1124, 16484);
+ int16x8_t v1126 = vaddq_s16(v1102, v1125);
+ int16x8_t v1127 = vsubq_s16(v892, v894);
+ int16x8_t v1128 = vsubq_s16(v896, v898);
+ int16x8_t v1129_tmp = vqrdmulhq_n_s16(v1128, 1988);
+ int16x8_t v1129 = vaddq_s16(v1129_tmp, v1128);
+ int16x8_t v1130 = vaddq_s16(v1127, v1129);
+ int16x8_t v1131 = vsubq_s16(v902, v904);
+ int16x8_t v1132 = vsubq_s16(v906, v908);
+ int16x8_t v1133_tmp = vqrdmulhq_n_s16(v1132, 1988);
+ int16x8_t v1133 = vaddq_s16(v1133_tmp, v1132);
+ int16x8_t v1134 = vaddq_s16(v1131, v1133);
+ int16x8_t v1135 = vqrdmulhq_n_s16(v1134, 19102);
+ int16x8_t v1136 = vaddq_s16(v1130, v1135);
+ int16x8_t v1137 = vsubq_s16(v914, v917);
+ int16x8_t v1138 = vsubq_s16(v919, v921);
+ int16x8_t v1139_tmp = vqrdmulhq_n_s16(v1138, 1988);
+ int16x8_t v1139 = vaddq_s16(v1139_tmp, v1138);
+ int16x8_t v1140 = vaddq_s16(v1137, v1139);
+ int16x8_t v1141 = vsubq_s16(v925, v927);
+ int16x8_t v1142 = vsubq_s16(v929, v931);
+ int16x8_t v1143_tmp = vqrdmulhq_n_s16(v1142, 1988);
+ int16x8_t v1143 = vaddq_s16(v1143_tmp, v1142);
+ int16x8_t v1144 = vaddq_s16(v1141, v1143);
+ int16x8_t v1145 = vqrdmulhq_n_s16(v1144, 19102);
+ int16x8_t v1146 = vaddq_s16(v1140, v1145);
+ int16x8_t v1147 = vqrdmulhq_n_s16(v1146, 17000);
+ int16x8_t v1148 = vaddq_s16(v1136, v1147);
+ int16x8_t v1149 = vsubq_s16(v939, v941);
+ int16x8_t v1150 = vsubq_s16(v943, v945);
+ int16x8_t v1151_tmp = vqrdmulhq_n_s16(v1150, 1988);
+ int16x8_t v1151 = vaddq_s16(v1151_tmp, v1150);
+ int16x8_t v1152 = vaddq_s16(v1149, v1151);
+ int16x8_t v1153 = vsubq_s16(v949, v951);
+ int16x8_t v1154 = vsubq_s16(v953, v955);
+ int16x8_t v1155_tmp = vqrdmulhq_n_s16(v1154, 1988);
+ int16x8_t v1155 = vaddq_s16(v1155_tmp, v1154);
+ int16x8_t v1156 = vaddq_s16(v1153, v1155);
+ int16x8_t v1157 = vqrdmulhq_n_s16(v1156, 19102);
+ int16x8_t v1158 = vaddq_s16(v1152, v1157);
+ int16x8_t v1159 = vsubq_s16(v961, v963);
+ int16x8_t v1160 = vsubq_s16(v965, v967);
+ int16x8_t v1161_tmp = vqrdmulhq_n_s16(v1160, 1988);
+ int16x8_t v1161 = vaddq_s16(v1161_tmp, v1160);
+ int16x8_t v1162 = vaddq_s16(v1159, v1161);
+ int16x8_t v1163 = vsubq_s16(v971, v973);
+ int16x8_t v1164 = vsubq_s16(v975, v977);
+ int16x8_t v1165_tmp = vqrdmulhq_n_s16(v1164, 1988);
+ int16x8_t v1165 = vaddq_s16(v1165_tmp, v1164);
+ int16x8_t v1166 = vaddq_s16(v1163, v1165);
+ int16x8_t v1167 = vqrdmulhq_n_s16(v1166, 19102);
+ int16x8_t v1168 = vaddq_s16(v1162, v1167);
+ int16x8_t v1169 = vqrdmulhq_n_s16(v1168, 17000);
+ int16x8_t v1170 = vaddq_s16(v1158, v1169);
+ int16x8_t v1171 = vqrdmulhq_n_s16(v1170, 16534);
+ int16x8_t v1172 = vaddq_s16(v1148, v1171);
+ int16x8_t v1173 = vsubq_s16(v705, v710);
+ int16x8_t v1174 = vsubq_s16(v715, v720);
+ int16x8_t v1175_tmp = vqrdmulhq_n_s16(v1174, 23673);
+ int16x8_t v1175 = vaddq_s16(v1175_tmp, v1174);
+ int16x8_t v1176 = vaddq_s16(v1173, v1175);
+ int16x8_t v1177 = vsubq_s16(v727, v732);
+ int16x8_t v1178 = vsubq_s16(v737, v742);
+ int16x8_t v1179_tmp = vqrdmulhq_n_s16(v1178, 23673);
+ int16x8_t v1179 = vaddq_s16(v1179_tmp, v1178);
+ int16x8_t v1180 = vaddq_s16(v1177, v1179);
+ int16x8_t v1181 = vqrdmulhq_n_s16(v1180, 20398);
+ int16x8_t v1182 = vaddq_s16(v1176, v1181);
+ int16x8_t v1183 = vsubq_s16(v751, v756);
+ int16x8_t v1184 = vsubq_s16(v761, v766);
+ int16x8_t v1185_tmp = vqrdmulhq_n_s16(v1184, 23673);
+ int16x8_t v1185 = vaddq_s16(v1185_tmp, v1184);
+ int16x8_t v1186 = vaddq_s16(v1183, v1185);
+ int16x8_t v1187 = vsubq_s16(v773, v778);
+ int16x8_t v1188 = vsubq_s16(v783, v788);
+ int16x8_t v1189_tmp = vqrdmulhq_n_s16(v1188, 23673);
+ int16x8_t v1189 = vaddq_s16(v1189_tmp, v1188);
+ int16x8_t v1190 = vaddq_s16(v1187, v1189);
+ int16x8_t v1191 = vqrdmulhq_n_s16(v1190, 20398);
+ int16x8_t v1192 = vaddq_s16(v1186, v1191);
+ int16x8_t v1193 = vqrdmulhq_n_s16(v1192, 17255);
+ int16x8_t v1194 = vaddq_s16(v1182, v1193);
+ int16x8_t v1195 = vsubq_s16(v799, v804);
+ int16x8_t v1196 = vsubq_s16(v809, v814);
+ int16x8_t v1197_tmp = vqrdmulhq_n_s16(v1196, 23673);
+ int16x8_t v1197 = vaddq_s16(v1197_tmp, v1196);
+ int16x8_t v1198 = vaddq_s16(v1195, v1197);
+ int16x8_t v1199 = vsubq_s16(v821, v826);
+ int16x8_t v1200 = vsubq_s16(v831, v836);
+ int16x8_t v1201_tmp = vqrdmulhq_n_s16(v1200, 23673);
+ int16x8_t v1201 = vaddq_s16(v1201_tmp, v1200);
+ int16x8_t v1202 = vaddq_s16(v1199, v1201);
+ int16x8_t v1203 = vqrdmulhq_n_s16(v1202, 20398);
+ int16x8_t v1204 = vaddq_s16(v1198, v1203);
+ int16x8_t v1205 = vsubq_s16(v845, v850);
+ int16x8_t v1206 = vsubq_s16(v855, v860);
+ int16x8_t v1207_tmp = vqrdmulhq_n_s16(v1206, 23673);
+ int16x8_t v1207 = vaddq_s16(v1207_tmp, v1206);
+ int16x8_t v1208 = vaddq_s16(v1205, v1207);
+ int16x8_t v1209 = vsubq_s16(v867, v872);
+ int16x8_t v1210 = vsubq_s16(v877, v882);
+ int16x8_t v1211_tmp = vqrdmulhq_n_s16(v1210, 23673);
+ int16x8_t v1211 = vaddq_s16(v1211_tmp, v1210);
+ int16x8_t v1212 = vaddq_s16(v1209, v1211);
+ int16x8_t v1213 = vqrdmulhq_n_s16(v1212, 20398);
+ int16x8_t v1214 = vaddq_s16(v1208, v1213);
+ int16x8_t v1215 = vqrdmulhq_n_s16(v1214, 17255);
+ int16x8_t v1216 = vaddq_s16(v1204, v1215);
+ int16x8_t v1217 = vqrdmulhq_n_s16(v1216, 16595);
+ int16x8_t v1218 = vaddq_s16(v1194, v1217);
+ int16x8_t v1219 = vsubq_s16(v9, v24);
+ int16x8_t v1220 = vsubq_s16(v42, v58);
+ int16x8_t v1221_tmp = vqrdmulhq_n_s16(v1220, 3314);
+ int16x8_t v1221 = vmlaq_n_s16(v1221_tmp, v1220, 5);
+ int16x8_t v1222 = vaddq_s16(v1219, v1221);
+ int16x8_t v1223 = vsubq_s16(v78, v101);
+ int16x8_t v1224 = vsubq_s16(v119, v136);
+ int16x8_t v1225_tmp = vqrdmulhq_n_s16(v1224, 3314);
+ int16x8_t v1225 = vmlaq_n_s16(v1225_tmp, v1224, 5);
+ int16x8_t v1226 = vaddq_s16(v1223, v1225);
+ int16x8_t v1227 = vqrdmulhq_n_s16(v1226, 22112);
+ int16x8_t v1228 = vaddq_s16(v1222, v1227);
+ int16x8_t v1229 = vsubq_s16(v158, v181);
+ int16x8_t v1230 = vsubq_s16(v213, v231);
+ int16x8_t v1231_tmp = vqrdmulhq_n_s16(v1230, 3314);
+ int16x8_t v1231 = vmlaq_n_s16(v1231_tmp, v1230, 5);
+ int16x8_t v1232 = vaddq_s16(v1229, v1231);
+ int16x8_t v1233 = vsubq_s16(v251, v274);
+ int16x8_t v1234 = vsubq_s16(v292, v310);
+ int16x8_t v1235_tmp = vqrdmulhq_n_s16(v1234, 3314);
+ int16x8_t v1235 = vmlaq_n_s16(v1235_tmp, v1234, 5);
+ int16x8_t v1236 = vaddq_s16(v1233, v1235);
+ int16x8_t v1237 = vqrdmulhq_n_s16(v1236, 22112);
+ int16x8_t v1238 = vaddq_s16(v1232, v1237);
+ int16x8_t v1239 = vqrdmulhq_n_s16(v1238, 17561);
+ int16x8_t v1240 = vaddq_s16(v1228, v1239);
+ int16x8_t v1241 = vsubq_s16(v334, v357);
+ int16x8_t v1242 = vsubq_s16(v389, v407);
+ int16x8_t v1243_tmp = vqrdmulhq_n_s16(v1242, 3314);
+ int16x8_t v1243 = vmlaq_n_s16(v1243_tmp, v1242, 5);
+ int16x8_t v1244 = vaddq_s16(v1241, v1243);
+ int16x8_t v1245 = vsubq_s16(v441, v480);
+ int16x8_t v1246 = vsubq_s16(v498, v517);
+ int16x8_t v1247_tmp = vqrdmulhq_n_s16(v1246, 3314);
+ int16x8_t v1247 = vmlaq_n_s16(v1247_tmp, v1246, 5);
+ int16x8_t v1248 = vaddq_s16(v1245, v1247);
+ int16x8_t v1249 = vqrdmulhq_n_s16(v1248, 22112);
+ int16x8_t v1250 = vaddq_s16(v1244, v1249);
+ int16x8_t v1251 = vsubq_s16(v539, v562);
+ int16x8_t v1252 = vsubq_s16(v594, v612);
+ int16x8_t v1253_tmp = vqrdmulhq_n_s16(v1252, 3314);
+ int16x8_t v1253 = vmlaq_n_s16(v1253_tmp, v1252, 5);
+ int16x8_t v1254 = vaddq_s16(v1251, v1253);
+ int16x8_t v1255 = vsubq_s16(v632, v655);
+ int16x8_t v1256 = vsubq_s16(v673, v692);
+ int16x8_t v1257_tmp = vqrdmulhq_n_s16(v1256, 3314);
+ int16x8_t v1257 = vmlaq_n_s16(v1257_tmp, v1256, 5);
+ int16x8_t v1258 = vaddq_s16(v1255, v1257);
+ int16x8_t v1259 = vqrdmulhq_n_s16(v1258, 22112);
+ int16x8_t v1260 = vaddq_s16(v1254, v1259);
+ int16x8_t v1261 = vqrdmulhq_n_s16(v1260, 17561);
+ int16x8_t v1262 = vaddq_s16(v1250, v1261);
+ int16x8_t v1263 = vqrdmulhq_n_s16(v1262, 16666);
+ int16x8_t v1264 = vaddq_s16(v1240, v1263);
+ int16x8_t v1265 = vsubq_s16(v1219, v1221);
+ int16x8_t v1266 = vsubq_s16(v1223, v1225);
+ int16x8_t v1267 = vqrdmulhq_n_s16(v1266, 24397);
+ int16x8_t v1268 = vaddq_s16(v1265, v1267);
+ int16x8_t v1269 = vsubq_s16(v1229, v1231);
+ int16x8_t v1270 = vsubq_s16(v1233, v1235);
+ int16x8_t v1271 = vqrdmulhq_n_s16(v1270, 24397);
+ int16x8_t v1272 = vaddq_s16(v1269, v1271);
+ int16x8_t v1273 = vqrdmulhq_n_s16(v1272, 17921);
+ int16x8_t v1274 = vaddq_s16(v1268, v1273);
+ int16x8_t v1275 = vsubq_s16(v1241, v1243);
+ int16x8_t v1276 = vsubq_s16(v1245, v1247);
+ int16x8_t v1277 = vqrdmulhq_n_s16(v1276, 24397);
+ int16x8_t v1278 = vaddq_s16(v1275, v1277);
+ int16x8_t v1279 = vsubq_s16(v1251, v1253);
+ int16x8_t v1280 = vsubq_s16(v1255, v1257);
+ int16x8_t v1281 = vqrdmulhq_n_s16(v1280, 24397);
+ int16x8_t v1282 = vaddq_s16(v1279, v1281);
+ int16x8_t v1283 = vqrdmulhq_n_s16(v1282, 17921);
+ int16x8_t v1284 = vaddq_s16(v1278, v1283);
+ int16x8_t v1285 = vqrdmulhq_n_s16(v1284, 16747);
+ int16x8_t v1286 = vaddq_s16(v1274, v1285);
+ int16x8_t v1287 = vsubq_s16(v1173, v1175);
+ int16x8_t v1288 = vsubq_s16(v1177, v1179);
+ int16x8_t v1289 = vqrdmulhq_n_s16(v1288, 27504);
+ int16x8_t v1290 = vaddq_s16(v1287, v1289);
+ int16x8_t v1291 = vsubq_s16(v1183, v1185);
+ int16x8_t v1292 = vsubq_s16(v1187, v1189);
+ int16x8_t v1293 = vqrdmulhq_n_s16(v1292, 27504);
+ int16x8_t v1294 = vaddq_s16(v1291, v1293);
+ int16x8_t v1295 = vqrdmulhq_n_s16(v1294, 18343);
+ int16x8_t v1296 = vaddq_s16(v1290, v1295);
+ int16x8_t v1297 = vsubq_s16(v1195, v1197);
+ int16x8_t v1298 = vsubq_s16(v1199, v1201);
+ int16x8_t v1299 = vqrdmulhq_n_s16(v1298, 27504);
+ int16x8_t v1300 = vaddq_s16(v1297, v1299);
+ int16x8_t v1301 = vsubq_s16(v1205, v1207);
+ int16x8_t v1302 = vsubq_s16(v1209, v1211);
+ int16x8_t v1303 = vqrdmulhq_n_s16(v1302, 27504);
+ int16x8_t v1304 = vaddq_s16(v1301, v1303);
+ int16x8_t v1305 = vqrdmulhq_n_s16(v1304, 18343);
+ int16x8_t v1306 = vaddq_s16(v1300, v1305);
+ int16x8_t v1307 = vqrdmulhq_n_s16(v1306, 16840);
+ int16x8_t v1308 = vaddq_s16(v1296, v1307);
+ int16x8_t v1309 = vsubq_s16(v1127, v1129);
+ int16x8_t v1310 = vsubq_s16(v1131, v1133);
+ int16x8_t v1311 = vqrdmulhq_n_s16(v1310, 31869);
+ int16x8_t v1312 = vaddq_s16(v1309, v1311);
+ int16x8_t v1313 = vsubq_s16(v1137, v1139);
+ int16x8_t v1314 = vsubq_s16(v1141, v1143);
+ int16x8_t v1315 = vqrdmulhq_n_s16(v1314, 31869);
+ int16x8_t v1316 = vaddq_s16(v1313, v1315);
+ int16x8_t v1317 = vqrdmulhq_n_s16(v1316, 18830);
+ int16x8_t v1318 = vaddq_s16(v1312, v1317);
+ int16x8_t v1319 = vsubq_s16(v1149, v1151);
+ int16x8_t v1320 = vsubq_s16(v1153, v1155);
+ int16x8_t v1321 = vqrdmulhq_n_s16(v1320, 31869);
+ int16x8_t v1322 = vaddq_s16(v1319, v1321);
+ int16x8_t v1323 = vsubq_s16(v1159, v1161);
+ int16x8_t v1324 = vsubq_s16(v1163, v1165);
+ int16x8_t v1325 = vqrdmulhq_n_s16(v1324, 31869);
+ int16x8_t v1326 = vaddq_s16(v1323, v1325);
+ int16x8_t v1327 = vqrdmulhq_n_s16(v1326, 18830);
+ int16x8_t v1328 = vaddq_s16(v1322, v1327);
+ int16x8_t v1329 = vqrdmulhq_n_s16(v1328, 16944);
+ int16x8_t v1330 = vaddq_s16(v1318, v1329);
+ int16x8_t v1331 = vsubq_s16(v1081, v1083);
+ int16x8_t v1332 = vsubq_s16(v1085, v1087);
+ int16x8_t v1333_tmp = vqrdmulhq_n_s16(v1332, 5552);
+ int16x8_t v1333 = vaddq_s16(v1333_tmp, v1332);
+ int16x8_t v1334 = vaddq_s16(v1331, v1333);
+ int16x8_t v1335 = vsubq_s16(v1091, v1093);
+ int16x8_t v1336 = vsubq_s16(v1095, v1097);
+ int16x8_t v1337_tmp = vqrdmulhq_n_s16(v1336, 5552);
+ int16x8_t v1337 = vaddq_s16(v1337_tmp, v1336);
+ int16x8_t v1338 = vaddq_s16(v1335, v1337);
+ int16x8_t v1339 = vqrdmulhq_n_s16(v1338, 19393);
+ int16x8_t v1340 = vaddq_s16(v1334, v1339);
+ int16x8_t v1341 = vsubq_s16(v1103, v1105);
+ int16x8_t v1342 = vsubq_s16(v1107, v1109);
+ int16x8_t v1343_tmp = vqrdmulhq_n_s16(v1342, 5552);
+ int16x8_t v1343 = vaddq_s16(v1343_tmp, v1342);
+ int16x8_t v1344 = vaddq_s16(v1341, v1343);
+ int16x8_t v1345 = vsubq_s16(v1113, v1115);
+ int16x8_t v1346 = vsubq_s16(v1117, v1119);
+ int16x8_t v1347_tmp = vqrdmulhq_n_s16(v1346, 5552);
+ int16x8_t v1347 = vaddq_s16(v1347_tmp, v1346);
+ int16x8_t v1348 = vaddq_s16(v1345, v1347);
+ int16x8_t v1349 = vqrdmulhq_n_s16(v1348, 19393);
+ int16x8_t v1350 = vaddq_s16(v1344, v1349);
+ int16x8_t v1351 = vqrdmulhq_n_s16(v1350, 17059);
+ int16x8_t v1352 = vaddq_s16(v1340, v1351);
+ int16x8_t v1353 = vsubq_s16(v990, v995);
+ int16x8_t v1354 = vsubq_s16(v1000, v1005);
+ int16x8_t v1355_tmp = vqrdmulhq_n_s16(v1354, 15865);
+ int16x8_t v1355 = vaddq_s16(v1355_tmp, v1354);
+ int16x8_t v1356 = vaddq_s16(v1353, v1355);
+ int16x8_t v1357 = vsubq_s16(v1012, v1017);
+ int16x8_t v1358 = vsubq_s16(v1022, v1027);
+ int16x8_t v1359_tmp = vqrdmulhq_n_s16(v1358, 15865);
+ int16x8_t v1359 = vaddq_s16(v1359_tmp, v1358);
+ int16x8_t v1360 = vaddq_s16(v1357, v1359);
+ int16x8_t v1361 = vqrdmulhq_n_s16(v1360, 20040);
+ int16x8_t v1362 = vaddq_s16(v1356, v1361);
+ int16x8_t v1363 = vsubq_s16(v1036, v1041);
+ int16x8_t v1364 = vsubq_s16(v1046, v1051);
+ int16x8_t v1365_tmp = vqrdmulhq_n_s16(v1364, 15865);
+ int16x8_t v1365 = vaddq_s16(v1365_tmp, v1364);
+ int16x8_t v1366 = vaddq_s16(v1363, v1365);
+ int16x8_t v1367 = vsubq_s16(v1058, v1063);
+ int16x8_t v1368 = vsubq_s16(v1068, v1073);
+ int16x8_t v1369_tmp = vqrdmulhq_n_s16(v1368, 15865);
+ int16x8_t v1369 = vaddq_s16(v1369_tmp, v1368);
+ int16x8_t v1370 = vaddq_s16(v1367, v1369);
+ int16x8_t v1371 = vqrdmulhq_n_s16(v1370, 20040);
+ int16x8_t v1372 = vaddq_s16(v1366, v1371);
+ int16x8_t v1373 = vqrdmulhq_n_s16(v1372, 17187);
+ int16x8_t v1374 = vaddq_s16(v1362, v1373);
+ int16x8_t v1375 = vsubq_s16(v895, v900);
+ int16x8_t v1376 = vsubq_s16(v905, v910);
+ int16x8_t v1377_tmp = vqrdmulhq_n_s16(v1376, 1893);
+ int16x8_t v1377 = vmlaq_n_s16(v1377_tmp, v1376, 2);
+ int16x8_t v1378 = vaddq_s16(v1375, v1377);
+ int16x8_t v1379 = vsubq_s16(v918, v923);
+ int16x8_t v1380 = vsubq_s16(v928, v933);
+ int16x8_t v1381_tmp = vqrdmulhq_n_s16(v1380, 1893);
+ int16x8_t v1381 = vmlaq_n_s16(v1381_tmp, v1380, 2);
+ int16x8_t v1382 = vaddq_s16(v1379, v1381);
+ int16x8_t v1383 = vqrdmulhq_n_s16(v1382, 20783);
+ int16x8_t v1384 = vaddq_s16(v1378, v1383);
+ int16x8_t v1385 = vsubq_s16(v942, v947);
+ int16x8_t v1386 = vsubq_s16(v952, v957);
+ int16x8_t v1387_tmp = vqrdmulhq_n_s16(v1386, 1893);
+ int16x8_t v1387 = vmlaq_n_s16(v1387_tmp, v1386, 2);
+ int16x8_t v1388 = vaddq_s16(v1385, v1387);
+ int16x8_t v1389 = vsubq_s16(v964, v969);
+ int16x8_t v1390 = vsubq_s16(v974, v979);
+ int16x8_t v1391_tmp = vqrdmulhq_n_s16(v1390, 1893);
+ int16x8_t v1391 = vmlaq_n_s16(v1391_tmp, v1390, 2);
+ int16x8_t v1392 = vaddq_s16(v1389, v1391);
+ int16x8_t v1393 = vqrdmulhq_n_s16(v1392, 20783);
+ int16x8_t v1394 = vaddq_s16(v1388, v1393);
+ int16x8_t v1395 = vqrdmulhq_n_s16(v1394, 17326);
+ int16x8_t v1396 = vaddq_s16(v1384, v1395);
+ int16x8_t v1397 = vsubq_s16(v711, v722);
+ int16x8_t v1398 = vsubq_s16(v733, v744);
+ int16x8_t v1399_tmp = vqrdmulhq_n_s16(v1398, 13357);
+ int16x8_t v1399 = vmlaq_n_s16(v1399_tmp, v1398, 3);
+ int16x8_t v1400 = vaddq_s16(v1397, v1399);
+ int16x8_t v1401 = vsubq_s16(v757, v768);
+ int16x8_t v1402 = vsubq_s16(v779, v790);
+ int16x8_t v1403_tmp = vqrdmulhq_n_s16(v1402, 13357);
+ int16x8_t v1403 = vmlaq_n_s16(v1403_tmp, v1402, 3);
+ int16x8_t v1404 = vaddq_s16(v1401, v1403);
+ int16x8_t v1405 = vqrdmulhq_n_s16(v1404, 21637);
+ int16x8_t v1406 = vaddq_s16(v1400, v1405);
+ int16x8_t v1407 = vsubq_s16(v805, v816);
+ int16x8_t v1408 = vsubq_s16(v827, v838);
+ int16x8_t v1409_tmp = vqrdmulhq_n_s16(v1408, 13357);
+ int16x8_t v1409 = vmlaq_n_s16(v1409_tmp, v1408, 3);
+ int16x8_t v1410 = vaddq_s16(v1407, v1409);
+ int16x8_t v1411 = vsubq_s16(v851, v862);
+ int16x8_t v1412 = vsubq_s16(v873, v884);
+ int16x8_t v1413_tmp = vqrdmulhq_n_s16(v1412, 13357);
+ int16x8_t v1413 = vmlaq_n_s16(v1413_tmp, v1412, 3);
+ int16x8_t v1414 = vaddq_s16(v1411, v1413);
+ int16x8_t v1415 = vqrdmulhq_n_s16(v1414, 21637);
+ int16x8_t v1416 = vaddq_s16(v1410, v1415);
+ int16x8_t v1417 = vqrdmulhq_n_s16(v1416, 17479);
+ int16x8_t v1418 = vaddq_s16(v1406, v1417);
+ int16x8_t v1419 = vsubq_s16(v25, v60);
+ int16x8_t v1420 = vsubq_s16(v102, v138);
+ int16x8_t v1421_tmp = vqrdmulhq_n_s16(v1420, 6226);
+ int16x8_t v1421 = vmlaq_n_s16(v1421_tmp, v1420, 10);
+ int16x8_t v1422 = vaddq_s16(v1419, v1421);
+ int16x8_t v1423 = vsubq_s16(v182, v233);
+ int16x8_t v1424 = vsubq_s16(v275, v312);
+ int16x8_t v1425_tmp = vqrdmulhq_n_s16(v1424, 6226);
+ int16x8_t v1425 = vmlaq_n_s16(v1425_tmp, v1424, 10);
+ int16x8_t v1426 = vaddq_s16(v1423, v1425);
+ int16x8_t v1427 = vqrdmulhq_n_s16(v1426, 22622);
+ int16x8_t v1428 = vaddq_s16(v1422, v1427);
+ int16x8_t v1429 = vsubq_s16(v358, v409);
+ int16x8_t v1430 = vsubq_s16(v481, v519);
+ int16x8_t v1431_tmp = vqrdmulhq_n_s16(v1430, 6226);
+ int16x8_t v1431 = vmlaq_n_s16(v1431_tmp, v1430, 10);
+ int16x8_t v1432 = vaddq_s16(v1429, v1431);
+ int16x8_t v1433 = vsubq_s16(v563, v614);
+ int16x8_t v1434 = vsubq_s16(v656, v694);
+ int16x8_t v1435_tmp = vqrdmulhq_n_s16(v1434, 6226);
+ int16x8_t v1435 = vmlaq_n_s16(v1435_tmp, v1434, 10);
+ int16x8_t v1436 = vaddq_s16(v1433, v1435);
+ int16x8_t v1437 = vqrdmulhq_n_s16(v1436, 22622);
+ int16x8_t v1438 = vaddq_s16(v1432, v1437);
+ int16x8_t v1439 = vqrdmulhq_n_s16(v1438, 17646);
+ int16x8_t v1440 = vaddq_s16(v1428, v1439);
+ int16x8_t v1441 = vsubq_s16(v1419, v1421);
+ int16x8_t v1442 = vsubq_s16(v1423, v1425);
+ int16x8_t v1443 = vqrdmulhq_n_s16(v1442, 23761);
+ int16x8_t v1444 = vaddq_s16(v1441, v1443);
+ int16x8_t v1445 = vsubq_s16(v1429, v1431);
+ int16x8_t v1446 = vsubq_s16(v1433, v1435);
+ int16x8_t v1447 = vqrdmulhq_n_s16(v1446, 23761);
+ int16x8_t v1448 = vaddq_s16(v1445, v1447);
+ int16x8_t v1449 = vqrdmulhq_n_s16(v1448, 17826);
+ int16x8_t v1450 = vaddq_s16(v1444, v1449);
+ int16x8_t v1451 = vsubq_s16(v1397, v1399);
+ int16x8_t v1452 = vsubq_s16(v1401, v1403);
+ int16x8_t v1453 = vqrdmulhq_n_s16(v1452, 25084);
+ int16x8_t v1454 = vaddq_s16(v1451, v1453);
+ int16x8_t v1455 = vsubq_s16(v1407, v1409);
+ int16x8_t v1456 = vsubq_s16(v1411, v1413);
+ int16x8_t v1457 = vqrdmulhq_n_s16(v1456, 25084);
+ int16x8_t v1458 = vaddq_s16(v1455, v1457);
+ int16x8_t v1459 = vqrdmulhq_n_s16(v1458, 18021);
+ int16x8_t v1460 = vaddq_s16(v1454, v1459);
+ int16x8_t v1461 = vsubq_s16(v1375, v1377);
+ int16x8_t v1462 = vsubq_s16(v1379, v1381);
+ int16x8_t v1463 = vqrdmulhq_n_s16(v1462, 26631);
+ int16x8_t v1464 = vaddq_s16(v1461, v1463);
+ int16x8_t v1465 = vsubq_s16(v1385, v1387);
+ int16x8_t v1466 = vsubq_s16(v1389, v1391);
+ int16x8_t v1467 = vqrdmulhq_n_s16(v1466, 26631);
+ int16x8_t v1468 = vaddq_s16(v1465, v1467);
+ int16x8_t v1469 = vqrdmulhq_n_s16(v1468, 18231);
+ int16x8_t v1470 = vaddq_s16(v1464, v1469);
+ int16x8_t v1471 = vsubq_s16(v1353, v1355);
+ int16x8_t v1472 = vsubq_s16(v1357, v1359);
+ int16x8_t v1473 = vqrdmulhq_n_s16(v1472, 28454);
+ int16x8_t v1474 = vaddq_s16(v1471, v1473);
+ int16x8_t v1475 = vsubq_s16(v1363, v1365);
+ int16x8_t v1476 = vsubq_s16(v1367, v1369);
+ int16x8_t v1477 = vqrdmulhq_n_s16(v1476, 28454);
+ int16x8_t v1478 = vaddq_s16(v1475, v1477);
+ int16x8_t v1479 = vqrdmulhq_n_s16(v1478, 18458);
+ int16x8_t v1480 = vaddq_s16(v1474, v1479);
+ int16x8_t v1481 = vsubq_s16(v1331, v1333);
+ int16x8_t v1482 = vsubq_s16(v1335, v1337);
+ int16x8_t v1483 = vqrdmulhq_n_s16(v1482, 30624);
+ int16x8_t v1484 = vaddq_s16(v1481, v1483);
+ int16x8_t v1485 = vsubq_s16(v1341, v1343);
+ int16x8_t v1486 = vsubq_s16(v1345, v1347);
+ int16x8_t v1487 = vqrdmulhq_n_s16(v1486, 30624);
+ int16x8_t v1488 = vaddq_s16(v1485, v1487);
+ int16x8_t v1489 = vqrdmulhq_n_s16(v1488, 18702);
+ int16x8_t v1490 = vaddq_s16(v1484, v1489);
+ int16x8_t v1491 = vsubq_s16(v1309, v1311);
+ int16x8_t v1492 = vsubq_s16(v1313, v1315);
+ int16x8_t v1493_tmp = vqrdmulhq_n_s16(v1492, 472);
+ int16x8_t v1493 = vaddq_s16(v1493_tmp, v1492);
+ int16x8_t v1494 = vaddq_s16(v1491, v1493);
+ int16x8_t v1495 = vsubq_s16(v1319, v1321);
+ int16x8_t v1496 = vsubq_s16(v1323, v1325);
+ int16x8_t v1497_tmp = vqrdmulhq_n_s16(v1496, 472);
+ int16x8_t v1497 = vaddq_s16(v1497_tmp, v1496);
+ int16x8_t v1498 = vaddq_s16(v1495, v1497);
+ int16x8_t v1499 = vqrdmulhq_n_s16(v1498, 18964);
+ int16x8_t v1500 = vaddq_s16(v1494, v1499);
+ int16x8_t v1501 = vsubq_s16(v1287, v1289);
+ int16x8_t v1502 = vsubq_s16(v1291, v1293);
+ int16x8_t v1503_tmp = vqrdmulhq_n_s16(v1502, 3672);
+ int16x8_t v1503 = vaddq_s16(v1503_tmp, v1502);
+ int16x8_t v1504 = vaddq_s16(v1501, v1503);
+ int16x8_t v1505 = vsubq_s16(v1297, v1299);
+ int16x8_t v1506 = vsubq_s16(v1301, v1303);
+ int16x8_t v1507_tmp = vqrdmulhq_n_s16(v1506, 3672);
+ int16x8_t v1507 = vaddq_s16(v1507_tmp, v1506);
+ int16x8_t v1508 = vaddq_s16(v1505, v1507);
+ int16x8_t v1509 = vqrdmulhq_n_s16(v1508, 19245);
+ int16x8_t v1510 = vaddq_s16(v1504, v1509);
+ int16x8_t v1511 = vsubq_s16(v1265, v1267);
+ int16x8_t v1512 = vsubq_s16(v1269, v1271);
+ int16x8_t v1513_tmp = vqrdmulhq_n_s16(v1512, 7662);
+ int16x8_t v1513 = vaddq_s16(v1513_tmp, v1512);
+ int16x8_t v1514 = vaddq_s16(v1511, v1513);
+ int16x8_t v1515 = vsubq_s16(v1275, v1277);
+ int16x8_t v1516 = vsubq_s16(v1279, v1281);
+ int16x8_t v1517_tmp = vqrdmulhq_n_s16(v1516, 7662);
+ int16x8_t v1517 = vaddq_s16(v1517_tmp, v1516);
+ int16x8_t v1518 = vaddq_s16(v1515, v1517);
+ int16x8_t v1519 = vqrdmulhq_n_s16(v1518, 19546);
+ int16x8_t v1520 = vaddq_s16(v1514, v1519);
+ int16x8_t v1521 = vsubq_s16(v1222, v1227);
+ int16x8_t v1522 = vsubq_s16(v1232, v1237);
+ int16x8_t v1523_tmp = vqrdmulhq_n_s16(v1522, 12756);
+ int16x8_t v1523 = vaddq_s16(v1523_tmp, v1522);
+ int16x8_t v1524 = vaddq_s16(v1521, v1523);
+ int16x8_t v1525 = vsubq_s16(v1244, v1249);
+ int16x8_t v1526 = vsubq_s16(v1254, v1259);
+ int16x8_t v1527_tmp = vqrdmulhq_n_s16(v1526, 12756);
+ int16x8_t v1527 = vaddq_s16(v1527_tmp, v1526);
+ int16x8_t v1528 = vaddq_s16(v1525, v1527);
+ int16x8_t v1529 = vqrdmulhq_n_s16(v1528, 19869);
+ int16x8_t v1530 = vaddq_s16(v1524, v1529);
+ int16x8_t v1531 = vsubq_s16(v1176, v1181);
+ int16x8_t v1532 = vsubq_s16(v1186, v1191);
+ int16x8_t v1533_tmp = vqrdmulhq_n_s16(v1532, 19463);
+ int16x8_t v1533 = vaddq_s16(v1533_tmp, v1532);
+ int16x8_t v1534 = vaddq_s16(v1531, v1533);
+ int16x8_t v1535 = vsubq_s16(v1198, v1203);
+ int16x8_t v1536 = vsubq_s16(v1208, v1213);
+ int16x8_t v1537_tmp = vqrdmulhq_n_s16(v1536, 19463);
+ int16x8_t v1537 = vaddq_s16(v1537_tmp, v1536);
+ int16x8_t v1538 = vaddq_s16(v1535, v1537);
+ int16x8_t v1539 = vqrdmulhq_n_s16(v1538, 20216);
+ int16x8_t v1540 = vaddq_s16(v1534, v1539);
+ int16x8_t v1541 = vsubq_s16(v1130, v1135);
+ int16x8_t v1542 = vsubq_s16(v1140, v1145);
+ int16x8_t v1543_tmp = vqrdmulhq_n_s16(v1542, 28661);
+ int16x8_t v1543 = vaddq_s16(v1543_tmp, v1542);
+ int16x8_t v1544 = vaddq_s16(v1541, v1543);
+ int16x8_t v1545 = vsubq_s16(v1152, v1157);
+ int16x8_t v1546 = vsubq_s16(v1162, v1167);
+ int16x8_t v1547_tmp = vqrdmulhq_n_s16(v1546, 28661);
+ int16x8_t v1547 = vaddq_s16(v1547_tmp, v1546);
+ int16x8_t v1548 = vaddq_s16(v1545, v1547);
+ int16x8_t v1549 = vqrdmulhq_n_s16(v1548, 20587);
+ int16x8_t v1550 = vaddq_s16(v1544, v1549);
+ int16x8_t v1551 = vsubq_s16(v1084, v1089);
+ int16x8_t v1552 = vsubq_s16(v1094, v1099);
+ int16x8_t v1553_tmp = vqrdmulhq_n_s16(v1552, 9242);
+ int16x8_t v1553 = vmlaq_n_s16(v1553_tmp, v1552, 2);
+ int16x8_t v1554 = vaddq_s16(v1551, v1553);
+ int16x8_t v1555 = vsubq_s16(v1106, v1111);
+ int16x8_t v1556 = vsubq_s16(v1116, v1121);
+ int16x8_t v1557_tmp = vqrdmulhq_n_s16(v1556, 9242);
+ int16x8_t v1557 = vmlaq_n_s16(v1557_tmp, v1556, 2);
+ int16x8_t v1558 = vaddq_s16(v1555, v1557);
+ int16x8_t v1559 = vqrdmulhq_n_s16(v1558, 20985);
+ int16x8_t v1560 = vaddq_s16(v1554, v1559);
+ int16x8_t v1561 = vsubq_s16(v996, v1007);
+ int16x8_t v1562 = vsubq_s16(v1018, v1029);
+ int16x8_t v1563_tmp = vqrdmulhq_n_s16(v1562, 30298);
+ int16x8_t v1563 = vmlaq_n_s16(v1563_tmp, v1562, 2);
+ int16x8_t v1564 = vaddq_s16(v1561, v1563);
+ int16x8_t v1565 = vsubq_s16(v1042, v1053);
+ int16x8_t v1566 = vsubq_s16(v1064, v1075);
+ int16x8_t v1567_tmp = vqrdmulhq_n_s16(v1566, 30298);
+ int16x8_t v1567 = vmlaq_n_s16(v1567_tmp, v1566, 2);
+ int16x8_t v1568 = vaddq_s16(v1565, v1567);
+ int16x8_t v1569 = vqrdmulhq_n_s16(v1568, 21412);
+ int16x8_t v1570 = vaddq_s16(v1564, v1569);
+ int16x8_t v1571 = vsubq_s16(v901, v912);
+ int16x8_t v1572 = vsubq_s16(v924, v935);
+ int16x8_t v1573_tmp = vqrdmulhq_n_s16(v1572, 2773);
+ int16x8_t v1573 = vmlaq_n_s16(v1573_tmp, v1572, 4);
+ int16x8_t v1574 = vaddq_s16(v1571, v1573);
+ int16x8_t v1575 = vsubq_s16(v948, v959);
+ int16x8_t v1576 = vsubq_s16(v970, v981);
+ int16x8_t v1577_tmp = vqrdmulhq_n_s16(v1576, 2773);
+ int16x8_t v1577 = vmlaq_n_s16(v1577_tmp, v1576, 4);
+ int16x8_t v1578 = vaddq_s16(v1575, v1577);
+ int16x8_t v1579 = vqrdmulhq_n_s16(v1578, 21871);
+ int16x8_t v1580 = vaddq_s16(v1574, v1579);
+ int16x8_t v1581 = vsubq_s16(v723, v746);
+ int16x8_t v1582 = vsubq_s16(v769, v792);
+ int16x8_t v1583_tmp = vqrdmulhq_n_s16(v1582, 26108);
+ int16x8_t v1583 = vmlaq_n_s16(v1583_tmp, v1582, 6);
+ int16x8_t v1584 = vaddq_s16(v1581, v1583);
+ int16x8_t v1585 = vsubq_s16(v817, v840);
+ int16x8_t v1586 = vsubq_s16(v863, v886);
+ int16x8_t v1587_tmp = vqrdmulhq_n_s16(v1586, 26108);
+ int16x8_t v1587 = vmlaq_n_s16(v1587_tmp, v1586, 6);
+ int16x8_t v1588 = vaddq_s16(v1585, v1587);
+ int16x8_t v1589 = vqrdmulhq_n_s16(v1588, 22363);
+ int16x8_t v1590 = vaddq_s16(v1584, v1589);
+ int16x8_t v1591 = vsubq_s16(v61, v140);
+ int16x8_t v1592 = vsubq_s16(v234, v314);
+ int16x8_t v1593_tmp = vqrdmulhq_n_s16(v1592, 12251);
+ int16x8_t v1593 = vmlaq_n_s16(v1593_tmp, v1592, 20);
+ int16x8_t v1594 = vaddq_s16(v1591, v1593);
+ int16x8_t v1595 = vsubq_s16(v410, v521);
+ int16x8_t v1596 = vsubq_s16(v615, v696);
+ int16x8_t v1597_tmp = vqrdmulhq_n_s16(v1596, 12251);
+ int16x8_t v1597 = vmlaq_n_s16(v1597_tmp, v1596, 20);
+ int16x8_t v1598 = vaddq_s16(v1595, v1597);
+ int16x8_t v1599 = vqrdmulhq_n_s16(v1598, 22891);
+ int16x8_t v1600 = vaddq_s16(v1594, v1599);
+ int16x8_t v1601 = vsubq_s16(v1591, v1593);
+ int16x8_t v1602 = vsubq_s16(v1595, v1597);
+ int16x8_t v1603 = vqrdmulhq_n_s16(v1602, 23460);
+ int16x8_t v1604 = vaddq_s16(v1601, v1603);
+ int16x8_t v1605 = vsubq_s16(v1581, v1583);
+ int16x8_t v1606 = vsubq_s16(v1585, v1587);
+ int16x8_t v1607 = vqrdmulhq_n_s16(v1606, 24073);
+ int16x8_t v1608 = vaddq_s16(v1605, v1607);
+ int16x8_t v1609 = vsubq_s16(v1571, v1573);
+ int16x8_t v1610 = vsubq_s16(v1575, v1577);
+ int16x8_t v1611 = vqrdmulhq_n_s16(v1610, 24734);
+ int16x8_t v1612 = vaddq_s16(v1609, v1611);
+ int16x8_t v1613 = vsubq_s16(v1561, v1563);
+ int16x8_t v1614 = vsubq_s16(v1565, v1567);
+ int16x8_t v1615 = vqrdmulhq_n_s16(v1614, 25448);
+ int16x8_t v1616 = vaddq_s16(v1613, v1615);
+ int16x8_t v1617 = vsubq_s16(v1551, v1553);
+ int16x8_t v1618 = vsubq_s16(v1555, v1557);
+ int16x8_t v1619 = vqrdmulhq_n_s16(v1618, 26220);
+ int16x8_t v1620 = vaddq_s16(v1617, v1619);
+ int16x8_t v1621 = vsubq_s16(v1541, v1543);
+ int16x8_t v1622 = vsubq_s16(v1545, v1547);
+ int16x8_t v1623 = vqrdmulhq_n_s16(v1622, 27058);
+ int16x8_t v1624 = vaddq_s16(v1621, v1623);
+ int16x8_t v1625 = vsubq_s16(v1531, v1533);
+ int16x8_t v1626 = vsubq_s16(v1535, v1537);
+ int16x8_t v1627 = vqrdmulhq_n_s16(v1626, 27969);
+ int16x8_t v1628 = vaddq_s16(v1625, v1627);
+ int16x8_t v1629 = vsubq_s16(v1521, v1523);
+ int16x8_t v1630 = vsubq_s16(v1525, v1527);
+ int16x8_t v1631 = vqrdmulhq_n_s16(v1630, 28961);
+ int16x8_t v1632 = vaddq_s16(v1629, v1631);
+ int16x8_t v1633 = vsubq_s16(v1511, v1513);
+ int16x8_t v1634 = vsubq_s16(v1515, v1517);
+ int16x8_t v1635 = vqrdmulhq_n_s16(v1634, 30044);
+ int16x8_t v1636 = vaddq_s16(v1633, v1635);
+ int16x8_t v1637 = vsubq_s16(v1501, v1503);
+ int16x8_t v1638 = vsubq_s16(v1505, v1507);
+ int16x8_t v1639 = vqrdmulhq_n_s16(v1638, 31232);
+ int16x8_t v1640 = vaddq_s16(v1637, v1639);
+ int16x8_t v1641 = vsubq_s16(v1491, v1493);
+ int16x8_t v1642 = vsubq_s16(v1495, v1497);
+ int16x8_t v1643 = vqrdmulhq_n_s16(v1642, 32538);
+ int16x8_t v1644 = vaddq_s16(v1641, v1643);
+ int16x8_t v1645 = vsubq_s16(v1481, v1483);
+ int16x8_t v1646 = vsubq_s16(v1485, v1487);
+ int16x8_t v1647_tmp = vqrdmulhq_n_s16(v1646, 1211);
+ int16x8_t v1647 = vaddq_s16(v1647_tmp, v1646);
+ int16x8_t v1648 = vaddq_s16(v1645, v1647);
+ int16x8_t v1649 = vsubq_s16(v1471, v1473);
+ int16x8_t v1650 = vsubq_s16(v1475, v1477);
+ int16x8_t v1651_tmp = vqrdmulhq_n_s16(v1650, 2808);
+ int16x8_t v1651 = vaddq_s16(v1651_tmp, v1650);
+ int16x8_t v1652 = vaddq_s16(v1649, v1651);
+ int16x8_t v1653 = vsubq_s16(v1461, v1463);
+ int16x8_t v1654 = vsubq_s16(v1465, v1467);
+ int16x8_t v1655_tmp = vqrdmulhq_n_s16(v1654, 4586);
+ int16x8_t v1655 = vaddq_s16(v1655_tmp, v1654);
+ int16x8_t v1656 = vaddq_s16(v1653, v1655);
+ int16x8_t v1657 = vsubq_s16(v1451, v1453);
+ int16x8_t v1658 = vsubq_s16(v1455, v1457);
+ int16x8_t v1659_tmp = vqrdmulhq_n_s16(v1658, 6576);
+ int16x8_t v1659 = vaddq_s16(v1659_tmp, v1658);
+ int16x8_t v1660 = vaddq_s16(v1657, v1659);
+ int16x8_t v1661 = vsubq_s16(v1441, v1443);
+ int16x8_t v1662 = vsubq_s16(v1445, v1447);
+ int16x8_t v1663_tmp = vqrdmulhq_n_s16(v1662, 8817);
+ int16x8_t v1663 = vaddq_s16(v1663_tmp, v1662);
+ int16x8_t v1664 = vaddq_s16(v1661, v1663);
+ int16x8_t v1665 = vsubq_s16(v1422, v1427);
+ int16x8_t v1666 = vsubq_s16(v1432, v1437);
+ int16x8_t v1667_tmp = vqrdmulhq_n_s16(v1666, 11356);
+ int16x8_t v1667 = vaddq_s16(v1667_tmp, v1666);
+ int16x8_t v1668 = vaddq_s16(v1665, v1667);
+ int16x8_t v1669 = vsubq_s16(v1400, v1405);
+ int16x8_t v1670 = vsubq_s16(v1410, v1415);
+ int16x8_t v1671_tmp = vqrdmulhq_n_s16(v1670, 14256);
+ int16x8_t v1671 = vaddq_s16(v1671_tmp, v1670);
+ int16x8_t v1672 = vaddq_s16(v1669, v1671);
+ int16x8_t v1673 = vsubq_s16(v1378, v1383);
+ int16x8_t v1674 = vsubq_s16(v1388, v1393);
+ int16x8_t v1675_tmp = vqrdmulhq_n_s16(v1674, 17596);
+ int16x8_t v1675 = vaddq_s16(v1675_tmp, v1674);
+ int16x8_t v1676 = vaddq_s16(v1673, v1675);
+ int16x8_t v1677 = vsubq_s16(v1356, v1361);
+ int16x8_t v1678 = vsubq_s16(v1366, v1371);
+ int16x8_t v1679_tmp = vqrdmulhq_n_s16(v1678, 21483);
+ int16x8_t v1679 = vaddq_s16(v1679_tmp, v1678);
+ int16x8_t v1680 = vaddq_s16(v1677, v1679);
+ int16x8_t v1681 = vsubq_s16(v1334, v1339);
+ int16x8_t v1682 = vsubq_s16(v1344, v1349);
+ int16x8_t v1683_tmp = vqrdmulhq_n_s16(v1682, 26057);
+ int16x8_t v1683 = vaddq_s16(v1683_tmp, v1682);
+ int16x8_t v1684 = vaddq_s16(v1681, v1683);
+ int16x8_t v1685 = vsubq_s16(v1312, v1317);
+ int16x8_t v1686 = vsubq_s16(v1322, v1327);
+ int16x8_t v1687_tmp = vqrdmulhq_n_s16(v1686, 31517);
+ int16x8_t v1687 = vaddq_s16(v1687_tmp, v1686);
+ int16x8_t v1688 = vaddq_s16(v1685, v1687);
+ int16x8_t v1689 = vsubq_s16(v1290, v1295);
+ int16x8_t v1690 = vsubq_s16(v1300, v1305);
+ int16x8_t v1691_tmp = vqrdmulhq_n_s16(v1690, 5373);
+ int16x8_t v1691 = vmlaq_n_s16(v1691_tmp, v1690, 2);
+ int16x8_t v1692 = vaddq_s16(v1689, v1691);
+ int16x8_t v1693 = vsubq_s16(v1268, v1273);
+ int16x8_t v1694 = vsubq_s16(v1278, v1283);
+ int16x8_t v1695_tmp = vqrdmulhq_n_s16(v1694, 13571);
+ int16x8_t v1695 = vmlaq_n_s16(v1695_tmp, v1694, 2);
+ int16x8_t v1696 = vaddq_s16(v1693, v1695);
+ int16x8_t v1697 = vsubq_s16(v1228, v1239);
+ int16x8_t v1698 = vsubq_s16(v1250, v1261);
+ int16x8_t v1699_tmp = vqrdmulhq_n_s16(v1698, 23975);
+ int16x8_t v1699 = vmlaq_n_s16(v1699_tmp, v1698, 2);
+ int16x8_t v1700 = vaddq_s16(v1697, v1699);
+ int16x8_t v1701 = vsubq_s16(v1182, v1193);
+ int16x8_t v1702 = vsubq_s16(v1204, v1215);
+ int16x8_t v1703_tmp = vqrdmulhq_n_s16(v1702, 4832);
+ int16x8_t v1703 = vmlaq_n_s16(v1703_tmp, v1702, 3);
+ int16x8_t v1704 = vaddq_s16(v1701, v1703);
+ int16x8_t v1705 = vsubq_s16(v1136, v1147);
+ int16x8_t v1706 = vsubq_s16(v1158, v1169);
+ int16x8_t v1707_tmp = vqrdmulhq_n_s16(v1706, 23437);
+ int16x8_t v1707 = vmlaq_n_s16(v1707_tmp, v1706, 3);
+ int16x8_t v1708 = vaddq_s16(v1705, v1707);
+ int16x8_t v1709 = vsubq_s16(v1090, v1101);
+ int16x8_t v1710 = vsubq_s16(v1112, v1123);
+ int16x8_t v1711_tmp = vqrdmulhq_n_s16(v1710, 17573);
+ int16x8_t v1711 = vmlaq_n_s16(v1711_tmp, v1710, 4);
+ int16x8_t v1712 = vaddq_s16(v1709, v1711);
+ int16x8_t v1713 = vsubq_s16(v1008, v1031);
+ int16x8_t v1714 = vsubq_s16(v1054, v1077);
+ int16x8_t v1715_tmp = vqrdmulhq_n_s16(v1714, 27122);
+ int16x8_t v1715 = vmlaq_n_s16(v1715_tmp, v1714, 5);
+ int16x8_t v1716 = vaddq_s16(v1713, v1715);
+ int16x8_t v1717 = vsubq_s16(v913, v937);
+ int16x8_t v1718 = vsubq_s16(v960, v983);
+ int16x8_t v1719_tmp = vqrdmulhq_n_s16(v1718, 5041);
+ int16x8_t v1719 = vmlaq_n_s16(v1719_tmp, v1718, 8);
+ int16x8_t v1720 = vaddq_s16(v1717, v1719);
+ int16x8_t v1721 = vsubq_s16(v747, v794);
+ int16x8_t v1722 = vsubq_s16(v841, v888);
+ int16x8_t v1723_tmp = vqrdmulhq_n_s16(v1722, 19146);
+ int16x8_t v1723 = vmlaq_n_s16(v1723_tmp, v1722, 13);
+ int16x8_t v1724 = vaddq_s16(v1721, v1723);
+ int16x8_t v1725 = vsubq_s16(v141, v316);
+ int16x8_t v1726 = vsubq_s16(v522, v698);
+ int16x8_t v1727_tmp = vqrdmulhq_n_s16(v1726, 24402);
+ int16x8_t v1727 = vmlaq_n_s16(v1727_tmp, v1726, 40);
+ int16x8_t v1728 = vaddq_s16(v1725, v1727);
+ int16x8_t v1729 = vsubq_s16(v1725, v1727);
+ int16x8_t v1730 = vsubq_s16(v1721, v1723);
+ int16x8_t v1731 = vsubq_s16(v1717, v1719);
+ int16x8_t v1732 = vsubq_s16(v1713, v1715);
+ int16x8_t v1733 = vsubq_s16(v1709, v1711);
+ int16x8_t v1734 = vsubq_s16(v1705, v1707);
+ int16x8_t v1735 = vsubq_s16(v1701, v1703);
+ int16x8_t v1736 = vsubq_s16(v1697, v1699);
+ int16x8_t v1737 = vsubq_s16(v1693, v1695);
+ int16x8_t v1738 = vsubq_s16(v1689, v1691);
+ int16x8_t v1739 = vsubq_s16(v1685, v1687);
+ int16x8_t v1740 = vsubq_s16(v1681, v1683);
+ int16x8_t v1741 = vsubq_s16(v1677, v1679);
+ int16x8_t v1742 = vsubq_s16(v1673, v1675);
+ int16x8_t v1743 = vsubq_s16(v1669, v1671);
+ int16x8_t v1744 = vsubq_s16(v1665, v1667);
+ int16x8_t v1745 = vsubq_s16(v1661, v1663);
+ int16x8_t v1746 = vsubq_s16(v1657, v1659);
+ int16x8_t v1747 = vsubq_s16(v1653, v1655);
+ int16x8_t v1748 = vsubq_s16(v1649, v1651);
+ int16x8_t v1749 = vsubq_s16(v1645, v1647);
+ int16x8_t v1750 = vsubq_s16(v1641, v1643);
+ int16x8_t v1751 = vsubq_s16(v1637, v1639);
+ int16x8_t v1752 = vsubq_s16(v1633, v1635);
+ int16x8_t v1753 = vsubq_s16(v1629, v1631);
+ int16x8_t v1754 = vsubq_s16(v1625, v1627);
+ int16x8_t v1755 = vsubq_s16(v1621, v1623);
+ int16x8_t v1756 = vsubq_s16(v1617, v1619);
+ int16x8_t v1757 = vsubq_s16(v1613, v1615);
+ int16x8_t v1758 = vsubq_s16(v1609, v1611);
+ int16x8_t v1759 = vsubq_s16(v1605, v1607);
+ int16x8_t v1760 = vsubq_s16(v1601, v1603);
+ int16x8_t v1761 = vsubq_s16(v1594, v1599);
+ int16x8_t v1762 = vsubq_s16(v1584, v1589);
+ int16x8_t v1763 = vsubq_s16(v1574, v1579);
+ int16x8_t v1764 = vsubq_s16(v1564, v1569);
+ int16x8_t v1765 = vsubq_s16(v1554, v1559);
+ int16x8_t v1766 = vsubq_s16(v1544, v1549);
+ int16x8_t v1767 = vsubq_s16(v1534, v1539);
+ int16x8_t v1768 = vsubq_s16(v1524, v1529);
+ int16x8_t v1769 = vsubq_s16(v1514, v1519);
+ int16x8_t v1770 = vsubq_s16(v1504, v1509);
+ int16x8_t v1771 = vsubq_s16(v1494, v1499);
+ int16x8_t v1772 = vsubq_s16(v1484, v1489);
+ int16x8_t v1773 = vsubq_s16(v1474, v1479);
+ int16x8_t v1774 = vsubq_s16(v1464, v1469);
+ int16x8_t v1775 = vsubq_s16(v1454, v1459);
+ int16x8_t v1776 = vsubq_s16(v1444, v1449);
+ int16x8_t v1777 = vsubq_s16(v1428, v1439);
+ int16x8_t v1778 = vsubq_s16(v1406, v1417);
+ int16x8_t v1779 = vsubq_s16(v1384, v1395);
+ int16x8_t v1780 = vsubq_s16(v1362, v1373);
+ int16x8_t v1781 = vsubq_s16(v1340, v1351);
+ int16x8_t v1782 = vsubq_s16(v1318, v1329);
+ int16x8_t v1783 = vsubq_s16(v1296, v1307);
+ int16x8_t v1784 = vsubq_s16(v1274, v1285);
+ int16x8_t v1785 = vsubq_s16(v1240, v1263);
+ int16x8_t v1786 = vsubq_s16(v1194, v1217);
+ int16x8_t v1787 = vsubq_s16(v1148, v1171);
+ int16x8_t v1788 = vsubq_s16(v1102, v1125);
+ int16x8_t v1789 = vsubq_s16(v1032, v1079);
+ int16x8_t v1790 = vsubq_s16(v938, v985);
+ int16x8_t v1791 = vsubq_s16(v795, v890);
+ int16x8_t v1792 = vsubq_s16(v317, v700);
+ vst1q_s16(out + out_stride * 0 + i, v701);
+ vst1q_s16(out + out_stride * 1 + i, v891);
+ vst1q_s16(out + out_stride * 2 + i, v986);
+ vst1q_s16(out + out_stride * 3 + i, v1080);
+ vst1q_s16(out + out_stride * 4 + i, v1126);
+ vst1q_s16(out + out_stride * 5 + i, v1172);
+ vst1q_s16(out + out_stride * 6 + i, v1218);
+ vst1q_s16(out + out_stride * 7 + i, v1264);
+ vst1q_s16(out + out_stride * 8 + i, v1286);
+ vst1q_s16(out + out_stride * 9 + i, v1308);
+ vst1q_s16(out + out_stride * 10 + i, v1330);
+ vst1q_s16(out + out_stride * 11 + i, v1352);
+ vst1q_s16(out + out_stride * 12 + i, v1374);
+ vst1q_s16(out + out_stride * 13 + i, v1396);
+ vst1q_s16(out + out_stride * 14 + i, v1418);
+ vst1q_s16(out + out_stride * 15 + i, v1440);
+ vst1q_s16(out + out_stride * 16 + i, v1450);
+ vst1q_s16(out + out_stride * 17 + i, v1460);
+ vst1q_s16(out + out_stride * 18 + i, v1470);
+ vst1q_s16(out + out_stride * 19 + i, v1480);
+ vst1q_s16(out + out_stride * 20 + i, v1490);
+ vst1q_s16(out + out_stride * 21 + i, v1500);
+ vst1q_s16(out + out_stride * 22 + i, v1510);
+ vst1q_s16(out + out_stride * 23 + i, v1520);
+ vst1q_s16(out + out_stride * 24 + i, v1530);
+ vst1q_s16(out + out_stride * 25 + i, v1540);
+ vst1q_s16(out + out_stride * 26 + i, v1550);
+ vst1q_s16(out + out_stride * 27 + i, v1560);
+ vst1q_s16(out + out_stride * 28 + i, v1570);
+ vst1q_s16(out + out_stride * 29 + i, v1580);
+ vst1q_s16(out + out_stride * 30 + i, v1590);
+ vst1q_s16(out + out_stride * 31 + i, v1600);
+ vst1q_s16(out + out_stride * 32 + i, v1604);
+ vst1q_s16(out + out_stride * 33 + i, v1608);
+ vst1q_s16(out + out_stride * 34 + i, v1612);
+ vst1q_s16(out + out_stride * 35 + i, v1616);
+ vst1q_s16(out + out_stride * 36 + i, v1620);
+ vst1q_s16(out + out_stride * 37 + i, v1624);
+ vst1q_s16(out + out_stride * 38 + i, v1628);
+ vst1q_s16(out + out_stride * 39 + i, v1632);
+ vst1q_s16(out + out_stride * 40 + i, v1636);
+ vst1q_s16(out + out_stride * 41 + i, v1640);
+ vst1q_s16(out + out_stride * 42 + i, v1644);
+ vst1q_s16(out + out_stride * 43 + i, v1648);
+ vst1q_s16(out + out_stride * 44 + i, v1652);
+ vst1q_s16(out + out_stride * 45 + i, v1656);
+ vst1q_s16(out + out_stride * 46 + i, v1660);
+ vst1q_s16(out + out_stride * 47 + i, v1664);
+ vst1q_s16(out + out_stride * 48 + i, v1668);
+ vst1q_s16(out + out_stride * 49 + i, v1672);
+ vst1q_s16(out + out_stride * 50 + i, v1676);
+ vst1q_s16(out + out_stride * 51 + i, v1680);
+ vst1q_s16(out + out_stride * 52 + i, v1684);
+ vst1q_s16(out + out_stride * 53 + i, v1688);
+ vst1q_s16(out + out_stride * 54 + i, v1692);
+ vst1q_s16(out + out_stride * 55 + i, v1696);
+ vst1q_s16(out + out_stride * 56 + i, v1700);
+ vst1q_s16(out + out_stride * 57 + i, v1704);
+ vst1q_s16(out + out_stride * 58 + i, v1708);
+ vst1q_s16(out + out_stride * 59 + i, v1712);
+ vst1q_s16(out + out_stride * 60 + i, v1716);
+ vst1q_s16(out + out_stride * 61 + i, v1720);
+ vst1q_s16(out + out_stride * 62 + i, v1724);
+ vst1q_s16(out + out_stride * 63 + i, v1728);
+ vst1q_s16(out + out_stride * 64 + i, v1729);
+ vst1q_s16(out + out_stride * 65 + i, v1730);
+ vst1q_s16(out + out_stride * 66 + i, v1731);
+ vst1q_s16(out + out_stride * 67 + i, v1732);
+ vst1q_s16(out + out_stride * 68 + i, v1733);
+ vst1q_s16(out + out_stride * 69 + i, v1734);
+ vst1q_s16(out + out_stride * 70 + i, v1735);
+ vst1q_s16(out + out_stride * 71 + i, v1736);
+ vst1q_s16(out + out_stride * 72 + i, v1737);
+ vst1q_s16(out + out_stride * 73 + i, v1738);
+ vst1q_s16(out + out_stride * 74 + i, v1739);
+ vst1q_s16(out + out_stride * 75 + i, v1740);
+ vst1q_s16(out + out_stride * 76 + i, v1741);
+ vst1q_s16(out + out_stride * 77 + i, v1742);
+ vst1q_s16(out + out_stride * 78 + i, v1743);
+ vst1q_s16(out + out_stride * 79 + i, v1744);
+ vst1q_s16(out + out_stride * 80 + i, v1745);
+ vst1q_s16(out + out_stride * 81 + i, v1746);
+ vst1q_s16(out + out_stride * 82 + i, v1747);
+ vst1q_s16(out + out_stride * 83 + i, v1748);
+ vst1q_s16(out + out_stride * 84 + i, v1749);
+ vst1q_s16(out + out_stride * 85 + i, v1750);
+ vst1q_s16(out + out_stride * 86 + i, v1751);
+ vst1q_s16(out + out_stride * 87 + i, v1752);
+ vst1q_s16(out + out_stride * 88 + i, v1753);
+ vst1q_s16(out + out_stride * 89 + i, v1754);
+ vst1q_s16(out + out_stride * 90 + i, v1755);
+ vst1q_s16(out + out_stride * 91 + i, v1756);
+ vst1q_s16(out + out_stride * 92 + i, v1757);
+ vst1q_s16(out + out_stride * 93 + i, v1758);
+ vst1q_s16(out + out_stride * 94 + i, v1759);
+ vst1q_s16(out + out_stride * 95 + i, v1760);
+ vst1q_s16(out + out_stride * 96 + i, v1761);
+ vst1q_s16(out + out_stride * 97 + i, v1762);
+ vst1q_s16(out + out_stride * 98 + i, v1763);
+ vst1q_s16(out + out_stride * 99 + i, v1764);
+ vst1q_s16(out + out_stride * 100 + i, v1765);
+ vst1q_s16(out + out_stride * 101 + i, v1766);
+ vst1q_s16(out + out_stride * 102 + i, v1767);
+ vst1q_s16(out + out_stride * 103 + i, v1768);
+ vst1q_s16(out + out_stride * 104 + i, v1769);
+ vst1q_s16(out + out_stride * 105 + i, v1770);
+ vst1q_s16(out + out_stride * 106 + i, v1771);
+ vst1q_s16(out + out_stride * 107 + i, v1772);
+ vst1q_s16(out + out_stride * 108 + i, v1773);
+ vst1q_s16(out + out_stride * 109 + i, v1774);
+ vst1q_s16(out + out_stride * 110 + i, v1775);
+ vst1q_s16(out + out_stride * 111 + i, v1776);
+ vst1q_s16(out + out_stride * 112 + i, v1777);
+ vst1q_s16(out + out_stride * 113 + i, v1778);
+ vst1q_s16(out + out_stride * 114 + i, v1779);
+ vst1q_s16(out + out_stride * 115 + i, v1780);
+ vst1q_s16(out + out_stride * 116 + i, v1781);
+ vst1q_s16(out + out_stride * 117 + i, v1782);
+ vst1q_s16(out + out_stride * 118 + i, v1783);
+ vst1q_s16(out + out_stride * 119 + i, v1784);
+ vst1q_s16(out + out_stride * 120 + i, v1785);
+ vst1q_s16(out + out_stride * 121 + i, v1786);
+ vst1q_s16(out + out_stride * 122 + i, v1787);
+ vst1q_s16(out + out_stride * 123 + i, v1788);
+ vst1q_s16(out + out_stride * 124 + i, v1789);
+ vst1q_s16(out + out_stride * 125 + i, v1790);
+ vst1q_s16(out + out_stride * 126 + i, v1791);
+ vst1q_s16(out + out_stride * 127 + i, v1792);
+ }
+}